In [0]:
# Execution time
# Record the wall-clock start time; the last cell subtracts it from the end
# time to print the total runtime in seconds.
import time
start = time.time()

# Google Colab Setting

In [0]:
import os
import json
import numpy as np
import pandas as pd

import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.neighbors as neighbors
import sklearn.naive_bayes as naive_bayes
import sklearn.linear_model as linear_model

from sklearn.impute import SimpleImputer
from sklearn import preprocessing as preproc
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, mean_absolute_error, roc_curve, auc

from google.colab import files

In [0]:
# Accessing Google sheets
# Upgrade gspread quietly before it is imported below.
!pip install --upgrade -q gspread
from google.colab import auth
# Interactive Colab sign-in; presumably this is what makes the application
# default credentials requested below available -- TODO confirm.
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the 'Metadata' tab of the 'AutoKaggle' spreadsheet.
worksheet = gc.open('AutoKaggle').worksheet('Metadata')

# get_all_values gives a list of rows
_rows = worksheet.get_all_values()

# Convert to a DataFrame and render.
import pandas as pd
rows = pd.DataFrame.from_records(_rows)

new_header = rows.iloc[0] #grab the first row for the header
# The index is not reset after slicing, so the remaining data rows keep the
# labels 1, 2, ...; parseMetaData() looks rows up by these labels.
rows = rows[1:] #take the data less the header row
rows.columns = new_header #set the header row as the df header

In [0]:
def alpha_to_number(alpha_key):
  """Convert a spreadsheet column label to a zero-based column index.

  The label is read as a bijective base-26 number, so 'A' -> 0, 'Z' -> 25,
  'AA' -> 26, 'AB' -> 27 and so on. Surrounding whitespace is ignored and
  lowercase letters are accepted.

  Args:
    alpha_key: column label made of the letters A-Z (any case).

  Returns:
    The zero-based integer position of the column.

  Raises:
    ValueError: if the label is empty or contains anything but A-Z letters.
  """
  key = str(alpha_key).strip().upper()
  if not key or not all('A' <= ch <= 'Z' for ch in key):
    raise ValueError('invalid spreadsheet column label: %r' % (alpha_key,))

  # Left-to-right accumulation: every new letter shifts the running value one
  # base-26 "digit" to the left, with A..Z counting as 1..26.
  position = 0
  for ch in key:
    position = position * 26 + (ord(ch) - ord('A') + 1)
  return position - 1

# Mount at Google Drive

If the file cannot be read, rerun the cell below until "gdrive/My Drive" appears in the file browser in the left sidebar.

In [0]:
# Mount Google Drive
from google.colab import drive
# Mount point is /content/gdrive. The run cell builds its data paths from the
# relative prefix 'gdrive/My Drive/...', which presumably resolves here only
# when the working directory is /content -- TODO confirm.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Metadata Parsing

In [0]:
def parseMetaData(row_id):
  """Fill the global ``metadata`` dict from one row of the global ``rows`` sheet.

  Each field is read from a fixed spreadsheet column (given by its letter).
  The 'columns' cell is a bracketed, ';'-separated list in which every third
  entry is a column type and the entry just before it is the column name;
  names are sorted into numeric / categorical / unwanted lists by that type.
  The target column is removed from whichever list it landed in. The parsed
  values are printed at the end.

  Args:
    row_id: index label of the row in ``rows`` to parse.

  Returns:
    None. Results are written into the module-level ``metadata`` dict under
    the keys 'competition_name', 'estimator', 'target_column', 'output_type',
    'metric', 'feature_selector', 'numeric_columns', 'unwanted_columns' and
    'categorical_columns'.
  """
  # Spreadsheet column letter of every metadata field, converted to a
  # zero-based position.
  column_key = {'name': 'C', 'columns': 'W', 'estimator_func_call': 'AU', 'target_name': 'AC', 'output_type': 'AA', 'performance_metric': 'BB', 'feature_selector': 'AL'}
  column_key = {field: alpha_to_number(letter) for field, letter in column_key.items()}

  # The row is a Series labelled by the sheet's header strings, while
  # column_key holds positions, so cells must be read positionally with
  # .iloc; plain row[int] only works through a deprecated fallback.
  row = rows.loc[row_id]

  def cell(field):
    return row.iloc[column_key[field]]

  metadata['competition_name'] = cell('name')
  metadata['estimator'] = cell('estimator_func_call')
  metadata['target_column'] = cell('target_name')
  metadata['output_type'] = cell('output_type').split(',')
  metadata['metric'] = cell('performance_metric')
  metadata['feature_selector'] = cell('feature_selector')
  columns = cell('columns')

  # Parse column information
  numeric_columns = []
  unwanted_columns = []
  categorical_columns = []
  # Drop the surrounding brackets, then split into stripped entries.
  columns_data = [x.strip() for x in columns[1:-1].split(';')]
  # Entries come in triples; positions 1, 4, 7, ... are names and positions
  # 2, 5, 8, ... are the matching types. Unknown types are ignored.
  for column_name, column_type in zip(columns_data[1::3], columns_data[2::3]):
    if column_type in ("numeric", "integer", "real"):
      numeric_columns.append(column_name)
    elif column_type == "categorical":
      categorical_columns.append(column_name)
    elif column_type in ("unwanted", "string", "dateTime"):
      unwanted_columns.append(column_name)

  metadata['numeric_columns'] = numeric_columns
  metadata['unwanted_columns'] = unwanted_columns
  metadata['categorical_columns'] = categorical_columns

  # Remove target from features columns
  target = metadata['target_column']
  for group in ('numeric_columns', 'categorical_columns', 'unwanted_columns'):
    if target in metadata[group]:
      metadata[group].remove(target)

  print(metadata['competition_name'])
  print(metadata['numeric_columns'])
  print(metadata['categorical_columns'])
  print(metadata['unwanted_columns'])
  print(metadata['target_column'])
  print(metadata['metric'])
  print(metadata['feature_selector'])
  print(metadata['estimator'])

# Add relevant imports

In [0]:
# Installations
!pip install --upgrade -q gspread
from google.colab import auth
from google.colab import files
from oauth2client.client import GoogleCredentials
import gspread
import os
import json
import warnings
import random
from math import exp
warnings.filterwarnings('ignore')

# Imports
# Preprocessing imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline
import sklearn.ensemble as ensemble
from sklearn import preprocessing as preproc
from sklearn import preprocessing
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Feature extraction imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Feature selection imports

# Estimation imports
from sklearn.metrics import accuracy_score, log_loss,mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn import model_selection, preprocessing, linear_model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import itertools
from keras.utils.np_utils import to_categorical 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
import scipy
import re

# Other initializations
sns.set(style='white', context='notebook', palette='deep')
# Module-level mini-batch bookkeeping; next_batch() inside estimation()
# declares both names `global` and updates them while iterating.
epochs_completed = 0
index_in_epoch = 0

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Mount Google Drive
# NOTE(review): this mounts at '/gdrive', while the other mount cells in this
# notebook use '/content/gdrive' -- confirm whether both mount points are needed.
# `drive` is expected to come from an earlier `from google.colab import drive`.
drive.mount('/gdrive')

Mounted at /gdrive


# Preprocessing

In [0]:
def preprocessing(train_df, test_df):
  """Turn raw train/test frames into model inputs for one supported dataset.

  The dataset is recognised purely by the number of rows in ``train_df``:

  * 42000: divide pixels by 255, reshape to (-1, 28, 28, 1), one-hot encode
    ``label`` and hold out 10% of the rows for validation.
  * 637:   take every column but the first as pixels, divide by 255, one-hot
    encode ``label`` and use the first 80 rows for validation.
  * 2400:  add radian angles, cell volume and atomic density, then
    standardise the features (two regression targets).
  * 19579: stratified 90/10 split of ``text`` with label-encoded ``author``.
  * 7395:  third column as raw text, last column as integer target.
  * 49352: bag-of-words on ``features``, engineered price/count features,
    out-of-fold target averages for ``building_id``/``manager_id`` and
    label-encoded categoricals.

  Note that the 2400 and 49352 branches add columns to the frames passed in.

  Args:
    train_df: training DataFrame of one of the sizes listed above.
    test_df: matching test DataFrame.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test, y_test_age, y_train_age, test,
    test_data)``. ``y_test`` is 0 when the branch keeps no validation labels
    (2400, 7395, 49352). ``test`` holds the scaled test pixels for the image
    datasets and ``test_data`` the test texts for the 19579 dataset; both are
    0 otherwise. ``y_test_age`` and ``y_train_age`` are always 0.

  Raises:
    ValueError: if the row count of ``train_df`` matches no supported dataset.
  """
  # Placeholders returned unchanged by branches that do not produce them.
  y_test_age=0
  y_train_age=0
  test=0
  test_data=0

  # No branch adds or removes rows of train_df, so the count can be read once.
  n_rows = train_df.shape[0]

  if n_rows==42000:           #digit
    # drop target columns
    X = train_df.drop(labels = ["label"],axis = 1)
    y = train_df["label"]
    X = X / 255.0
    test_df = test_df / 255.0
    # Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)
    X = X.values.reshape(-1,28,28,1)
    test = test_df.values.reshape(-1,28,28,1)
    # Encode labels to one hot vectors
    y = to_categorical(y, num_classes = (np.max(y)+1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=2)

  elif n_rows==637:              #delete
    y = train_df["label"]
    # Features are taken positionally: everything after the first column of
    # both frames (presumably the label / id column -- TODO confirm).
    X = train_df.iloc[:,1:].values
    test = test_df.iloc[:,1:].values
    X = X/255.0
    test = test/255.0
    y = to_categorical(y, num_classes =(np.max(y)+1))
    # split data into training & validation
    X_test = X[:80]
    y_test = y[:80]
    X_train = X[80:]
    y_train = y[80:]

  elif n_rows==2400:
    # Degree to radian
    for frame in (train_df, test_df):
      frame['alpha_rad'] = np.radians(frame['lattice_angle_alpha_degree'])
      frame['beta_rad'] = np.radians(frame['lattice_angle_beta_degree'])
      frame['gamma_rad'] = np.radians(frame['lattice_angle_gamma_degree'])

    def vol(df):
        """Add a 'volumn' column: cell volume from the three lattice lengths and angles."""
        volumn = df['lattice_vector_1_ang']*df['lattice_vector_2_ang']*df['lattice_vector_3_ang']*np.sqrt(
        1 + 2*np.cos(df['alpha_rad'])*np.cos(df['beta_rad'])*np.cos(df['gamma_rad'])
        -np.cos(df['alpha_rad'])**2-np.cos(df['beta_rad'])**2-np.cos(df['gamma_rad'])**2)
        df['volumn'] = volumn
    vol(train_df)
    vol(test_df)
    # Atomic density
    train_df['density'] = train_df['number_of_total_atoms'] / train_df['volumn']
    test_df['density'] = test_df['number_of_total_atoms'] / test_df['volumn']
    col = ['formation_energy_ev_natom','bandgap_energy_ev']
    X_train = train_df.drop(['id']+col,axis=1)
    y_train = train_df[col]
    X_test = test_df.drop(['id'],axis=1)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    # Reuse the statistics learned on the training set; re-fitting on the
    # test set would scale the two sets differently.
    X_test = sc.transform(X_test)
    y_train = y_train.values
    y_test = 0

  elif n_rows==19579:                  #author
    test_data = test_df.loc[:,'text'].reset_index(drop=True)
    stratifiedCV = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=1)
    trainInds, validInds = next(stratifiedCV.split(train_df['text'], train_df['author']))
    X_train = train_df.loc[trainInds,'text'].reset_index(drop=True)
    X_test  = train_df.loc[validInds,'text'].reset_index(drop=True)
    trainLabel = train_df.loc[trainInds,'author'].reset_index(drop=True)
    validLabel = train_df.loc[validInds,'author'].reset_index(drop=True)
    # LabelEncoder is used directly: inside this function the name
    # `preprocessing` refers to the function itself, not the sklearn module.
    yLabelEncoder = LabelEncoder()
    yLabelEncoder.fit(pd.concat((trainLabel,validLabel)))
    y_train = yLabelEncoder.transform(trainLabel)
    y_test = yLabelEncoder.transform(validLabel)

  elif n_rows==7395:                    #stumbleupon,137
    # Third column is the text input, last column the target.
    X_train = list(np.array(train_df)[:,2])
    X_test = list(np.array(test_df)[:,2])
    y_train = np.array(train_df)[:,-1]
    y_train = y_train.astype('int')
    y_test = 0

  elif n_rows==49352:
    X_train=train_df
    X_test=test_df
    interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
    X_train['interest_level'] = X_train['interest_level'].apply(lambda x: interest_level_map[x])
    X_test['interest_level'] = -1
    # Cap prices at 13000 (.loc replaces the removed .ix indexer and avoids
    # chained assignment).
    X_train.loc[X_train['price']>13000, 'price'] = 13000
    #add features
    feature_transform = CountVectorizer(stop_words='english', max_features=150)
    X_train['features'] = X_train["features"].apply(lambda x: " ".join(["_".join(i.lower().split(" ")) for i in x]))
    X_test['features'] = X_test["features"].apply(lambda x: " ".join(["_".join(i.lower().split(" ")) for i in x]))
    feature_transform.fit(list(X_train['features']) + list(X_test['features']))
    train_size = len(X_train)
    low_count = len(X_train[X_train['interest_level'] == 0])
    medium_count = len(X_train[X_train['interest_level'] == 1])
    high_count = len(X_train[X_train['interest_level'] == 2])

    def find_objects_with_only_one_record(feature_name):
        """Return the values of `feature_name` that occur exactly once across train and test."""
        temp = pd.concat([X_train[feature_name].reset_index(),
                          X_test[feature_name].reset_index()])
        temp = temp.groupby(feature_name, as_index = False).count()
        return temp[temp['index'] == 1]
    managers_with_one_lot = find_objects_with_only_one_record('manager_id')
    buildings_with_one_lot = find_objects_with_only_one_record('building_id')
    addresses_with_one_lot = find_objects_with_only_one_record('display_address')
    # Constants of the blended target average computed in calculate_average().
    lambda_val = None   # fixed blend weight; None means derive it from the group count
    k=5.0
    f=1.0
    r_k=0.01            # relative width of the random jitter
    g = 1.0
    def categorical_average(variable, y, pred_0, feature_name):
        """Add `feature_name`: per-`variable` average of `y`, blended with the prior `pred_0`.

        Training rows get the value from 5-fold out-of-fold statistics; test
        rows get it from statistics over the whole training set.
        """
        def calculate_average(sub1, sub2):
            # Per-group statistics of y computed on sub1 ...
            s = pd.DataFrame(data = {
                                     variable: sub1.groupby(variable, as_index = False).count()[variable],
                                     'sumy': sub1.groupby(variable, as_index = False).sum()['y'],
                                     'avgY': sub1.groupby(variable, as_index = False).mean()['y'],
                                     'cnt': sub1.groupby(variable, as_index = False).count()['y']
                                     })
            # ... attached to the rows of sub2; groups unseen in sub1 come out as NaN.
            tmp = sub2.merge(s.reset_index(), how='left', left_on=variable, right_on=variable)
            del tmp['index']
            # Fill each column by its own null mask (reusing the 'cnt' mask
            # after 'cnt' was filled would leave 'sumy' untouched).
            tmp.loc[pd.isnull(tmp['cnt']), 'cnt'] = 0.0
            tmp.loc[pd.isnull(tmp['sumy']), 'sumy'] = 0.0
            def compute_beta(row):
                # Groups with 200 or more rows get beta == 0, i.e. no shrinkage.
                cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
                return 1.0 / (g + exp((cnt - k) / f))
            if lambda_val is not None:
                tmp['beta'] = lambda_val
            else:
                tmp['beta'] = tmp.apply(compute_beta, axis = 1)
            tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] + row['beta'] * row['pred_0'],
                                       axis = 1)
            # Unseen groups fall back to the prior.
            tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
            tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']
            # Multiplicative random jitter of at most +/- r_k/2.
            tmp['random'] = np.random.uniform(size = len(tmp))
            tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] *(1 + (row['random'] - 0.5) * r_k),
                                       axis = 1)
            return tmp['adj_avg'].ravel()
        #cv for training set
        k_fold = StratifiedKFold(5)
        X_train[feature_name] = -999
        for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                    X_train['interest_level'].ravel()):
            sub = pd.DataFrame(data = {variable: X_train[variable],
                                       'y': X_train[y],
                                       'pred_0': X_train[pred_0]})
            sub1 = sub.iloc[train_index]
            sub2 = sub.iloc[cv_index]
            X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)
        #for test set
        sub1 = pd.DataFrame(data = {variable: X_train[variable],
                                    'y': X_train[y],
                                    'pred_0': X_train[pred_0]})
        sub2 = pd.DataFrame(data = {variable: X_test[variable],
                                    'y': X_test[y],
                                    'pred_0': X_test[pred_0]})
        X_test.loc[:, feature_name] = calculate_average(sub1, sub2)
    def transform_data(X):
        """Return X with bag-of-words columns, derived features and rare ids replaced by "-1"."""
        #add features
        feat_sparse = feature_transform.transform(X["features"])
        vocabulary = feature_transform.vocabulary_
        del X['features']
        X1 = pd.DataFrame([ pd.Series(feat_sparse[i].toarray().ravel()) for i in np.arange(feat_sparse.shape[0]) ])
        X1.columns = list(sorted(vocabulary.keys()))
        X = pd.concat([X.reset_index(), X1.reset_index()], axis = 1)
        del X['index']
        X["num_photos"] = X["photos"].apply(len)
        X['created'] = pd.to_datetime(X["created"])
        X["num_description_words"] = X["description"].apply(lambda x: len(x.split(" ")))
        X['price_per_bed'] = X['price'] / X['bedrooms']
        X['price_per_bath'] = X['price'] / X['bathrooms']
        X['price_per_room'] = X['price'] / (X['bathrooms'] + X['bedrooms'] )
        # One-hot indicators of the interest level (all zero for the test
        # rows, whose interest_level was set to -1 above).
        X['low'] = 0
        X.loc[X['interest_level'] == 0, 'low'] = 1
        X['medium'] = 0
        X.loc[X['interest_level'] == 1, 'medium'] = 1
        X['high'] = 0
        X.loc[X['interest_level'] == 2, 'high'] = 1
        X['display_address'] = X['display_address'].apply(lambda x: x.lower().strip())
        X['street_address'] = X['street_address'].apply(lambda x: x.lower().strip())
        # Class frequencies in the training set, used as priors by
        # categorical_average().
        X['pred0_low'] = low_count * 1.0 / train_size
        X['pred0_medium'] = medium_count * 1.0 / train_size
        X['pred0_high'] = high_count * 1.0 / train_size
        X.loc[X['manager_id'].isin(managers_with_one_lot['manager_id'].ravel()),
              'manager_id'] = "-1"
        X.loc[X['building_id'].isin(buildings_with_one_lot['building_id'].ravel()),
              'building_id'] = "-1"
        X.loc[X['display_address'].isin(addresses_with_one_lot['display_address'].ravel()),
              'display_address'] = "-1"
        return X
    def normalize_high_cordiality_data():
        """Add blended 'medium' and 'high' averages for building_id and manager_id."""
        high_cardinality = ["building_id", "manager_id"]
        for c in high_cardinality:
            categorical_average(c, "medium", "pred0_medium", c + "_mean_medium")
            categorical_average(c, "high", "pred0_high", c + "_mean_high")
    def transform_categorical_data():
        """Label-encode the id/address columns with encoders fitted on train and test together."""
        categorical = ['building_id', 'manager_id',
                       'display_address', 'street_address']
        for f in categorical:
            encoder = LabelEncoder()
            encoder.fit(list(X_train[f]) + list(X_test[f]))
            X_train[f] = encoder.transform(X_train[f].ravel())
            X_test[f] = encoder.transform(X_test[f].ravel())
    def remove_columns(X):
        """Delete the raw and helper columns that must not reach the model (in place)."""
        columns = ["photos", "pred0_high", "pred0_low", "pred0_medium",
                   "description", "low", "medium", "high",
                   "interest_level", "created"]
        for c in columns:
            del X[c]
    print("Starting transformations")
    # Rebinding X_train / X_test here is visible to the helpers above, which
    # read them from the enclosing scope.
    X_train= transform_data(X_train)
    X_test = transform_data(X_test)
    y_train = X_train['interest_level'].ravel()
    print("Normalizing high cordiality data...")
    normalize_high_cordiality_data()
    transform_categorical_data()
    remove_columns(X_train)
    remove_columns(X_test)
    y_test = 0

  else:
    raise ValueError('preprocessing: unsupported training set size %d' % n_rows)

  return X_train, X_test, y_train, y_test, y_test_age, y_train_age, test, test_data


# Feature Extraction

In [0]:
def feature_extraction(X_train, X_test, y_train, y_test):
  """Build TF-IDF features for the 7395-row text dataset.

  Only the dataset whose training target has 7395 entries gets features; for
  every other dataset both results are the placeholder 0.

  Args:
    X_train: sequence of training documents (strings).
    X_test: sequence of test documents (strings).
    y_train: training target; only ``y_train.shape[0]`` is inspected.
    y_test: unused, kept for a uniform stage signature.

  Returns:
    Tuple ``(X_train_f, X_test_f)``: the TF-IDF matrices of the two document
    sets, or ``(0, 0)`` when the dataset is not the 7395-row one.
  """
  if y_train.shape[0] != 7395:
    return 0, 0

  # The on/off options are passed as real booleans rather than the integer 1,
  # which strict parameter validation may reject.
  tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                        analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
                        use_idf=True, smooth_idf=True, sublinear_tf=True)
  # The vocabulary and idf weights are learned from train and test documents
  # together. list() makes the concatenation independent of the container
  # type ('+' on arrays would not concatenate).
  tfv.fit(list(X_train) + list(X_test))
  X_train_f = tfv.transform(X_train)
  X_test_f = tfv.transform(X_test)
  return X_train_f, X_test_f

# Feature Selection

In [0]:
def feature_selection():
  """Placeholder for the feature-selection stage; it currently does nothing."""
  return None


# Estimation

In [0]:
def estimation(X_train, X_test, y_train, y_test, test, test_data, X_train_featured, X_test_featured, 
               y_test_age, y_train_age): 
  """Train the model that belongs to the dataset and print its scores.

  The dataset is recognised by the number of training targets:

  * 557:   TensorFlow (v1 graph API) CNN with three output classes.
  * 37800: Keras CNN with ten output classes, trained on augmented images.
  * 2400:  Keras dense network regressing two targets; prints training RMSLE.
  * 17621: char + word n-gram counts fed to a logistic regression.
  * 7395:  20-fold cross-validated logistic regression on ``X_train_featured``.
  * 49352: xgboost multi-class booster.

  Any other size does nothing.

  Args:
    X_train, y_train: training inputs and targets.
    X_test, y_test: validation inputs and targets (used by the branches that
      report validation scores).
    test: test inputs, predicted by the 37800 branch.
    test_data: test texts, used by the 17621 branch when fitting vectorizers.
    X_train_featured: pre-extracted training features, used by the 7395 branch.
    X_test_featured, y_test_age, y_train_age: not used here.

  Returns:
    None; results are only printed.
  """
  # The number of training targets never changes below (batch shuffling only
  # permutes rows), so the branches are mutually exclusive.
  n_train = y_train.shape[0]

  ##################### Tensorflow&CNN: letseat ######################
  if n_train == 557:

    LEARNING_RATE = 1e-4
    TRAINING_ITERATIONS = 1000
    DROPOUT = 0.5
    BATCH_SIZE = 40
    IMAGE_DIMENSION_SIZE = 28
    def weight_variable(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)
    def bias_variable(shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)
    def conv2d(x, W):
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
    def max_pool_2x2(x):
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    x = tf.placeholder('float', shape=[None, 784])
    y_ = tf.placeholder('float', shape=[None, 3])
    # First conv block: 5x5 kernels, 32 feature maps, then 2x2 max pooling.
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    image = tf.reshape(x, [-1,28,28,1])
    h_conv1 = tf.nn.relu(conv2d(image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)
    # layer1 / layer2 tile the feature maps into one 2-D grid per sample; they
    # are not used by the training or evaluation below.
    layer1 = tf.reshape(h_conv1, (-1, 28, 28, 4 ,8))
    layer1 = tf.transpose(layer1, (0, 3, 1, 4,2))
    layer1 = tf.reshape(layer1, (-1, 28*4, 28*8))
    # Second conv block: 5x5 kernels, 64 feature maps, then 2x2 max pooling.
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    layer2 = tf.reshape(h_conv2, (-1, 14, 14, 4 ,16))
    layer2 = tf.transpose(layer2, (0, 3, 1, 4,2))
    layer2 = tf.reshape(layer2, (-1, 14*4, 14*16))
    # Two 2x2 poolings leave 7*7*64 values per image, which equals 28*28*4.
    W_fc1 = weight_variable([IMAGE_DIMENSION_SIZE*IMAGE_DIMENSION_SIZE*4, 1024])
    b_fc1 = bias_variable([1024])
    h_pool2_flat = tf.reshape(h_pool2, [-1, IMAGE_DIMENSION_SIZE*IMAGE_DIMENSION_SIZE*4])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    keep_prob = tf.placeholder('float')
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    W_fc2 = weight_variable([1024, 3])
    b_fc2 = bias_variable([3])
    y = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
    cross_entropy = -tf.reduce_sum(y_*tf.log(y))
    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
    predict = tf.argmax(y,1)
    num_examples = X_train.shape[0]
    def next_batch(batch_size):
        """Return the next mini-batch, reshuffling the training set after each full pass."""
        # X_train / y_train are this call's arguments, so they are rebound
        # with `nonlocal`; `global` would read and overwrite unrelated
        # notebook-level variables of the same name.
        nonlocal X_train
        nonlocal y_train
        # The batching counters are module-level state defined with the imports.
        global index_in_epoch
        global epochs_completed
        start = index_in_epoch
        index_in_epoch += batch_size
        if index_in_epoch > num_examples:
            epochs_completed += 1
            perm = np.arange(num_examples)
            np.random.shuffle(perm)
            X_train = X_train[perm]
            y_train = y_train[perm]
            start = 0
            index_in_epoch = batch_size
            assert batch_size <= num_examples
        end = index_in_epoch
        return X_train[start:end], y_train[start:end]
    init = tf.initialize_all_variables()
    sess = tf.InteractiveSession()
    # Close the session even when training fails part-way.
    try:
        sess.run(init)
        train_accuracies = []
        validation_accuracies = []
        x_range = []
        display_step=1
        for i in range(TRAINING_ITERATIONS):
            batch_xs, batch_ys = next_batch(BATCH_SIZE)
            # Report on a thinning schedule: every step, then every 10th,
            # every 100th, ... and always on the last step.
            if i%display_step == 0 or (i+1) == TRAINING_ITERATIONS:
                train_accuracy = accuracy.eval(feed_dict={x:batch_xs, y_: batch_ys, keep_prob: 1.0})
                validation_accuracy = accuracy.eval(feed_dict={ x: X_test[0:BATCH_SIZE],y_: y_test[0:BATCH_SIZE], keep_prob: 1.0})
                print('training_accuracy / validation_accuracy => %.2f / %.2f for step %d'%(train_accuracy, validation_accuracy, i))
                validation_accuracies.append(validation_accuracy)
                train_accuracies.append(train_accuracy)
                x_range.append(i)
                if i%(display_step*10) == 0 and i:
                    display_step *= 10
            sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys, keep_prob: DROPOUT})
        validation_accuracy = accuracy.eval(feed_dict={x: X_test,y_: y_test,keep_prob: 1.0})
        print('validation_accuracy => %.4f'%validation_accuracy)
    finally:
        sess.close()

  ##################### Tensorflow&CNN: letseat ######################

  ######################### Keras&CNN: digit #########################
  elif n_train == 37800:

    #CNN architechture is In -> [[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out
    model = Sequential()
    model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu', input_shape = (28,28,1)))
    model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu'))
    model.add(MaxPool2D(pool_size=(2,2)))
    model.add(Dropout(0.25))
    model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', activation ='relu'))
    model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', activation ='relu'))
    model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(256, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation = "softmax"))
    # Define the optimizer
    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    # Compile the model
    model.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])
    # Set a learning rate annealer
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=1, factor=0.5, min_lr=0.00001)
    epochs = 1 # Turn epochs to 30 to get 0.9967 accuracy
    batch_size = 86
    # With data augmentation to prevent overfitting (accuracy 0.99286)
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image 
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images
    datagen.fit(X_train)
    # Fit the model
    history = model.fit_generator(datagen.flow(X_train,y_train, batch_size=batch_size),
                              epochs = epochs, validation_data = (X_test,y_test),
                              verbose = 2, steps_per_epoch=X_train.shape[0] // batch_size
                              , callbacks=[learning_rate_reduction])
    results = model.predict(test)
    results = np.argmax(results,axis = 1)
    results = pd.Series(results,name="Label")
    # NOTE(review): the submission frame is built but neither saved nor returned.
    submission = pd.concat([pd.Series(range(1,28001),name = "ImageId"),results],axis = 1)

  ######################### Keras&CNN: digit #########################

  ####################### Keras&ANN: Conductor #######################
  elif n_train == 2400:
    def rmsle(y_true,y_pred):
      """Root mean squared error between log(1 + prediction) and log(1 + truth)."""
      return np.sqrt(np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean())
    regressor = Sequential()
    #1 and hidden layer
    regressor.add(Dense(units = 1024, activation = 'relu', kernel_initializer = 'glorot_uniform',input_dim = X_train.shape[1]))
    regressor.add(Dropout(0.1))
    regressor.add(Dense(units = 512, activation = 'relu', kernel_initializer = 'uniform'))
    regressor.add(Dropout(0.1))
    regressor.add(Dense(units = 64, activation = 'relu', kernel_initializer = 'uniform'))
    regressor.add(Dropout(0.1))
    regressor.add(Dense(units = 2, activation = 'relu', kernel_initializer = 'uniform'))
    #compile ANN
    regressor.compile(optimizer = 'adam', loss = 'mse', metrics =['accuracy'])
    regressor.fit(X_train,y_train,batch_size = 3, epochs = 50, validation_split=0.1)
    #Local CV
    # Scored once, under a separate name so the helper is not overwritten.
    train_rmsle = rmsle(y_train,regressor.predict(X_train))
    print(train_rmsle)
  ####################### Keras&ANN: Conductor #######################

  ################### Logistic Regression: Author ####################
  elif n_train == 17621:
    ngramLength = 5
    print('-'*52)
    print('fitting "CountVectorizer()" for bag of char %d-grams' %(ngramLength))
    BagOfCharsExtractor = CountVectorizer(min_df=8, max_features=250000, 
                                          analyzer='char', ngram_range=(1,ngramLength), 
                                          binary=False,lowercase=True)
    BagOfCharsExtractor.fit(pd.concat((X_train,X_test,test_data)))
    X_train_char = BagOfCharsExtractor.transform(X_train)
    X_valid_char = BagOfCharsExtractor.transform(X_test)
    X_test_char  = BagOfCharsExtractor.transform(test_data)
    ngramLength = 2
    print('-'*52)
    print('fitting "CountVectorizer()" for bag of word %d-grams' %(ngramLength))
    BagOfWordsExtractor = CountVectorizer(min_df=5, max_features=250000, 
                                          analyzer='word', ngram_range=(1,ngramLength), 
                                          binary=False,lowercase=True)
    BagOfWordsExtractor.fit(pd.concat((X_train,X_test,test_data)))
    X_train_word = BagOfWordsExtractor.transform(X_train)
    X_valid_word = BagOfWordsExtractor.transform(X_test)
    X_test_word  = BagOfWordsExtractor.transform(test_data)
    # combine and scale features 
    X_train = scipy.sparse.hstack((X_train_word,X_train_char))
    X_test = scipy.sparse.hstack((X_valid_word,X_valid_char))
    test_data  = scipy.sparse.hstack((X_test_word,X_test_char))
    # with_mean=False keeps the matrices sparse (no centering).
    stdScaler = StandardScaler(with_mean=False)
    stdScaler.fit(scipy.sparse.vstack(((X_train,X_test,test_data))))

    X_train = stdScaler.transform(X_train)
    X_test = stdScaler.transform(X_test)
    print('fitting "LogisticRegression()" classifier')
    logisticRegressor = linear_model.LogisticRegression(C=0.01, solver='sag')
    logisticRegressor.fit(X_train, y_train)
    trainAccuracy = accuracy_score(y_train, logisticRegressor.predict(X_train))
    trainLogLoss = log_loss(y_train, logisticRegressor.predict_proba(X_train))
    validAccuracy = accuracy_score(y_test, logisticRegressor.predict(X_test))
    validLogLoss = log_loss(y_test, logisticRegressor.predict_proba(X_test))
    print('Train: %.1f%s Accuracy, log loss = %.4f' % (100*trainAccuracy,'%',trainLogLoss))
    print('Valid: %.1f%s Accuracy, log loss = %.4f' % (100*validAccuracy,'%',validLogLoss))
  ################## Logistic Regression: Author ####################

  ############### Logistic Regression: Stumbleupon ##################
  elif n_train == 7395:
    # The dual formulation is only implemented by the liblinear solver, so it
    # is requested explicitly instead of relying on the library default.
    rd = linear_model.LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
                             C=1, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=None,
                             solver='liblinear')
    print ("20 Fold CV Score: ", np.mean(model_selection.cross_val_score(rd, X_train_featured, y_train, cv=20, scoring='roc_auc')))
  ############### Logistic Regression: Stumbleupon ##################

  ########################## Xgboost: Sigma #########################
  elif n_train == 49352:
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.02
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 3
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = 321
    param['nthread'] = 8
    param['verbose'] = 1
    param['print_every_n'] = 1
    num_rounds = 300
    xgtrain = xgb.DMatrix(X_train , label=y_train)
    # Only the training set is watched; no validation set is evaluated here.
    watchlist = [(xgtrain, 'train')]
    clf = xgb.train(param, xgtrain, num_rounds, watchlist)
    print("Fitted")
  ########################## Xgboost: Sigma #########################

# Postprocessing

In [0]:
def postprocessing():
  """Placeholder for the post-processing stage; it currently does nothing."""
  return None

# Running

Note that each competition uses its own training and testing dataset; see the file paths selected for each row id in the cell below.


In [0]:
import warnings
warnings.filterwarnings('ignore')
drive.mount('/content/gdrive')

# Metadata rows (one per competition) to run, in this order.
row_ids = [3, 228, 379, 335, 137, 504]
# row_ids = [504]
metadata={}

#Set current working directory
cwd = 'gdrive/My Drive/Introduction to Data Science Spring 2019 Term Project/jy2823_yz4953/'

# row id -> (training file, testing file, pandas reader) inside '<competition>/data/'
data_sources = {
  3:   ('train.csv', 'test.csv', pd.read_csv),
  228: ('utensils_train.csv', 'utensils_test.csv', pd.read_csv),
  379: ('train.csv', 'test.csv', pd.read_csv),
  335: ('train.csv', 'test.csv', pd.read_csv),
  137: ('train.tsv', 'test.tsv', pd.read_table),
  504: ('train.json', 'test.json', pd.read_json),
}

for row_id in row_ids:
  # metadata is shared with parseMetaData(), so it is emptied before each row.
  metadata.clear()
  print("************************************************************")  
  parseMetaData(row_id)

  # Load the files registered for this row; an unregistered row id loads nothing.
  if row_id in data_sources:
    train_file, test_file, read_frame = data_sources[row_id]
    data_dir = cwd + metadata['competition_name'] + '/data/'
    train_dir = data_dir + train_file
    test_dir = data_dir + test_file
    train_df = read_frame(train_dir)
    test_df = read_frame(test_dir)

  # Pipeline stages: preprocess -> extract features -> train and report.
  (X_train, X_test, y_train, y_test,
   y_test_age, y_train_age, test, test_data) = preprocessing(train_df, test_df)
  X_train_featured, X_test_featured = feature_extraction(X_train, X_test, y_train, y_test)
  estimation(X_train, X_test, y_train, y_test, test, test_data,
             X_train_featured, X_test_featured, y_test_age, y_train_age)
  print("************************************************************")

Mounted at /content/gdrive
************************************************************
digit-recognizer
['pixel0', 'pixel783']
['label']
[]
Label
accuracy

RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/1
 - 290s - loss: 0.4023 - acc: 0.8720 - val_loss: 0.0553 - val_acc: 0.9824
************************************************************
************************************************************
labelme-lets-eat
['Pixel0', 'Pixel1', 'Pixel98']
[]
[]
Label
accuracy

AdamOptimizer(LEARNING_RATE).minimize(cross_entropy)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
training_accuracy / validation_accuracy => 0.38 / 0.30 for step 0
training_accuracy / validation_accuracy => 0.20 / 0.30 for step 1
training_accuracy /

In [0]:
# Total wall-clock runtime in seconds, measured from `start` (set in the
# first cell of the notebook).
end = time.time()
print(end - start)

1508.5137917995453
