In [0]:
import os
import cv2
import json
import random
import gc 
import numpy as np
import pandas as pd

import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.neighbors as neighbors
import sklearn.naive_bayes as naive_bayes
import sklearn.linear_model as linear_model

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing as preproc
from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, mean_absolute_error, roc_curve, auc, confusion_matrix

from google.colab import files
from google.colab import drive

import time

# Google Colab Setting

In [220]:
# Accessing Google sheets
!pip install --upgrade -q gspread
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

worksheet = gc.open('Metadata1').worksheet('Sheet1')

# get_all_values gives a list of rows
rows_ = worksheet.get_all_values()
print(rows_)

#rows = pd.read_excel('Metadata.xlsx')
# Convert to a DataFrame and render.
#import pandas as pd
rows = pd.DataFrame.from_records(rows_)
print(rows)

new_header = rows.iloc[0] #grab the first row for the header
rows = rows[1:] #take the data less the header row
rows.columns = new_header #set the header row as the df header

[['Id', '', 'name', 'URL', 'endDate', 'dataSize', 'table', 'image', 'audio', 'video', 'data type', 'text/csv', 'text/json', 'text/tab-separated-values', 'image/bmp', 'image/jpeg', 'image/png', 'image/tiff', 'audio/x-wav', 'audio/x-aiff', 'video/mp4', 'data format', 'columns [index;name;type;...] for type use categorical, numerical, string, integer, dateTime etc', 'augmented dataset URL', 'taskType', 'taskSubType', 'outputType', 'targetIndex', 'targetName', 'rawData (non csv) ', 'rawDataIndex', 'problemDescription', 'preprocessing', 'preprocessing function call', 'featureExtractor', 'featureExtractor function call', 'featureSelector', 'featureSelector function call', 'sklearn', 'xgboost', 'keras', 'tensorflow', 'lightgbm', 'Libraries', 'estimators', 'estimator1', 'estimator1 function call', 'estimator2', 'estimator2 function call', 'estimator3', 'estimator3 function call', 'postprocessing', 'postprocessing function call', 'performanceMetric', 'crossValidationPerformance', 'codeURIRunnin

In [0]:
# import xlrd
# # worksheet = xlrd.open_workbook('Metadata.xlsx').worksheet('Sheet1')

# workbook = xlrd.open_workbook('Metadata.xlsx')

# worksheet = workbook.sheet_by_index(0)
# # get_all_values gives a list of rows
# rows_ = worksheet.get_all_values()

# #rows = pd.read_excel('Metadata.xlsx')
# # Convert to a DataFrame and render.
# #import pandas as pd
# rows = pd.DataFrame.from_records(rows_)
# print(rows)

# new_header = rows.iloc[0] #grab the first row for the header
# rows = rows[1:] #take the data less the header row
# rows.columns = new_header #set the header row as the df header

In [0]:
def alpha_to_number(alpha_key):
  return sum([(ord(alpha)-64)*(26**ind) for ind, alpha in enumerate(list(alpha_key)[::-1])]) - 1

# Mapping from Metadata sheet column name to readable columns
column_key = {'name': 'C', 'columns': 'W', 'estimator_func_call': 'AU', 'target_name': 'AC', 'output_type': 'AA', 'performance_metric': 'BB', 'feature_selector': 'AL', 'data_form': 'V','feature_extractor':'AJ'}
column_key = dict(map(lambda kv: (kv[0], alpha_to_number(kv[1])), column_key.items()))

# Mount at Google Drive

If cannot read from the file,  please rerun this statement until "gdrive/My Drive" appears on the left bar

In [223]:
# Mount Google Drive
drive.mount('/gdrive')

Mounted at /gdrive


# Metadata Parsing

In [0]:
def parseMetaData(row_id):
  
  
  metadata['competition_name'] = rows.loc[row_id][column_key['name']]
  metadata['estimator'] = rows.loc[row_id][column_key['estimator_func_call']]
  metadata['target_column'] = rows.loc[row_id][column_key['target_name']]
  metadata['output_type'] = rows.loc[row_id][column_key['output_type']].split(',')
  metadata['metric'] = rows.loc[row_id][column_key['performance_metric']]
  metadata['feature_selector'] = rows.loc[row_id][column_key['feature_selector']]
  metadata['feature_extractor'] = rows.loc[row_id][column_key['feature_extractor']]
  metadata['data_form'] = rows.loc[row_id][column_key['data_form']]
  columns = rows.loc[row_id][column_key['columns']]

  # Parse column information 
  numeric_columns = []
  unwanted_columns = []
  categorical_columns = []
  columns_data = [x.strip() for x in columns[1:-1].split(';')]
  #print(columns_data)
  for ind, val in enumerate(columns_data):
    if ind%3 == 2:
      if (val == "numeric" or val == "integer" or val == "real"):
        numeric_columns.append(columns_data[ind-1])
      elif val == "categorical":
        categorical_columns.append(columns_data[ind-1])
      elif val == "unwanted" or val == "string" or val == 'dateTime':
        unwanted_columns.append(columns_data[ind-1])
    else:
      pass 
  metadata['numeric_columns'] = numeric_columns
  metadata['unwanted_columns'] = unwanted_columns
  metadata['categorical_columns'] = categorical_columns
  
  # Remove target from features columns
  if metadata['target_column'] in metadata['numeric_columns']:
    metadata['numeric_columns'].remove(metadata['target_column'])
  if metadata['target_column'] in metadata['categorical_columns']:
    metadata['categorical_columns'].remove(metadata['target_column'])
  if metadata['target_column'] in metadata['unwanted_columns']:
    metadata['unwanted_columns'].remove(metadata['target_column'])
  
  print(metadata['competition_name'])
  print(metadata['numeric_columns'])
  print(metadata['categorical_columns'])
  print(metadata['unwanted_columns'])
  print(metadata['target_column'])
  print(metadata['metric'])
  print(metadata['feature_selector'])
  print(metadata['feature_extractor'])
  print(metadata['estimator'])

# Add relevent import

In [225]:
# Installations
import warnings
import random
from math import exp
warnings.filterwarnings('ignore')

# Imports
# Preprocessing imports
import seaborn as sns
%matplotlib inline
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

import itertools

from keras.utils.np_utils import to_categorical 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, MaxPooling2D
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator,img_to_array,load_img
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
import scipy
import re

# Other initializations
sns.set(style='white', context='notebook', palette='deep')
epochs_completed = 0
index_in_epoch = 0

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Auxiliary

**Applied to Digit Recognizer**

1.   Data Description:




*   Original training data set: 42.0k x 785 (label x 1, pixel x 784)
*   Auxiliary training data set: 60.0k x 785 (label x 1, pixel x 784)

> Reference: https://www.kaggle.com/krissa10/train-digit-recognition-mnist





*   Concat training data set: 102.0k x 785 (label x 1, pixel x 784)









2.   Accuracy Performance:


*   X_train after augmentaion :  0.8656 (training) 0.9783(validation) Running time: 1557084685.2798023s

*   X_train combine with X_train_auxiliary after augmentation: 0.9271(training) 0.9893(validation) Running time: 1557088917.688372s

We can observe a significant impovement in data training and validation 

In [0]:
def create_auxi(train_df):
  print(train_df.shape)
  drive.mount('/content/gdrive')
  cwd = 'gdrive/My Drive/Introduction to Data Science Spring 2019 Term Project/jy2823_yz4953/digit-recognizer'
  aux_dir = cwd + '/auxiliary_data/train_auxiliary.csv'
  aux_df = pd.read_csv(aux_dir)
  train_df = pd.concat([train_df, aux_df], axis=0)
  print(train_df.shape)
  return train_df

# Preprocessing

In [0]:
def preprocessing(train_df):
  if metadata['competition_name']=='dogs-vs-cats-redux-kernels-edition':
    train_dogs = [train_df+'/dog/{}'.format(i) for i in os.listdir(train_df+'/dog') ]  #get dog images if 'dog' in i
    train_cats = [train_df+'/cat/{}'.format(i) for i in os.listdir(train_df+'/cat') ]  #get cat images if 'cat' in i
    train_imgs = train_dogs[2000:4000] + train_cats[2000:4000]  # slice the dataset and use 2000 in each class
    random.shuffle(train_imgs)  # shuffle it randomly
    #Clear list that are useless
    del train_dogs
    del train_cats
    #gc.collect()   #collect garbage to save memory
    nrows = 150
    ncolumns = 150
    channels = 3  #change to 1 if you want to use grayscale image
    #get the train and label data
    X, y = read_and_process_image(train_imgs,nrows,ncolumns)

    import seaborn as sns
    del train_imgs
    #gc.collect()

    #Convert list to numpy array
    X = np.array(X)
    y = np.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2)

    #clear memory
    del X
    del y
    #gc.collect()
  
  else:  
    #get X and y from input data
    if metadata['competition_name']=='digit-recognizer':
      train_df = create_auxi(train_df)
      X = train_df.drop(metadata['target_column'], 1)
      y = train_df[metadata['target_column']]
      X = X / 255.0
      # Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)
      X = X.values.reshape(-1,28,28,1)
      # Encode labels to one hot vectors
      y = to_categorical(y, num_classes = (np.max(y)+1))
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=2)
    
    elif metadata['competition_name']=='spooky-author-identification':
      #test_data = test_df.loc[:,'text'].reset_index(drop=True)
      stratifiedCV = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=1)
      trainInds, validInds = next(stratifiedCV.split(train_df['text'], train_df['author']))
      X_train = train_df.loc[trainInds,'text'].reset_index(drop=True)
      X_test  = train_df.loc[validInds,'text'].reset_index(drop=True)
      trainLabel = train_df.loc[trainInds,'author'].reset_index(drop=True)
      validLabel = train_df.loc[validInds,'author'].reset_index(drop=True)
      yLabelEncoder = preproc.LabelEncoder()
      yLabelEncoder.fit(pd.concat((trainLabel,validLabel)))
      y_train = yLabelEncoder.transform(trainLabel)
      y_test = yLabelEncoder.transform(validLabel)
  
    else:
      X = train_df.drop(metadata['target_column'], 1)
      y = train_df[metadata['target_column']]
      X = X.filter(metadata['numeric_columns'] + metadata['categorical_columns'])
  
      # treat missing values
      pd.set_option('mode.chained_assignment', None) # used to subside the panda's chain assignment warning
      imp = SimpleImputer(missing_values=np.nan, strategy='mean')
      for col in metadata['numeric_columns']:
        X[[col]] = imp.fit_transform(X[[col]])
    
      # Categorial transform  
      for col in metadata['categorical_columns']:
        col_dummies = pd.get_dummies(X[col], dummy_na=True)
        X = pd.concat([X, col_dummies], axis=1)
      X.drop(metadata['categorical_columns'], axis=1, inplace=True)
  
      # Feature normalization
      X[metadata['numeric_columns']] = preproc.scale(X[metadata['numeric_columns']])

      X_train, X_test, y_train, y_test = train_test_split(X, y)
  
      
  return X_train, X_test, y_train, y_test


# Image Preprocessing

In [0]:
#A function to read and process the images to an acceptable format for our model
def read_and_process_image(list_of_images, nrows,ncolumns,):
    """
    Returns two arrays: 
        X is an array of resized images
        y is an array of labels
    """
    X = [] # images
    y = [] # labels
    
    for image in list_of_images:
        X.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR), (nrows,ncolumns), interpolation=cv2.INTER_CUBIC))  #Read the image
        #get the labels
        if 'dog' in image:
            y.append(1)
        elif 'cat' in image:
            y.append(0)
    
    return X, y

# Feature Extraction

In [0]:
def feature_extraction(X_train, X_test, y_train, y_test):
  if metadata['competition_name']=='spooky-author-identification':
    ngramLength = 5
    print('fitting "CountVectorizer()" for bag of char %d-grams' %(ngramLength))
    BagOfCharsExtractor = CountVectorizer(min_df=8, max_features=250000, 
                                          analyzer='char', ngram_range=(1,ngramLength), 
                                          binary=False,lowercase=True)
    BagOfCharsExtractor.fit(pd.concat((X_train,X_test)))
    X_train_char = BagOfCharsExtractor.transform(X_train)
    X_valid_char = BagOfCharsExtractor.transform(X_test)
    ngramLength = 2
    print('fitting "CountVectorizer()" for bag of word %d-grams' %(ngramLength))
    BagOfWordsExtractor = CountVectorizer(min_df=5, max_features=250000, 
                                          analyzer='word', ngram_range=(1,ngramLength), 
                                          binary=False,lowercase=True)
    BagOfWordsExtractor.fit(pd.concat((X_train,X_test)))
    X_train_word = BagOfWordsExtractor.transform(X_train)
    X_valid_word = BagOfWordsExtractor.transform(X_test)
    # combine and scale features 
    X_train = scipy.sparse.hstack((X_train_word,X_train_char))
    X_test = scipy.sparse.hstack((X_valid_word,X_valid_char))
    stdScaler = preproc.StandardScaler(with_mean=False)
    stdScaler.fit(scipy.sparse.vstack(((X_train,X_test))))
    X_train = stdScaler.transform(X_train)
    X_test = stdScaler.transform(X_test)
 
  else:
    extractor = eval(metadata['feature_extractor'])
    X_train = extractor.fit_transform(X_train, y_train)
    X_test = extractor.fit_transform(X_test, y_test)
 
  return X_train, X_test, y_train, y_test

# Feature Selection

In [0]:
def feature_selection(X_train, X_test, y_train, y_test):  
  selector = eval(metadata['feature_selector'])
  X_train = selector.fit_transform(X_train, y_train)
  X_test = selector.fit_transform(X_test, y_test)
  return X_train, X_test, y_train, y_test

# **Data Augmentation**

In [0]:
#for image augmentation
def create_datagen():
  datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0.1, # Randomly zoom image 
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images
  return datagen
  

# **Neurual Network**

In [0]:
#for CNN layer setting
def CNN1(model):
    model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu', input_shape = (28,28,1)))
    model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu'))
    model.add(MaxPool2D(pool_size=(2,2)))
    model.add(Dropout(0.25))
    model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', activation ='relu'))
    model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', activation ='relu'))
    model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(256, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation = "softmax"))   
    return model

In [0]:
def CNN2(model):
    model.add(Conv2D(32, (3, 3), activation='relu',input_shape=(150, 150, 3)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dropout(0.5))  #Dropout for regularization
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  #Sigmoid function at the end because we have just two classes
    return model

# Estimation

In [0]:
def estimation(X_train, X_test, y_train, y_test): 
  
    start = time.time() 
 
    model = eval(metadata['estimator'])
   
  ######################### Keras&CNN: digit ######################### 
    if metadata['competition_name']=='digit-recognizer':   
      CNN1(model)
      # Define the optimizer
      optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
      # Compile the model
      model.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])
      # Set a learning rate annealer
      learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=1, factor=0.5, min_lr=0.00001)   
      epochs = 30 # Turn epochs to 30 to get 0.9967 accuracy
      batch_size = 86
      # With data augmentation to prevent overfitting (accuracy 0.99286)#####################Using Data Augmentation
      datagen = create_datagen()
      datagen.fit(X_train)
      # Fit the model
      history = model.fit_generator(datagen.flow(X_train,y_train, batch_size=batch_size),
                              epochs = epochs, validation_data = (X_test,y_test),
                              verbose = 2, steps_per_epoch=X_train.shape[0] // batch_size
                              , callbacks=[learning_rate_reduction])  
  ######################### Keras&CNN: digit #########################
  
  
  ####################### Keras&CNN: Dog vs Cat ####################### 
    elif metadata['competition_name']=='dogs-vs-cats-redux-kernels-edition':
      ntrain = len(X_train)
      ntest = len(X_test)
      batch_size = 32 
      CNN2(model)
      model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=1e-4), metrics=['acc'])
      #hist=model.fit(X_train,y_train,epochs=64,batch_size=batch_size,validation_data=(X_test,y_test))
  
      #train_datagen,val_datagen=create_datagen2()
      train_datagen=create_datagen()
      train_datagen.fit(X_train)
      #val_datagen.fit(X_test)
      #train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)
      #val_generator = val_datagen.flow(X_test, y_test, batch_size=batch_size)
      history = model.fit_generator(train_datagen.flow(X_train, y_train, batch_size=batch_size),
                              steps_per_epoch=ntrain // batch_size,
                              epochs=64,
                              validation_data=(X_test,y_test),
                              validation_steps=ntest // batch_size)

  ####################### Keras&CNN: Dog vs Cat #######################
   
    else:
      model.fit(X_train, y_train)
      predict = model.predict(X_test)
      if metadata['metric'] == "rmse":  
        error = np.sqrt(mean_squared_error(y_test, predict))
      elif metadata['metric'] == "accuracy":
        error = accuracy_score(y_test, predict)
      elif metadata['metric'] == "auc":
        fpr, tpr, _ = roc_curve(y_test, predict)
        error = auc(fpr, tpr)
      elif metadata['metric'] == "logloss":
        proba = model.predict_proba(X_test)
        error = log_loss(y_test, proba)
      print(error)
  
    #print running time
    end = time.time()   
    print("Running time is:"+str(end-start) + 's')

# Running

Please refer to different training and testing dataset.


In [236]:
import warnings
warnings.filterwarnings('ignore')
drive.mount('/content/gdrive')

row_ids = [1,2,3,4] #3digit,534titanic,335author,555dog-vs-cat
metadata={}

#Set current working directory
cwd = 'gdrive/My Drive/Introduction to Data Science Spring 2019 Term Project/jy2823_yz4953/'

for row_id in row_ids:
  metadata.clear()
  print("************************************************************")  
  parseMetaData(row_id)
  if metadata['competition_name']=='dogs-vs-cats-redux-kernels-edition':
    train_df =cwd+metadata['competition_name'] + '/raw data'
  else:  
    competition_dir = cwd + metadata['competition_name'] + '/data/train.'+metadata['data_form']
    #read data for different types
    if metadata['data_form']=='csv':
      train_df = pd.read_csv(competition_dir)

  X_train, X_test, y_train, y_test = preprocessing(train_df)
  if metadata['feature_selector'].lower() != 'none':
     X_train, X_test, y_train, y_test = feature_selection(X_train, X_test, y_train, y_test)
  if metadata['feature_extractor']:
     X_train, X_test, y_train, y_test = feature_extraction(X_train, X_test, y_train, y_test)    
  estimation(X_train, X_test, y_train, y_test)
  print("************************************************************")
#   X_train, X_test, y_train, y_test = preprocessing(train_df)
#   if metadata['feature_selector'].lower() != 'none':
#      X_train, X_test, y_train, y_test = feature_selection(X_train, X_test, y_train, y_test)
#   if metadata['feature_extractor'].lower() !='none':
#     X_train, X_test, y_train, y_test = feature_extraction(X_train, X_test, y_train, y_test)    
#   estimation(X_train, X_test, y_train, y_test)
#   print("************************************************************")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
************************************************************
digit-recognizer
['pixel0', 'pixel783']
['label']
[]
Label
accuracy
none

Sequential()
(42000, 785)
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
(102000, 785)
Epoch 1/30
 - 23s - loss: 0.2414 - acc: 0.9251 - val_loss: 0.0398 - val_acc: 0.9876
Epoch 2/30
 - 21s - loss: 0.0811 - acc: 0.9761 - val_loss: 0.0277 - val_acc: 0.9908
Epoch 3/30
 - 21s - loss: 0.0647 - acc: 0.9816 - val_loss: 0.0299 - val_acc: 0.9921
Epoch 4/30
 - 23s - loss: 0.0603 - acc: 0.9831 - val_loss: 0.0242 - val_acc: 0.9929
Epoch 5/30
 - 22s - loss: 0.0584 - acc: 0.9837 - val_loss: 0.0384 - val_acc: 0.9916
Epoch 6/30
 - 21s - loss: 0.0607 - acc: 0.9836 - val_loss: 0.0281 - val_acc: 0.9928
Epoch 7/30
 - 20s - loss: 0.0625 - acc: 0.9838 - val_loss: 