In [135]:
import numpy as np
import pandas as pd
import math
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA,TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection,SparseRandomProjection
from sklearn.feature_extraction import DictVectorizer
from xgboost.sklearn import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

# import h2o
# from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn.ensemble import RandomForestRegressor

In [136]:
import time
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import RMSprop
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.models import Sequential
from keras.utils.np_utils import to_categorical  # convert to one-hot-encoding
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')

# Wall-clock start of the run; the final cell reports the time elapsed since here.
start = time.time()

# Master metadata sheet. Malformed rows are skipped with a warning.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement, so try that first and only fall
# back to the old keyword on a pandas that does not know the new one.
metadata_path = "./AutoKaggle - Metadata.csv"
try:
    meta_file = pd.read_csv(metadata_path, encoding='cp1252', on_bad_lines='warn')
except TypeError:
    meta_file = pd.read_csv(metadata_path, encoding='cp1252', error_bad_lines=False)

# Row labels of `meta_file` that the pipeline knows how to handle.
# NOTE(review): 535 is listed in tabular_rows but not in arrOfRows, while
# 523 and 451 appear only in arrOfRows -- confirm which list is current.
arrOfRows = [64, 360, 239, 316, 515, 518, 523, 451]
nlp_rows = [239]
tabular_rows = [64, 360, 316, 515, 518,535]

# The competition processed by this run; change `row` to pick another one.
row = 64
competition_name = meta_file['name'].loc[row]
print(competition_name)

# Per-competition layout on disk.
dir_name = "./" + competition_name
train_file = dir_name + "/data/train.csv"
row_file = dir_name + "/submission/row.csv"

# Single-row metadata for the selected competition; every later cell reads
# its settings from `meta` with `.loc[0]`.
meta = pd.read_csv(row_file)


# Names that metadata snippets may read and rebind while they run.
_PIPELINE_NAMES = ('train', 'train_Y', 'test', 'auxiliary')


def _read_csv_tolerating_bad_lines(path, **kwargs):
    """Read a CSV, warning about and skipping malformed rows.

    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in
    2.0; ``on_bad_lines='warn'`` is its replacement. The new keyword is tried
    first, with a fallback for a pandas that does not know it.
    """
    try:
        return pd.read_csv(path, on_bad_lines='warn', **kwargs)
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False, **kwargs)


def _encode_object_columns(train, test=None):
    """Label-encode every object-dtype column of `train` in place.

    When `test` is given, each encoder is fitted on the values of both
    frames and the matching column of `test` is encoded as well.
    """
    for c in train.columns:
        if train[c].dtype != 'object':
            continue
        lbl = LabelEncoder()
        if test is not None:
            lbl.fit(list(train[c].values) + list(test[c].values))
            train[c] = lbl.transform(list(train[c].values))
            test[c] = lbl.transform(list(test[c].values))
        else:
            lbl.fit(list(train[c].values))
            train[c] = lbl.transform(list(train[c].values))


def _run_metadata_calls(column, scope, *frames):
    """Execute the snippets stored in ``meta[column]`` and return the frames.

    `frames` are the current values of `_PIPELINE_NAMES`, in that order.
    The snippets run with `scope` as their local namespace, so a snippet
    that *rebinds* a name (``train = train.fillna(0)``) takes effect. A bare
    ``exec(call)`` inside a function cannot do that, because writes to the
    function's locals are discarded. Scratch names created by one snippet
    stay in `scope` for the following ones.

    If the metadata cell is not a string, `frames` is returned unchanged.
    """
    spec = meta[column].loc[0]
    if type(spec) is str:
        scope.update(zip(_PIPELINE_NAMES, frames))
        # SECURITY: eval/exec run arbitrary code taken from the metadata CSV;
        # only use metadata files from a trusted source.
        for call in eval(spec):
            exec(call, globals(), scope)
        frames = tuple(scope[name] for name in _PIPELINE_NAMES)
    return frames


def preprocessing(row):
    """Load and prepare the data of the competition described by `meta`.

    The branch is chosen from ``meta['name'].loc[0]``; the `row` argument is
    not used for that decision. Reads the module-level `meta`, `train_file`
    and `dir_name`.

    Returns:
        * kobe-bryant-shot-selection and the generic branch:
          ``(train_X, train_Y, test_X, test_Y, None)``.
        * uciml_sms-spam-collection-dataset: whatever the second metadata
          preprocessing expression evaluates to.
        * mercedes-benz-greener-manufacturing: ``None``.

    NOTE: several branches evaluate strings from the metadata CSV. Those
    strings refer to local names used here (`train`, `test`, `train_Y`,
    `auxiliary`, `sms`, `numeric`, ...), so these names must stay stable.
    """
    check_pred = False  # no separate prediction set is wired up
    train_Y = ''
    test = None
    auxiliary = []
    competition = meta['name'].loc[0]

    if competition == 'kobe-bryant-shot-selection':
        train = pd.read_csv("./kobe-bryant-shot-selection/data/data.csv")
        train = train.dropna()
        train_Y = train['shot_made_flag']
        train = train.drop(['shot_made_flag'], axis=1)
        _encode_object_columns(train)
        train_X, test_X, train_Y, test_Y = train_test_split(
            train, train_Y, test_size=0.3, random_state=9012)

        # Data augmentation.
        # Idea: according to the feature-importance ranking given by the
        #       kernel we referred to, the importance of loc_x is less than
        #       0.01% of the two dominant features. This agrees with our
        #       knowledge of NBA games: shooting from the left or the right
        #       side of the court does not affect shooting accuracy for
        #       professional players.
        #
        #       So we flip the sign of loc_x (left to right, right to left)
        #       and add random noise to other features to get more data,
        #       similar to mirroring an image in image augmentation.
        #
        # The noise is drawn in one vectorized call per column instead of
        # one Python-level call per row; the distributions are unchanged.
        # NOTE(review): arange(-10, 10) and arange(-5, 5) exclude the upper
        # bound, so the position noise is not centred on zero -- confirm.
        n_rows = len(train_X)
        flip_train = train_X.copy(deep=True)
        flip_train['loc_x'] = flip_train['loc_x'].mul(-1)  # flip the sign of x
        flip_train['minutes_remaining'] = flip_train['minutes_remaining'] + np.random.choice(
            [-1, 0, 1], size=n_rows, p=[0.25, 0.5, 0.25])
        flip_train['seconds_remaining'] = flip_train['seconds_remaining'] + np.random.choice(
            [-1, 0, 1], size=n_rows, p=[0.25, 0.5, 0.25])
        flip_train['loc_y'] = flip_train['loc_y'] + np.random.choice(np.arange(-10, 10), size=n_rows)
        flip_train['loc_x'] = flip_train['loc_x'] + np.random.choice(np.arange(-5, 5), size=n_rows)

        # The augmented rows keep the labels of the rows they were copied from.
        train_X = pd.concat([train_X, flip_train])
        train_Y = pd.concat([train_Y, train_Y.copy(deep=True)])
        return train_X, train_Y, test_X, test_Y, None

    if competition == 'mercedes-benz-greener-manufacturing':
        train = pd.read_csv(
            "./mercedes-benz-greener-manufacturing/data/train.csv")
        test = pd.read_csv(
            "./mercedes-benz-greener-manufacturing/data/test.csv")
        # NOTE(review): nothing is done with the loaded frames and nothing
        # is returned, so a caller that unpacks five values will fail --
        # this branch looks unfinished.
        return None

    if competition == 'uciml_sms-spam-collection-dataset':
        row = pd.read_csv(
            "./uciml_sms-spam-collection-dataset/submission/row.csv", encoding='cp1252')
        row_prepro = row['preprocessing function call'][0]
        prepro_ls = eval(row_prepro)
        # The original assigned `sms = train` here, before `train` existed,
        # which raised UnboundLocalError on every call. `sms` is produced by
        # the first metadata expression instead.
        sms = eval(prepro_ls[0])
        train = eval(prepro_ls[1])
        return train

    # ---- generic, metadata-driven branch --------------------------------
    train = pd.read_csv(train_file)
    target_name = str(meta['targetName'].loc[0])
    train_Y = train[[target_name]]
    train = train.drop(columns=target_name)

    # Local namespace shared by all metadata snippets of this run.
    scope = {'row': row, 'check_pred': check_pred, 'target_name': target_name}

    if type(meta['auxiliaryDataURL'].loc[0]) is str:
        for auxi in eval(meta['auxiliaryDataURL'].loc[0]):
            auxiliary.append(pd.read_csv(
                "./" + meta['name'].loc[0] + "/auxiliary_data/" + auxi + ".csv"))
    train, train_Y, test, auxiliary = _run_metadata_calls(
        'auxiliary function calls', scope, train, train_Y, test, auxiliary)

    if type(meta['test set'].loc[0]) is str:
        test = _read_csv_tolerating_bad_lines(dir_name + "/data/test.csv", encoding='cp1252')
    train, train_Y, test, auxiliary = _run_metadata_calls(
        'preprocessing function call', scope, train, train_Y, test, auxiliary)

    # Drop incomplete rows from the features AND from the labels. The labels
    # were split off before this point, so dropping rows from `train` alone
    # left the two with different lengths and broke train_test_split.
    complete_rows = train.notna().all(axis=1).values
    if len(train_Y) == len(complete_rows):
        train_Y = train_Y[complete_rows]
    train = train.dropna()

    if type(meta["unwanted column"].loc[0]) is str:  # check if there's unwanted column
        column_list = eval(meta["unwanted column"].loc[0])
        train = train.drop(column_list, axis=1)
    if type(meta["numeric column"].loc[0]) is str:
        # Not used here directly; exposed to the augmentation snippets.
        scope['numeric'] = eval(meta["numeric column"].loc[0])

    train, train_Y, test, auxiliary = _run_metadata_calls(
        'augmentation function calls', scope, train, train_Y, test, auxiliary)

    # `test` takes part in the encoding only when a prediction set is in use.
    _encode_object_columns(train, test if check_pred else None)

    train_X, test_X, train_Y, test_Y = train_test_split(train, train_Y, test_size=0.2)
    return train_X, train_Y, test_X, test_Y, None


# Run preprocessing once for the selected row. The result is not bound to a
# name; as the last expression of the cell it is echoed as the cell output.
preprocessing(row)

kobe-bryant-shot-selection


(       action_type  combined_shot_type  game_event_id   game_id      lat  \
 24641           25                   3            285  29800350  33.9013   
 12860           25                   3            380  20701018  33.8603   
 19424            5                   4             41  21200012  34.0373   
 13853           35                   0            363  20800538  33.9993   
 25495           49                   5            130  29900582  34.0443   
 27449           25                   3             27  40300233  33.8193   
 14099           25                   3            183  20800709  34.0373   
 9791            25                   3            192  20600127  33.9163   
 23072           40                   3            480  29600603  34.0213   
 9624            25                   3             22  20501185  33.8223   
 26503           40                   3            144  40000085  33.9513   
 17194           25                   3            324  21000560  33.9133   

In [137]:
def text_process(text, stop_words=None):
    """Strip punctuation and stop words from `text`.

    Args:
        text: the string to clean.
        stop_words: optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The remaining words, in their original case, joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Load the list once per call and use a set: the original reloaded the
    # NLTK list and scanned it linearly for every single word.
    stop_words = frozenset(stop_words)

    text = text.translate(str.maketrans('', '', string.punctuation))
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

def feature_extraction(row, X_train, X_test, X_pred=None):
    """Add extracted features as described by the module-level `meta`.

    Args:
        row: metadata row label of the competition (not used here).
        X_train: training frame. For the sms competition this is the whole
            message frame and must contain 'message' and 'label' columns.
        X_test: test frame, or None.
        X_pred: optional prediction frame, or None (the default, so that
            three-argument calls work).

    Returns:
        * sms competition: ``(features_train, features_test, labels_train,
          labels_test)`` from a 70/30 split.
        * otherwise: ``(X_train, X_test, X_pred)``. The frames are modified
          in place: every configured extractor adds its components as
          columns named ``extractor<i>_<j>``.
    """
    if meta['name'].loc[0] == 'uciml_sms-spam-collection-dataset':
        rowcsv = pd.read_csv(
            "./uciml_sms-spam-collection-dataset/submission/row.csv")
        row_extract = rowcsv['featureExtractor function call'].loc[0]
        # SECURITY: the expressions below come from the metadata CSV and are
        # evaluated as code. They refer to the local names used here (`sms`,
        # `text_feat`, `vectorizer`), so those names must not be changed.
        sms = X_train
        extract = eval(row_extract)
        sms['message'] = eval(extract[0])
        sms['message'] = eval(extract[1])
        text_feat = sms['message'].apply(str).copy()
        text_feat = eval(extract[2])
        vectorizer = eval(extract[3])
        features = eval(extract[4])
        features_train, features_test, labels_train, labels_test = train_test_split(
            features, sms['label'], test_size=0.3)
        return features_train, features_test, labels_train, labels_test

    if type(meta["featureExtractor function call"].loc[0]) is not str:
        print('not func')
        return X_train, X_test, X_pred

    # The original read these two cells from `row_data`, a name that is not
    # defined anywhere in this notebook; `meta` is the metadata frame in use.
    # NOTE(review): confirm that `meta` really carries these two columns.
    extraction_function_calls = str(
        meta["function call feature extraction"].loc[0]).split(",")
    extraction_funtion_param = eval(
        meta["function parameters feature extraction"].loc[0])

    # Every extractor is fitted on the original feature columns only, not on
    # the components added by the extractors that ran before it.
    base_columns = list(X_train.columns)

    for i in range(len(extraction_function_calls)):
        # Splice the parameter string in right after the first "(":
        # "PCA()" + "n_components=5" -> "PCA(n_components=5)".
        str_call = extraction_function_calls[i].strip().replace(
            "(", "(" + extraction_funtion_param[i], 1)
        extractor = eval(str_call, globals())

        extracted_train = extractor.fit_transform(X_train[base_columns])
        n_comp = extracted_train.shape[1]
        for j in range(n_comp):
            X_train['extractor' + str(i) + "_" + str(j)] = extracted_train[:, j]

        # Project the other frames with the extractor fitted on the training
        # data. Re-fitting on them (as the original did for X_test) yields
        # components that are not comparable with the training columns.
        for frame in (X_test, X_pred):
            if frame is None:
                continue
            extracted = extractor.transform(frame[base_columns])
            for j in range(n_comp):
                frame['extractor' + str(i) + "_" + str(j)] = extracted[:, j]

    # Return only after ALL extractors ran, and always as a 3-tuple so the
    # shape matches the "no extractor configured" path above.
    return X_train, X_test, X_pred

In [138]:
def feature_selection():
    """Placeholder for a feature-selection stage; does nothing yet."""
    return None

In [139]:
def estimation(row, X_train, X_test, Y_train, Y_test):
    """Fit the estimator(s) named in the metadata and score them.

    Args:
        row: metadata row label; rows listed in `nlp_rows` use the sms
            competition's own metadata file.
        X_train, X_test, Y_train, Y_test: the train/test split.

    Returns:
        * nlp rows: the accuracy if the configured metric is 'accuracy',
          otherwise None.
        * one estimator: the accuracy (classification) or mean squared error
          (regression), which is also printed; None for other task types.
        * neural-network initialisation configured: None.
        * several estimators: the result of
          ``postprocessing(estimators, stack=True)``.

    SECURITY: estimator definitions are strings from the metadata CSV and
    are evaluated as code; only use metadata from a trusted source.
    """
    if row in nlp_rows:
        rowcsv = pd.read_csv(
            "./uciml_sms-spam-collection-dataset/submission/row.csv")
        row_extract = eval(rowcsv['estimator1 function call'].loc[0])
        # The three expressions refer to local names (`mnb` and the function
        # arguments), so these names must not be changed.
        mnb = eval(row_extract[0])
        eval(row_extract[1])
        pred = eval(row_extract[2])
        if rowcsv['performanceMetric'].loc[0] == 'accuracy':
            return accuracy_score(Y_test, pred)
        return None

    estimation_function_calls = eval(
        meta["estimator1 function call"].loc[0])
    print(estimation_function_calls)
    if not estimation_function_calls:
        return None

    if len(estimation_function_calls) == 1:
        if type(meta['neural network initialization'].loc[0]) is str:
            neural_net_calls = eval(
                meta['neural network initialization'].loc[0])
            # One shared namespace for all snippets, so that a name bound by
            # one of them (e.g. a model object) is visible to the next. With
            # a bare exec(call) this is not guaranteed on Python 3.13+.
            scope = dict(locals())
            for call in neural_net_calls:
                exec(call, globals(), scope)
            return None

        # Build the estimator from its constructor string.
        estimator = eval(estimation_function_calls[0], globals())
        estimator.fit(X_train, Y_train)
        Y_pred = estimator.predict(X_test)

        # The score is printed as before and now also returned, matching
        # the nlp branch.
        score = None
        if meta["taskType"].loc[0] == 'classification':
            score = accuracy_score(Y_test, Y_pred)
            print(score)
        if meta["taskType"].loc[0] == 'regression':
            score = mean_squared_error(Y_test, Y_pred)
            print("Mean Squared Error is: ", score)
        return score

    # Several estimators: build each from its own constructor string. The
    # original loop used names that only exist in feature_extraction
    # (`extraction_function_calls`, `str2`) and could not run.
    estimators = [eval(call, globals()) for call in estimation_function_calls]
    # Hand the complete list over once, not once per loop iteration.
    return postprocessing(estimators, stack=True)

In [140]:
# Text competitions: preprocessing returns a single frame, and
# feature_extraction does the train/test split itself.
if row in nlp_rows:
    train_set = preprocessing(row)
    # feature_extraction takes (row, X_train, X_test, X_pred). The original
    # call passed only three arguments and raised TypeError; there is no
    # test or prediction frame at this stage, so both are None.
    X_train, X_test, Y_train, Y_test = feature_extraction(row, train_set, None, None)
    # For nlp rows estimation() returns the score, not predictions.
    Y_pred = estimation(row, X_train, X_test, Y_train, Y_test)
    print(Y_pred)


# Tabular competitions: preprocessing already returns the split.
if row in tabular_rows:
    X_train, Y_train, X_test, Y_test, X_pred = preprocessing(row)
    X_train, X_test, X_pred = feature_extraction(row, X_train, X_test, X_pred)
    estimation(row, X_train, X_test, Y_train, Y_test)

not func
['RandomForestClassifier(n_jobs=-1, n_estimators=70, max_depth=7, random_state=2016)']
0.6736705577172504


In [141]:
# if meta['name'].loc[row] == 'uciml_sms-spam-collection-dataset':
#     train_set = preprocessing(row)
#     X_train, X_test, Y_train, Y_test = feature_extraction(row, train_set, None)
#     Y_pred = estimation(row, X_train, X_test, Y_train, Y_test)
#     print(Y_pred)

In [142]:
def postprocessing(estimators,stack):
    """Placeholder for combining fitted estimators; does nothing yet.

    Args:
        estimators: list of estimator objects.
        stack: flag passed by the caller; currently ignored.
    """
    return None

In [143]:
# Report the wall-clock time elapsed since `start` was recorded.
end = time.time()
elapsed_seconds = end - start
print("Use", elapsed_seconds, "seconds to run this.")

Use 10.420620679855347 seconds to run this.
