In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV

# Month abbreviation -> two-digit month number, used to rebuild date strings.
_MONTH_TO_NUMBER = {'JAN':'01','FEB':'02','MAR':'03','APR':'04','MAY':'05','JUN':'06','JUL':'07','AUG':'08','SEP':'09','OCT':'10','NOV':'11','DEC':'12'}


def _is_text_dtype(dtype):
    """Return True when *dtype* is an object or string dtype."""
    return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)


def _fill_missing_with_mode(frame, fallback):
    """Fill the NaNs of every column of *frame*, in place, with that column's mode.

    A column without a single observed value has no mode; the mode of the
    same column in *fallback* is used instead when that column exists there.
    Columns for which no mode can be found are left untouched.
    """
    for col in frame.columns[frame.isnull().any().to_numpy()]:
        modes = frame[col].mode()
        if modes.empty and col in fallback.columns:
            modes = fallback[col].mode()
        if not modes.empty:
            frame[col] = frame[col].fillna(modes.iloc[0])


def _label_encode(train, test, col):
    """Replace *col* in both frames, in place, by integer codes fitted on train.

    Codes follow the sorted order of train's distinct values. Test values that
    never occur in train are encoded as -1.
    """
    codes = {value: code for code, value in enumerate(sorted(train[col].unique()))}
    train[col] = train[col].map(codes).astype(int)
    test[col] = test[col].map(codes).fillna(-1).astype(int)


def _group_rare(train, test, col, rare_count):
    """Relabel, in place, every value of *col* seen fewer than *rare_count* times in train as 'Other'.

    The set of kept values is decided on train only and then applied to test,
    so both frames end up with the same vocabulary.
    """
    counts = train[col].value_counts()
    kept = counts[counts >= rare_count].index
    train[col] = train[col].where(train[col].isin(kept), 'Other')
    test[col] = test[col].where(test[col].isin(kept), 'Other')


def _one_hot_encode(train, test, col):
    """Return (train, test) with *col* replaced by 0.0/1.0 indicator columns.

    One indicator `<col>_<k>` is appended per distinct train value, numbered in
    sorted order. A test value unknown to train yields an all-zero row.
    """
    categories = sorted(train[col].unique())
    names = [col+'_%s'%(k) for k in range(len(categories))]
    encoded = []
    for frame in (train, test):
        # Build the indicators on the frame's own index so rows stay aligned
        dummies = pd.DataFrame(
            {name: (frame[col] == value).astype(float) for name, value in zip(names, categories)},
            index=frame.index,
        )
        encoded.append(pd.concat([frame.drop(col, axis=1), dummies], axis=1))
    return encoded[0], encoded[1]


def _timestamp_to_int(x):
    """Turn a string such as '08NOV11:02:00:00' into the integer 20111108020000."""
    return int('20'+x[5:7]+_MONTH_TO_NUMBER[x[2:5]]+x[:2]+x[8:10]+x[11:13]+x[14:])


def Processing(train, test, rare_count=100):
    """Clean one train/test chunk and encode its text columns as numbers.

    The first train column and the last train column are left out of the
    feature statistics (the rest of the file treats them as 'ID' and
    'target'). All statistics that decide which columns to drop or how to
    encode them are computed on `train` and applied to both frames.

    Steps:
      1. drop features missing in more than 50% of the train rows;
      2. drop features with at most one distinct value in train;
      3. fill remaining NaNs with the column mode of the respective frame;
      4. text features with 2 distinct values -> integer codes;
         with 3-5 values -> one-hot; with 6-9 values -> rare values grouped
         as 'Other', then one-hot;
      5. VAR_0237/0342/0274/0200 -> frequency of each value within the frame;
      6. VAR_0217/0204/0075 -> integer timestamps;
      7. VAR_0493/0404 -> -1 where the value is '-1', 0 otherwise.

    The columns named in steps 5-7 are always handled by their dedicated step
    (never by step 4) and are skipped when absent from a frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw chunks. They are not modified; new frames are returned.
    rare_count : int, default 100
        Values seen fewer than this many times in train are grouped as
        'Other' in step 4.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    print('Processing data ...')

    frequency_cols = ['VAR_0237', 'VAR_0342', 'VAR_0274', 'VAR_0200']
    date_cols = ['VAR_0217', 'VAR_0204', 'VAR_0075']
    flag_cols = ['VAR_0493', 'VAR_0404']
    dedicated = set(frequency_cols + date_cols + flag_cols)

    # Clean missing values (drop() returns new frames, so the inputs stay intact)
    missing_share = (train[train.columns[1:-1]].isnull().sum() / train.shape[0]).round(2)
    mostly_missing = missing_share[missing_share > 0.5].index
    train = train.drop(mostly_missing, axis=1)
    test = test.drop(mostly_missing, axis=1)

    # Clean single-valued variables (NaN is not counted as a value)
    cardinality = train[train.columns[1:-1]].nunique()
    constant = cardinality[cardinality <= 1].index
    train = train.drop(constant, axis=1)
    test = test.drop(constant, axis=1)
    text_cardinality = {
        col: count for col, count in cardinality.drop(constant).items()
        if _is_text_dtype(train[col].dtype) and col not in dedicated
    }

    # Filling missing values; every row of every column is inspected
    _fill_missing_with_mode(train, train)
    _fill_missing_with_mode(test, train)

    # Encoding string variables, walking the columns in frame order so the
    # output layout is deterministic

    # For variables with only 2 different values, apply label encoding inline.
    for col, count in text_cardinality.items():
        if count == 2:
            _label_encode(train, test, col)

    # For variables with >2 but <=5 different values, one-hot encode and drop the original variable.
    for col, count in text_cardinality.items():
        if 2 < count <= 5:
            train, test = _one_hot_encode(train, test, col)

    # For variables with >5 but <10 different values, group rare values as "Other" first, then one-hot encode.
    for col, count in text_cardinality.items():
        if 5 < count < 10:
            _group_rare(train, test, col, rare_count)
            train, test = _one_hot_encode(train, test, col)

    # State and name variables: replace each value by how often it occurs in its own frame.
    for col in frequency_cols:
        for frame in (train, test):
            if col in frame.columns:
                frame[col] = frame[col].map(frame[col].value_counts())

    # Datetime variables: convert to numerical variables.
    for col in date_cols:
        for frame in (train, test):
            if col in frame.columns:
                frame[col] = frame[col].apply(_timestamp_to_int)

    # Variables dominated by '-1': keep -1 and collapse everything else to 0.
    for col in flag_cols:
        for frame in (train, test):
            if col in frame.columns:
                frame[col] = np.where(frame[col] == '-1', -1, 0)

    return train,test


def Training(train, test, ID, predict):
    """Fit an XGBoost classifier on one chunk and accumulate its test predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Processed chunk containing the columns 'ID' and 'target'; every other
        column is used as a feature.
    test : pandas.DataFrame
        Processed chunk containing 'ID'; every other column is used as a feature.
    ID, predict : pandas.Series
        Results accumulated from previous chunks (may be empty).

    Returns
    -------
    (ID, predict) : tuple of pandas.Series
        The inputs extended by this chunk's test IDs and by the predicted
        probability taken from the second column of `predict_proba`. Both
        carry a fresh 0..n-1 index so they stay row-aligned.
    """
    print('Training XGB model ...')

    # Parameters are set based on the gridsearchCV result
    model = xgb.XGBClassifier(
        max_depth=3,
        learning_rate=0.10,
        objective='binary:logistic',
        eval_metric='auc',
        n_estimators=100
    )

    model.fit(train.drop(['ID','target'],axis=1), train.target)

    prob = model.predict_proba(test.drop('ID',axis=1))

    # Series.append was removed in pandas 2.0; pd.concat works on old and new
    # versions. ignore_index avoids the repeated 0..n-1 labels that stacking
    # chunk after chunk would otherwise produce.
    ID = pd.concat([ID, test.ID], ignore_index=True)
    predict = pd.concat([predict, pd.Series(prob[:, 1])], ignore_index=True)

    return ID, predict

if __name__ == '__main__':

    # Set up data processing pars
    loadsize = 10000   # rows read from each file per iteration
    skipsize = 1       # 1-based number of the first data row of the current chunk
    # Explicit dtypes: an untyped empty Series is deprecated and defaults to object
    ID = pd.Series([], dtype='int64')
    predict = pd.Series([], dtype='float64')

    # Process and train data chunk by chunk; each train chunk is paired with
    # the test chunk covering the same row numbers.
    while True:
        # Row 0 (the header) is kept, data rows before `skipsize` are skipped
        train = pd.read_csv('../Data/train.csv',skiprows=range(1,skipsize),nrows=loadsize,low_memory=False)
        test = pd.read_csv('../Data/test.csv',skiprows=range(1,skipsize),nrows=loadsize,low_memory=False)

        # A file whose length is an exact multiple of loadsize (or that is
        # shorter than the other one) yields an empty chunk here; there is
        # nothing to fit or predict, so stop instead of crashing downstream.
        # NOTE(review): test rows beyond the last train chunk are never predicted.
        if train.empty or test.empty:
            break

        print('Train and predict data with subset row %s - %s ... ...' %(skipsize, skipsize+loadsize-1))
        rows_read = train.shape[0]
        train, test = Processing(train,test)
        ID, predict = Training(train, test, ID, predict)
        skipsize += loadsize

        # A short chunk means the train file is exhausted
        if rows_read<loadsize:
            break

    # Write the results into the submission file
    print('Writing submission file ...')
    pd.DataFrame({'ID':ID,'target':predict}).to_csv('submission.csv',index=False)
        
    

Train and predict data with subset row 1 - 10000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 10001 - 20000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 20001 - 30000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 30001 - 40000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 40001 - 50000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 50001 - 60000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 60001 - 70000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 70001 - 80000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 80001 - 90000 ... ...
Processing data ...
Training XGB model ...
Train and predict data with subset row 90001 - 100000 ... ...
Proces