In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Preprocessing

In [3]:
def train_preprocess(df):
    """Clean the raw training frame and standardize its feature columns.

    The ``id`` column is removed, rows containing any NaN are dropped, and the
    remaining columns are split positionally: every column but the last is a
    feature, the last column is the target. Features are then centred and
    scaled with statistics computed on this (cleaned) training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data. Must contain an ``id`` column. The target is
        assumed to be the last column -- confirm against the CSV layout.

    Returns
    -------
    df_X : pandas.DataFrame
        Standardized feature columns.
    df_Y : pandas.Series
        Target column, values untouched.
    train_mean : pandas.Series
        Per-feature mean used for centring.
    train_std : pandas.Series
        Per-feature scale actually used for the division. Entries whose
        standard deviation was zero (constant column) or NaN (fewer than two
        rows) are returned as 1.0 so the same statistics can be applied to
        other data without producing NaN/inf.
    """
    # Drop id column
    df = df.drop("id",axis = 1)
    # Drop Nan
    df = df.dropna()
    # Divide X,Y
    df_X = df.iloc[:,:-1]
    df_Y = df.iloc[:,-1]
    # Normalization
    train_mean = df_X.mean()
    train_std = df_X.std()
    # A constant column has std 0 and would turn into NaN after the division;
    # scale such columns by 1 so they simply become all zeros after centring.
    train_std = train_std.where(train_std > 0, 1.0)
    df_X = (df_X-train_mean)/train_std
    return df_X,df_Y,train_mean,train_std

In [4]:
# Load the raw training table, then split it into standardized features,
# labels, and the scaling statistics that are reused for the test set.
train = pd.read_csv("../data/train.csv")
(train_X, train_Y,
 train_mean, train_std) = train_preprocess(train)

In [5]:
# Display the label Series returned by train_preprocess as a quick sanity check.
train_Y

0     1
1     0
2     0
3     0
4     0
     ..
82    0
83    0
84    1
85    1
86    1
Name: label, Length: 86, dtype: int64

## Model Selection

In [7]:
# Hyper-parameter grid: 30 evenly spaced tree counts between 100 and 3000,
# crossed with 2, 3 or 4 candidate features per split.
parameters = {'n_estimators':np.linspace(100,3000,30).astype(int),'max_features': [2,3,4]}
# Seed the forest so the selected parameters and score are reproducible from
# run to run; an unseeded forest can pick a different "best" setting each time.
grid_search = GridSearchCV(RandomForestClassifier(random_state = 0),parameters,scoring = 'accuracy',n_jobs = 4)
grid_search.fit(train_X, train_Y)
print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'max_features': 4, 'n_estimators': 1900}
best scrores:  0.9307189542483659


## Train

In [8]:
def trainingRF(X,Y,n_estimators = 1900,max_features = 4,random_state = None):
    """Fit a random-forest classifier on the given features and labels.

    Parameters
    ----------
    X : array-like
        Training features, passed unchanged to ``fit``.
    Y : array-like
        Training labels, passed unchanged to ``fit``.
    n_estimators : int, default 1900
        Number of trees. The default is the value this function previously
        hard-coded.
    max_features : int, default 4
        Number of features considered per split. The default is the value
        this function previously hard-coded.
    random_state : int or None, default None
        Seed forwarded to the classifier. ``None`` keeps the previous
        unseeded behaviour; pass an int for a reproducible model.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    rf = RandomForestClassifier(
        n_estimators = n_estimators,
        max_features = max_features,
        random_state = random_state,
    )
    rf.fit(X,Y)
    return rf

In [9]:
trained_rf = trainingRF(train_X,train_Y)

## Predict

In [10]:
def test_preprocess(df,mean,std):
    # Drop id column
    df = df.drop("id",axis = 1)
    # Normalization
    df = (df-mean)/std
    return df

In [11]:
# Standardize the test set with the training statistics, predict, and write
# an (id, label) submission file.
# NOTE(review): train.csv is read from "../data/" while this path is relative
# to the working directory -- confirm the location is intended.
test = pd.read_csv("test.csv")
test_X = test_preprocess(test,train_mean,train_std)
# Both pieces are turned into column vectors so they can sit side by side.
ids = np.array(test['id'])[:,np.newaxis]
pre_Y = trained_rf.predict(test_X)[:,np.newaxis]
# write into csv
res = pd.DataFrame(np.concatenate((ids,pre_Y),axis = 1),columns = ['id','label'])
res.to_csv('./submission.csv',index = False)