# Import

In [281]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn import tree, metrics, ensemble
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import tensorflow as tf
from sklearn.utils import shuffle
from keras import Sequential
from keras import regularizers
from keras.layers import Dense, Dropout
import random

from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression

from scipy import interp
from itertools import cycle
from sklearn.preprocessing import label_binarize

import csv
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

# For evaluation(first)

In [91]:
#accuracy
def acc(tol,label,pred):
    correct=list()
    for i in range(len(label)):
        t=(abs(list(label)[i]-list(pred)[i])<=tol)
        correct.append(t)
    c=np.array(correct).astype('int')
    return np.mean(c)

In [3]:
#print acc
def evalacc(model,x_train,x_test,y_train,y_test):
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    #train score
    print('train accuracy score: ',acc(1,y_train,y_train_pred))

    #test score
    print('test accuracy score: ',acc(1,y_test,y_test_pred))

In [4]:
#accuracy & confusion matrix
def evaluation(model,x_train,x_test,y_train,y_test):
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    #train score
    print('train accuracy score: ',acc(1,y_train,y_train_pred))

    #test score
    print('test accuracy score: ',acc(1,y_test,y_test_pred))

    #confusion matrix
    print('train confusion matrix:\n',metrics.confusion_matrix(y_train,y_train_pred))
    print('test confusion matrix:\n',metrics.confusion_matrix(y_test,y_test_pred))


# Data

In [6]:
data_train = pd.read_csv('./train_set.csv',encoding='gb18030')
data_test = pd.read_csv('./test_set.csv',encoding='gb18030')

X_train = data_train.iloc[:, 2:14].fillna(0).astype(float)
y_train = data_train.iloc[:, 0].fillna(0).astype(int)
X_test = data_test.iloc[:, 2:14].fillna(0).astype(float)
y_test = data_test.iloc[:, 0].fillna(0).astype(int)


# Base learners

## Decision tree

In [7]:
dtree=tree.DecisionTreeClassifier(criterion='gini', max_depth=15, min_samples_leaf=5,random_state=42)
dtree.fit(X_train, y_train)

## Neutral network

In [8]:
mlp=MLPClassifier(hidden_layer_sizes=(400,200,100,), activation='logistic',  solver='adam', alpha=0.0001, batch_size=40, learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=42, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)
mlp.fit(X_train,y_train)

## logit

In [16]:
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=1000,tol=0.1,random_state=42)
logreg.fit(X_train, y_train)

# Heterogeneous

## Soft voting

In [17]:
#vote soft
vsoft = ensemble.VotingClassifier(estimators=[('dtree',dtree ),('lr', logreg),('mlp',mlp)],voting='soft')
vsoft.fit(X_train,y_train)

In [18]:
evaluation(vsoft,X_train,X_test,y_train,y_test)

train accuracy score:  0.9191279887482419
test accuracy score:  0.6882022471910112
train confusion matrix:
 [[ 15   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4   3   0
    0]
 [  0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
    0]
 [  0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   1   0
    0]
 [  1   0   0   0   0   0   0   4   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   3   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   3   0   0   0   0   0   1   4   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   2   0   0   0   2   1   1   0
    2]
 

## Hard voting

In [19]:
#vote hard
vhard = ensemble.VotingClassifier(estimators=[('dtree',dtree ),('lr', logreg),('mlp',mlp)],voting='hard')
vhard.fit(X_train,y_train)

In [20]:
evaluation(vhard,X_train,X_test,y_train,y_test)

train accuracy score:  0.8509142053445851
test accuracy score:  0.6320224719101124
train confusion matrix:
 [[ 14   0   0   0   0   0   0   0   0   0   0   0   0   0   2   1   4   0
    1]
 [  0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
    0]
 [  0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   1   0
    0]
 [  1   0   0   0   0   1   0   3   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   3   0   0   0   0   0   0   1   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   4   0   0   0   0   0   2   2   0
    0]
 [  0   0   0   0   0   0   0   1   0   0   1   0   0   1   1   0   2   0
    2]
 

## Stacking

In [21]:
stk=ensemble.StackingClassifier(estimators=[('dtree',dtree ),('lr', logreg),('mlp',mlp)])
stk.fit(X_train,y_train)

In [22]:
evaluation(stk,X_train,X_test,y_train,y_test)

train accuracy score:  0.8319268635724332
test accuracy score:  0.702247191011236
train confusion matrix:
 [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  13   5   4
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   0
    1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   5   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   7   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   4   0
    1]
 [

# Homogeneous

## Bagging

In [23]:
bag=ensemble.BaggingClassifier(tree.DecisionTreeClassifier(),random_state=42)
bag.fit(X_train,y_train)

In [24]:
evaluation(bag,X_train,X_test,y_train,y_test)

train accuracy score:  0.9915611814345991
test accuracy score:  0.6882022471910112
train confusion matrix:
 [[ 22   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0
    0]
 

## Random forest

### default

In [25]:
rfd=ensemble.RandomForestClassifier(oob_score=True,random_state=42)
rfd.fit(X_train,y_train)

In [26]:
evaluation(rfd,X_train,X_test,y_train,y_test)

train accuracy score:  1.0
test accuracy score:  0.75
train confusion matrix:
 [[ 22   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0 

### better

In [37]:
best_accuracy=0
for i in [15,20,30,40,50,60,100]:
    for j in [1,3,5,7,9,11,13,15,17]:
        accuracy_rf= []
        rf=ensemble.RandomForestClassifier(oob_score=True,random_state=42,max_depth=i, min_samples_leaf=j)
        rf.fit(X_train, y_train)
        #print('depth=',i,'min sample=',j)
        y_pred=rf.predict(X_test)
        accuracy_rf=acc(1,y_test,y_pred)
        #print(accuracy_rf)
        if best_accuracy<accuracy_rf:
            best_accuracy=acc(1,y_test,y_pred)
            best_depth=i
            best_min_sample=j
            
print('best depth:',best_depth)
print('best min sample:',best_min_sample)
print('best accuracy:',best_accuracy)

best depth: 30
best min sample: 5
best accuracy: 0.7556179775280899


In [73]:
rf=ensemble.RandomForestClassifier(oob_score=True,random_state=42,max_depth=30, min_samples_leaf=5)
rf.fit(X_train,y_train)

In [74]:
evaluation(rf,X_train,X_test,y_train,y_test)

train accuracy score:  0.8277074542897328
test accuracy score:  0.7556179775280899
train confusion matrix:
 [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   5  11   0
    6]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   2   1
    0]
 [  0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   1   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
    1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   1   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   5   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   4   0
    1]
 

## Extra trees

### default

In [38]:
etc=ensemble.ExtraTreesClassifier()
etc.fit(X_train,y_train)

In [39]:
evaluation(etc,X_train,X_test,y_train,y_test)

train accuracy score:  1.0
test accuracy score:  0.7106741573033708
train confusion matrix:
 [[ 22   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0
    0]
 [  0   0   0   

### better

In [41]:
best_accuracy=0
for i in [15,20,30,40,50,60,100]:
    for j in [1,3,5,7,9,11,13,15,17]:
        accuracy_etc= []
        etc=ensemble.ExtraTreesClassifier(random_state=42,max_depth=i, min_samples_leaf=j)
        etc.fit(X_train, y_train)
        #print('depth=',i,'min sample=',j,'n estimator=',k)
        y_pred=etc.predict(X_test)
        accuracy_etc=acc(1,y_test,y_pred)
        #print(accuracy_etc)
        if best_accuracy<accuracy_etc:
            best_accuracy=acc(1,y_test,y_pred)
            best_depth=i
            best_min_sample=j
print('best depth:',best_depth)
print('best min sample:',best_min_sample)
print('best accuracy:',best_accuracy)

best depth: 40
best min sample: 1
best accuracy: 0.7275280898876404


In [46]:
etc=ensemble.ExtraTreesClassifier(random_state=42,max_depth=40, min_samples_leaf=1)
etc.fit(X_train,y_train)

In [47]:
evaluation(etc,X_train,X_test,y_train,y_test)

train accuracy score:  1.0
test accuracy score:  0.7275280898876404
train confusion matrix:
 [[ 22   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0
    0]
 [  0   0   0   

## Adaboost

### Default

In [49]:
adad=ensemble.AdaBoostClassifier(random_state=42)
adad.fit(X_train,y_train)

In [50]:
evaluation(adad,X_train,X_test,y_train,y_test)

train accuracy score:  0.6779184247538678
test accuracy score:  0.651685393258427
train confusion matrix:
 [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1  20   0
    1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   3   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   7   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   7   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   7   0
    0]
 [

### better

In [51]:
best_accuracy=0
for i in [3,5,10,15,20,30,40,50,60]:
    for j in [1,2,3,5,7,9,11,13,15,17]:
        accuracy_ada= []
        ada=ensemble.AdaBoostClassifier(random_state=42,estimator=tree.DecisionTreeClassifier(criterion='gini', max_depth=i, min_samples_leaf=j))
        ada.fit(X_train, y_train)
        #print('depth=',i,'min sample=',j)
        y_pred=ada.predict(X_test)
        accuracy_ada=acc(1,y_test,y_pred)
        #print(accuracy_ada)
        if best_accuracy<accuracy_ada:
            best_accuracy=accuracy_ada
            best_depth=i
            best_min_sample=j
print('best depth:',best_depth)
print('best min sample:',best_min_sample)
print('best accuracy:',best_accuracy)

best depth: 20
best min sample: 17
best accuracy: 0.7471910112359551


In [75]:
ada=ensemble.AdaBoostClassifier(random_state=42,estimator=tree.DecisionTreeClassifier(criterion='gini', max_depth=20, min_samples_leaf=17))
ada.fit(X_train,y_train)

In [76]:
evaluation(ada,X_train,X_test,y_train,y_test)

train accuracy score:  0.9486638537271449
test accuracy score:  0.7471910112359551
train confusion matrix:
 [[ 19   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   0
    1]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   6   0   0   0   0   0   0   2   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   7   0   0   0   1   0   0   0
    0]
 

## GBDT

### default

In [54]:
gbdtd=ensemble.GradientBoostingClassifier(random_state=42)
gbdtd.fit(X_train,y_train)

In [55]:
evaluation(gbdtd,X_train,X_test,y_train,y_test)

train accuracy score:  0.9416315049226441
test accuracy score:  0.7050561797752809
train confusion matrix:
 [[ 22   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0
    0]
 

### better

In [56]:
best_accuracy=0
for i in [10,20,30,40]:
    for j in [1,5,9,13,17]:
        accuracy_gbdt=[]
        gbdt=ensemble.GradientBoostingClassifier(random_state=42,max_depth=i, min_samples_leaf=j)
        gbdt.fit(X_train, y_train)
        y_pred=gbdt.predict(X_test)
        accuracy_gbdt=acc(1,y_test,y_pred)
        print('depth=',i,'min sample=',j,':',accuracy_gbdt)
        if best_accuracy<accuracy_gbdt:
            best_accuracy=acc(1,y_test,y_pred)
            best_depth=i
            best_min_sample=j

depth= 10 min sample= 1 : 0.699438202247191
depth= 10 min sample= 5 : 0.7050561797752809
depth= 10 min sample= 9 : 0.7050561797752809
depth= 10 min sample= 13 : 0.7191011235955056
depth= 10 min sample= 17 : 0.6938202247191011
depth= 20 min sample= 1 : 0.648876404494382
depth= 20 min sample= 5 : 0.7050561797752809
depth= 20 min sample= 9 : 0.7134831460674157
depth= 20 min sample= 13 : 0.699438202247191
depth= 20 min sample= 17 : 0.6825842696629213
depth= 30 min sample= 1 : 0.601123595505618
depth= 30 min sample= 5 : 0.6966292134831461
depth= 30 min sample= 9 : 0.7078651685393258
depth= 30 min sample= 13 : 0.6910112359550562
depth= 30 min sample= 17 : 0.6853932584269663
depth= 40 min sample= 1 : 0.598314606741573
depth= 40 min sample= 5 : 0.699438202247191
depth= 40 min sample= 9 : 0.6966292134831461
depth= 40 min sample= 13 : 0.7050561797752809
depth= 40 min sample= 17 : 0.7162921348314607


In [57]:
print('best depth:',best_depth)
print('best min sample:',best_min_sample)
print('best accuracy:',best_accuracy)

best depth: 10
best min sample: 13
best accuracy: 0.7191011235955056


In [58]:
gbdt=ensemble.GradientBoostingClassifier(random_state=42,max_depth=10, min_samples_leaf=13)
gbdt.fit(X_train,y_train)

In [59]:
evaluation(gbdt,X_train,X_test,y_train,y_test)

train accuracy score:  1.0
test accuracy score:  0.7191011235955056
train confusion matrix:
 [[ 22   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   7   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0   0
    0]
 [  0   0   0   

# Classification & evaluation(final)

In [408]:
#rating
rating=dict()
rating[1]=[1,2,3]#C CC CCC
rating[2]=[4,5,6]#B
rating[3]=[7,8,9]#BB
rating[4]=[10,11,12]#BBB
rating[5]=[13,14,15]#A
rating[6]=[16,17]#AA
rating[7]=[18,19]#AAA

In [None]:
'''#rating
rating=dict()
rating[1]=[1,2,3]#C CC CCC
rating[2]=[4,5,6,7,8,9,10,11,12]#B BB BBB
rating[3]=[13,14,15]#A
rating[4]=[16,17]#AA
rating[5]=[18,19]#AAA'''

In [405]:
#classification
def cla(y):
    cl=[]
    for i in range(len(y)):
        for r in list(rating.keys()):
            if y[i] in rating[r]:
                c=r
        cl.append(c)
    return(cl)

In [324]:
#weighted accuracy(not tol but classification)
def wcacc(label,pred):
    correct=list()
    total=0
    for i in range(len(label)):
        for r in list(rating.keys()):
            if list(label)[i]==r:
                if r==1:
                    t=50*(list(pred)[i]==r)
                    total+=10
                else:
                    t=1*(list(pred)[i]==r)
                    total+=1
        correct.append(t)
    c=np.array(correct).astype('int')
    m=np.sum(c)/total
    return m

In [279]:
#punish
def pcacc(label,pred):
    correct=list()
    for i in range(len(label)):
        for r in list(rating.keys()):
            if list(label)[i]==r:
                if list(pred)[i]==r:
                    t=1
                elif abs(list(label)[i]-list(pred)[i])==1:
                    t=0.1
                elif abs(list(label)[i]-list(pred)[i])>=2:
                    t=1/100000000000000000000000000000000000000000000000
                else:
                    t=1/1000000000000000000000000
        correct.append(t)
    c=np.array(correct).astype('int')
    m=np.mean(c)
    return m

In [349]:
#weighted+punish
def pwacc(label,pred):
    correct=list()
    for i in range(len(label)):
        for r in list(rating.keys()):
            if r==1:
                if list(label)[i]==1:
                    if list(pred)[i]==1:
                        t=30
                    elif list(pred)[i]>=5:
                        t=-100
                    else:
                        t=0
            else:
                if list(label)[i]==r:
                    if list(pred)[i]==r:
                        t=0
                    elif list(pred)[i]==1:
                        t=0
                    else:
                        t=0
        correct.append(t)
    c=np.array(correct).astype('int')
    m=np.mean(c)
    return m

In [452]:
# c-accuracy
def cc(label,pred):
    correct=list()
    for i in range(len(label)):
        if list(label)[i]==1:
            if list(pred)[i]==1:
                t=10
            elif list(pred)[i] in [2,3,4]:
                t=0
            else:
                t=-1
            correct.append(t)
    c=np.array(correct).astype('int')
    m=np.mean(c)
    return m

In [453]:
# c-a
def cnoa(label,pred):
    correct=list()
    for i in range(len(label)):
        if list(label)[i]==1:
            if list(pred)[i]>=5:
                t=-1
            elif list(pred)[i] in [2,3,4]:
                t=-0.5
            else:
                t=0
            correct.append(t)
    c=np.array(correct).astype('int')
    m=np.mean(c)
    return m

In [293]:
def cevaluation(model,x_train,x_test,y_train,y_test):
    cy_train=cla(y_train)
    cy_test=cla(y_test)
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    #train score
    print('train accuracy score: ',metrics.accuracy_score(cy_train,y_train_pred))
    print('balanced train accuracy score: ',metrics.balanced_accuracy_score(cy_train,y_train_pred))

    #test score
    print('test accuracy score: ',metrics.accuracy_score(cy_test,y_test_pred))
    print('balanced test accuracy score: ',metrics.balanced_accuracy_score(cy_test,y_test_pred))

    #confusion matrix
    print('train confusion matrix:\n',metrics.confusion_matrix(cy_train,y_train_pred))
    print('test confusion matrix:\n',metrics.confusion_matrix(cy_test,y_test_pred))


In [311]:
def ccm(model,x_train,x_test,y_train,y_test):
    cy_train=cla(y_train)
    cy_test=cla(y_test)
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    #confusion matrix
    print('train confusion matrix:\n',metrics.confusion_matrix(cy_train,y_train_pred))
    print('test confusion matrix:\n',metrics.confusion_matrix(cy_test,y_test_pred))


# Final model

In [409]:
l=list(rating.keys())
c=Counter(np.array(cla(y_train)))
w={}
for i in dict(c).keys():
    w[i]=100/c[i]
print(w)

{6: 0.14925373134328357, 7: 0.20618556701030927, 5: 0.5291005291005291, 3: 7.142857142857143, 4: 3.5714285714285716, 1: 3.5714285714285716, 2: 12.5}


In [459]:
best_accuracy=-1000*c[1]
for i1 in [20,30]:
    for i2 in [30,35]:
        for i3 in [10,20]:
            for i4 in [15,20]:
                for i5 in [5,10]:
                    accuracy_rf= []
                    rf=ensemble.RandomForestClassifier(oob_score=True,random_state=42,max_depth=100, min_samples_leaf=20,\
                                                    class_weight={1:i1,2:i2,3:i3,4:i4,5:i5})
                    rf.fit(X_train, cla(y_train))
                    y_pred=rf.predict(X_test)
                    accuracy_rf=cnoa(cla(y_test),y_pred)
                    if best_accuracy<accuracy_rf:
                        best_accuracy=accuracy_rf
                        best_i1=i1
                        best_i2=i2
                        best_i3=i3
                        best_i4=i4
                        best_i5=i5
print('best i1:',best_i1)
print('best i2:',best_i2)
print('best i3:',best_i3)
print('best i4:',best_i4)
print('best i5:',best_i5)


print('best accuracy:',best_accuracy)

best i1: 30
best i2: 30
best i3: 10
best i4: 15
best i5: 5
best accuracy: -0.2


In [460]:
rf=ensemble.RandomForestClassifier(oob_score=True,random_state=42,max_depth=100, min_samples_leaf=20,\
                                   class_weight={1:30,2:30,3:10,4:15,5:5,6:1,7:1})
rf.fit(X_train,cla(y_train))

In [461]:
ccm(rf,X_train,X_test,y_train,y_test)

train confusion matrix:
 [[ 27   0   0   0   1   0   0]
 [  1   6   0   0   1   0   0]
 [  2   2   0   0   4   5   1]
 [  3   2   0  13   9   1   0]
 [ 18   6   0   0 155   6   4]
 [104  11   0   7 257 235  56]
 [ 49   6   0   4  61  85 280]]
test confusion matrix:
 [[ 7  1  0  0  1  1  0]
 [ 1  0  0  0  0  0  1]
 [ 2  0  0  0  0  0  0]
 [ 2  0  0  0  2  1  1]
 [ 7  3  0  0 35  6  4]
 [27  5  0  1 69 49 21]
 [20  3  0  0 13 14 59]]


# Output

In [470]:
norate=pd.read_csv('nonono.csv',encoding='gbk')
Xoutput=norate.iloc[:,4:16].fillna(0).astype(float).to_numpy() 
idoutput=norate.iloc[:,0].to_numpy()
nameoutput=norate.iloc[:,1].to_numpy()

In [471]:
finalmodel=rf
y_pred_output=finalmodel.predict(Xoutput)


In [472]:
name='finalpred'+'.csv'
with open(name,"w",newline='') as csvfile: 
    writer = csv.writer(csvfile)
    writer.writerow(["id","name","pred"])
    for i in range(0,len(y_pred_output)):
        writer.writerow([idoutput[i],nameoutput[i],y_pred_output[i]])