In [98]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [99]:
def load_data():
    global y_train
    global x_train
    global x_test
    print("Load the data.")
    df_x_train = pd.read_csv('X_train.csv',header = 0 ,index_col=0)
    df_y_train = pd.read_csv('y_train.csv',header = 0 ,index_col=0)
    df_x_test = pd.read_csv('X_test.csv',header = 0 ,index_col=0)
    x_train = df_x_train.values
     
    y_train = df_y_train['y'].values
    x_test = df_x_test.values

    # standardization
    print('Standardize the data.')
    scaler = preprocessing.StandardScaler().fit(x_train)
    
    x_train =scaler.transform(x_train)
    
    x_test = scaler.transform(x_test)

In [100]:
# print('Check training data.')
# print(df_x_train.head())
# print(df_x_train.shape)
# print('There are %d NAN values.'%np.sum(np.isnan(x_train)))
# print(df_y_train.groupby('y').size())



In [101]:
# print("Split data into training and validation sets.")
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2,stratify=y_train)

In [102]:
#feature selection
from sklearn.feature_selection import SelectFromModel, SelectKBest,f_classif,mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression

def feature_selection(x_train, y_train, x_test, method="rf", max_features=200):
    if(method == 'rf'):
        sel = SelectFromModel(RandomForestClassifier(n_estimators=100,class_weight='balanced'), threshold=-np.inf, max_features=max_features)
        sel.fit(x_train, y_train)
        indicator = sel.get_support()
        index = np.where(indicator==False)[0]

        print("select %d features!" % (indicator.shape[0] - index.shape[0]))
        x_train_fs = np.delete(x_train, index, axis=1)
        x_test_fs = np.delete(x_test, index, axis=1)
        return x_train_fs, x_test_fs
    
    elif(method == 'xgb'):
        sel = SelectFromModel(XGBRegressor(n_estimators=100, learning_rate=0.1, gamma=0.01, subsample=0.8, colsample_bytree=1, min_child_weight=5, max_depth=3), threshold=-np.inf, max_features=max_features)
        sel.fit(x_train, y_train)
        indicator = sel.get_support()
        index = np.where(indicator==False)[0]

        print("select %d features!" % (indicator.shape[0] - index.shape[0]))
        x_train_fs = np.delete(x_train, index, axis=1)
        x_test_fs = np.delete(x_test, index, axis=1)
        return x_train_fs, x_test_fs

    elif(method == 'freg'):
        #x_train = SelectKBest(f_regression, k=200).fit_transform(x_train, y_train)
        sel = SelectKBest(f_classif, k=max_features)
        sel.fit(x_train, y_train)
        indicator = sel.get_support()
        index = np.where(indicator==False)[0]

        print("select %d features!" % (indicator.shape[0] - index.shape[0]))
        x_train_fs = np.delete(x_train, index, axis=1)
        x_test_fs = np.delete(x_test, index, axis=1)
        return x_train_fs, x_test_fs
    
    elif(method == 'mir'):
        sel = SelectKBest(mutual_info_classif, k=max_features)
        sel.fit(x_train, y_train)
        indicator = sel.get_support()
        index = np.where(indicator==False)[0]

        print("select %d features!" % (indicator.shape[0] - index.shape[0]))
        x_train_fs = np.delete(x_train, index, axis=1)
        x_test_fs = np.delete(x_test, index, axis=1)
        return x_train_fs, x_test_fs
    
    elif(method == 'logr'):
        sel = SelectFromModel(LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000))
        sel.fit(x_train, y_train)
        indicator = sel.get_support()
        index = np.where(indicator==False)[0]

        print("select %d features!" % (indicator.shape[0] - index.shape[0]))
        x_train_fs = np.delete(x_train, index, axis=1)
        x_test_fs = np.delete(x_test, index, axis=1)
        return x_train_fs, x_test_fs

In [103]:
# remove outliers
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
def outlier_detection(x_train, y_train, method='isoforest'):
    if(method == 'isoforest'):
        rng = np.random.RandomState(42)
        clf = IsolationForest(behaviour='new', max_samples=1000,
                          random_state=rng, 
                              contamination='auto')
        clf.fit(x_train)
        indicator = clf.predict(x_train)
        index = np.where(indicator == -1)[0]
        print("detect %d outliers in training set!" % (index.shape[0]))

        x_train_clean = np.delete(x_train, index, axis=0)
        y_train_clean = np.delete(y_train, index, axis=0)

        return x_train_clean, y_train_clean
    
    elif(method == 'lof'):
        clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
        clf.fit(x_train)
        indicator = clf.fit_predict(x_train)
        index = np.where(indicator == -1)[0]
        print("detect %d outliers in training set!" % (index.shape[0]))

        x_train_clean = np.delete(x_train, index, axis=0)
        y_train_clean = np.delete(y_train, index, axis=0)
        return x_train_clean,y_train_clean
    


In [104]:
# SVC
from sklearn.svm import SVC
# clf = SVC(C=3.0,class_weight=None, 
#     decision_function_shape='ovr',  gamma='auto',
#     max_iter=-1, probability=False, shrinking=True,
#     tol=0.001, verbose=False)
# clf.fit(x_train, y_train)
# y_val_p = clf.predict(x_val)
# val_result = balanced_accuracy_score(y_val_p,y_val)
# print(val_result)
# y_pred = clf.predict(fs_x_test)
# print(y_pred)

In [105]:
# subsampling

from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
n=20
for i in range(n):
    load_data()
    np.random.seed(i)
#     model_smote = SMOTE()
#     x_over_train, y_over_train = model_smote.fit_sample(x_train,y_train)

    model_RandomUnderSampler = RandomUnderSampler() 
    x_down_train, y_down_train =model_RandomUnderSampler.fit_sample(x_train,y_train)
#     print('After over-sampling:\n',pd.DataFrame(y_over_train,columns=['y']).groupby('y').size())
    print('After down-sampling:\n',pd.DataFrame(y_down_train,columns=['y']).groupby('y').size())

    fs_x_down_train,fs_x_test = feature_selection(x_down_train,y_down_train,x_test,"logr",500)
    x_train, x_val, y_train, y_val = train_test_split(fs_x_down_train, y_down_train, test_size=0.005)
    x_train,y_train = outlier_detection(x_train,y_train,method='isoforest')


    clf = SVC(C=3.0,class_weight=None, 
    decision_function_shape='ovr',  gamma='auto',
    max_iter=-1, probability=False, shrinking=True,
    tol=0.001, verbose=False,random_state=i)
    clf.fit(x_train, y_train)
    y_val_p = clf.predict(x_val)
    val_result = balanced_accuracy_score(y_val_p,y_val)
    print(val_result)
    y_pred = clf.predict(fs_x_test)
    print(y_pred)
    
    f = open("submission_svc{0:.1f}.csv".format(i), "w")
    f.write("id,y\n")
    for i,x in enumerate(y_pred):
        f.write("{},{}\n".format(i,x))
    f.close()
    

Load the data.
Standardize the data.
After down-sampling:
 y
0    600
1    600
2    600
dtype: int64
select 450 features!
detect 84 outliers in training set!
0.75
[1 0 1 ... 1 0 2]
Load the data.
Standardize the data.
After down-sampling:
 y
0    600
1    600
2    600
dtype: int64
select 457 features!
detect 76 outliers in training set!
0.6666666666666666
[1 0 1 ... 1 0 2]
Load the data.
Standardize the data.
After down-sampling:
 y
0    600
1    600
2    600
dtype: int64
select 465 features!
detect 73 outliers in training set!
0.7222222222222222
[1 0 1 ... 1 0 1]
Load the data.
Standardize the data.
After down-sampling:
 y
0    600
1    600
2    600
dtype: int64
select 442 features!
detect 62 outliers in training set!
0.7777777777777777
[1 0 1 ... 1 0 2]
Load the data.
Standardize the data.
After down-sampling:
 y
0    600
1    600
2    600
dtype: int64
select 443 features!
detect 82 outliers in training set!
0.7666666666666666
[1 0 1 ... 1 0 1]
Load the data.
Standardize the data.
Af



0.75
[1 0 1 ... 1 0 2]
Load the data.
Standardize the data.
After down-sampling:
 y
0    600
1    600
2    600
dtype: int64
select 455 features!
detect 59 outliers in training set!
0.7777777777777777
[1 0 1 ... 1 0 2]


In [106]:


result = [list() for i in range(4100)]


for i in range(n):
    f = "submission_svc%d.0.csv" %(i)
    f = open(f)
    lines = f.readlines()
    for l in lines[1:]:
        l = l.strip().split(',')
        idx, val = int(l[0]), int(float(l[1]))
        result[idx].append(val)
    f.close()
print(result[:20])


def vote(x):
    c = [0] * 3
    for i in x:
        c[i] += 1
    c = [(0, c[0]), (1, c[1]), (2, c[2])]
    c.sort(key = lambda x: x[1], reverse = True)
    if c[0][1] > c[1][1]:
        return c[0][0]
    else:
        if c[0][0] == 1 or c[1][0] == 1:
            return 1
        else:
            return np.random.choice([c[0][0], c[1][0]])
print(vote([1,2,0,0,0]))


with open("voted.csv", "w") as f:
    f.write("id,y\n")
    for i in range(4100):
        f.write("{},{}\n".format(i, vote(result[i])))
    f.close()


[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 