In [51]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd 
import random
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn.svm import SVC 
from imblearn.over_sampling import RandomOverSampler

In [52]:
# Load the training set and encode the two categorical columns as integers.
df = pd.read_csv('train.csv')
df['Geography'] = df['Geography'].map({'S0':0, 'S1':1, 'S2':2})
df['Gender'] = df['Gender'].map({'Male':0, 'Female':1})

# Notebook-level handles on every column; only a subset is used as model input.
CreditScore = df['CreditScore']
Geography = df['Geography']
Gender = df['Gender']
Age = df['Age']
Tenure = df['Tenure']
Balance = df['Balance']
NumOfProducts = df['NumOfProducts']
HasCrCard = df['HasCrCard']
IsActiveMember = df['IsActiveMember']
EstimatedSalary = df['EstimatedSalary']
Exited = df['Exited']

#define selected features
features = (Geography, Age, Tenure, NumOfProducts, HasCrCard, \
            IsActiveMember)
feature_name = ('Geography', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', \
            'IsActiveMember')

#preprocessing features
# Scale each selected column on its own; every entry of scale_feature is an
# (n_samples, 1) float array keyed by the feature name.
scale_feature = {}
for name, column in zip(feature_name, features):
    column_values = np.array(column).astype(float)
    scale_feature[name] = preprocessing.scale(column_values.reshape(-1, 1))

# Put the scaled columns side by side in feature_name order. This replaces an
# element-by-element fill that assigned a shape-(1,) array into a single cell,
# which relies on NumPy's deprecated size-1-array-to-scalar conversion.
X = np.hstack([scale_feature[name] for name in feature_name])

# Target vector: the 'Exited' column as a plain NumPy array.
y = np.array(Exited.tolist())

In [53]:
#find the dirty data
# Fit an SVM on several random 80% splits and count, for every row whose true
# label is 0, how often the model predicts it as 1. Rows flagged at least
# `error_time` times end up in delete_index.
# NOTE: train_test_split is called without random_state, so the flagged count
# printed below can differ from run to run.
num_iter = 5
error_time = 5
class_add_list = [0, 2, 3, 4]
error_counts = np.zeros(len(y), dtype=int)

# The loop variable is deliberately not called `iter`: at notebook level that
# name would replace the builtin iter() for every cell executed afterwards.
for round_no in range(num_iter):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) 

    # Relabel the training targets: class-0 rows are dealt round-robin onto the
    # surrogate labels in class_add_list, every other row gets label 1.
    is_zero = np.asarray(y_train) == 0
    y_more_class = np.ones(len(y_train), dtype=int)
    surrogate_pos = np.arange(int(is_zero.sum())) % len(class_add_list)
    y_more_class[is_zero] = np.asarray(class_add_list)[surrogate_pos]
    y_train = y_more_class

    svclassifier = SVC(kernel='rbf', gamma=1.0, C=1.0) 
    svclassifier.fit(X_train, y_train) 

    # Predict on the full data set, then fold every surrogate label back to 0.
    y_pred = svclassifier.predict(X)  
    y_pred[y_pred != 1] = 0

    # Count only true class-0 rows that were predicted as class 1.
    error_counts += (y_pred != y) & (y == 0)

# Per-row misclassification counts, kept as a plain list.
error_index = error_counts.tolist()

# Positional indices of the rows flagged often enough to be considered dirty.
delete_index = [row for row, n_errors in enumerate(error_index)
                if n_errors >= error_time]
print(len(delete_index))

531


In [54]:
#delete the data
# Re-read the raw training file and apply the same categorical encodings.
df = pd.read_csv('train.csv').assign(
    Geography=lambda frame: frame['Geography'].map({'S0':0, 'S1':1, 'S2':2}),
    Gender=lambda frame: frame['Gender'].map({'Male':0, 'Female':1}),
)

# Shuffle the flagged row labels in place, then drop only the first half of
# them (integer division rounds down for an odd count).
random.shuffle(delete_index)
n = len(delete_index) // 2
to_delete_index = delete_index[:n]
df = df.drop(index=to_delete_index)
len(df)

7735

In [55]:
#preprocess again
# Rebuild the column handles, the scaled feature matrix X and the target y
# from the current contents of df.
CreditScore = df['CreditScore']
Geography = df['Geography']
Gender = df['Gender']
Age = df['Age']
Tenure = df['Tenure']
Balance = df['Balance']
NumOfProducts = df['NumOfProducts']
HasCrCard = df['HasCrCard']
IsActiveMember = df['IsActiveMember']
EstimatedSalary = df['EstimatedSalary']
Exited = df['Exited']

features = (Geography, Age, Tenure, NumOfProducts, HasCrCard, \
            IsActiveMember)
feature_name = ('Geography', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', \
            'IsActiveMember')

# Scale each selected column on its own; every entry of scale_feature is an
# (n_samples, 1) float array keyed by the feature name.
scale_feature = {}
for name, column in zip(feature_name, features):
    column_values = np.array(column).astype(float)
    scale_feature[name] = preprocessing.scale(column_values.reshape(-1, 1))

# Put the scaled columns side by side in feature_name order. This replaces an
# element-by-element fill that assigned a shape-(1,) array into a single cell,
# which relies on NumPy's deprecated size-1-array-to-scalar conversion.
X = np.hstack([scale_feature[name] for name in feature_name])

# Target vector: the 'Exited' column as a plain NumPy array.
y = np.array(Exited.tolist())

In [65]:
#training
# Single train/evaluate pass: fit the SVM on a random 80% split and report
# precision/recall on the remaining 20%.
for run in range(1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) 

    # Class-0 training rows are dealt round-robin onto the surrogate labels in
    # class_add_list; every other row is labelled 1.
    class_add_list = [0, 2, 3, 4]
    zero_rows = np.asarray(y_train) == 0
    y_more_class = np.ones(len(y_train), dtype=int)
    cycle_pos = np.arange(int(zero_rows.sum())) % len(class_add_list)
    y_more_class[zero_rows] = np.asarray(class_add_list)[cycle_pos]
    y_train = y_more_class

    svclassifier = SVC(kernel='rbf', gamma=1.0, C=1.0) 
    svclassifier.fit(X_train, y_train) 

    # Fold every predicted surrogate label back to 0 before scoring.
    y_pred = svclassifier.predict(X_test)  
    y_pred[y_pred != 1] = 0
    print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1234
           1       0.60      0.66      0.63       313

   micro avg       0.84      0.84      0.84      1547
   macro avg       0.75      0.77      0.76      1547
weighted avg       0.85      0.84      0.84      1547



In [66]:
#load the test data and preprocess 
df_test = pd.read_csv('test.csv')
df_test['Geography'] = df_test['Geography'].map({'S0':0, 'S1':1, 'S2':2})
df_test['Gender'] = df_test['Gender'].map({'Male':0, 'Female':1})

# Notebook-level handles now point at the test-set columns.
CreditScore = df_test['CreditScore']
Geography = df_test['Geography']
Gender = df_test['Gender']
Age = df_test['Age']
Tenure = df_test['Tenure']
Balance = df_test['Balance']
NumOfProducts = df_test['NumOfProducts']
HasCrCard = df_test['HasCrCard']
IsActiveMember = df_test['IsActiveMember']
EstimatedSalary = df_test['EstimatedSalary']

features = (Geography, Age, Tenure, NumOfProducts, HasCrCard, \
            IsActiveMember)
feature_name = ('Geography', 'Age', 'Tenure', 'NumOfProducts', 'HasCrCard', \
            'IsActiveMember')

# Standardise the test columns with the mean/std of the TRAINING frame `df`,
# not with the test set's own statistics: the SVM was fitted on features
# centred and scaled by the training data, so the test rows must go through
# the same transformation to land in the same feature space.
# NOTE(review): assumes `df` is still the filtered training frame that X was
# built from in the earlier preprocessing cell - confirm when re-ordering cells.
train_values = np.array(df[list(feature_name)]).astype(float)
test_values = np.array(df_test[list(feature_name)]).astype(float)
scaler = preprocessing.StandardScaler().fit(train_values)
X_final = scaler.transform(test_values)

# Per-feature (n_samples, 1) views of the scaled test matrix, keyed by name.
scale_feature = {name: X_final[:, [col]] for col, name in enumerate(feature_name)}

In [68]:
#rewrite the upload file
# y_final is not assigned by any cell shown in this notebook, so on a fresh
# kernel this cell would stop with a NameError. Build it here from the fitted
# classifier and the scaled test matrix, folding every surrogate label back to
# 0 the same way the training cell does for y_pred.
# NOTE(review): presumably this matches what the missing cell did - confirm.
y_final = svclassifier.predict(X_final)
y_final[y_final != 1] = 0

# Fill the sample submission with the predictions and write it out.
df_sample = pd.read_csv('sample_upload.csv')
df_sample['Exited'] = y_final
df_sample.to_csv('to_upload.csv', index=False, sep=',')

# Read the written file back and show its first rows as a sanity check.
df_check = pd.read_csv('to_upload.csv')
df_check[:20]

Unnamed: 0.1,Unnamed: 0,RowNumber,Exited
0,0,2209,0
1,1,9924,0
2,2,4617,0
3,3,6077,0
4,4,9240,1
5,5,4834,0
6,6,8523,0
7,7,2826,0
8,8,871,0
9,9,6698,0
