In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

from sklearn import preprocessing

In [0]:
data = pd.read_csv('./equip_failures_training_set.csv')

In [0]:
data = data.replace('na', np.nan) 
data = data.astype(float)
    
X = data.iloc[:,2:]
y = data.target

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state = 0)

In [0]:
mean = X_train.mean()
X_train = X_train.fillna(mean)
X_test = X_test.fillna(mean)

In [0]:
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
#Random Forest
clf_rf = RandomForestClassifier(n_estimators = 500, min_samples_split = 100)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
train_pred_rf = clf_rf.predict(X_train)
test_pred_rf = clf_rf.predict(X_test)

In [0]:
f1_train_rf = f1_score(y_train, train_pred_rf)
f1_test_rf = f1_score(y_test, test_pred_rf)
print(f1_train_rf, f1_test_rf)

0.7704447632711622 0.770949720670391


In [0]:
train_prob_rf = clf_rf.predict_proba(X_train)[:,1]
test_prob_rf = clf_rf.predict_proba(X_test)[:,1]

In [0]:
th = np.linspace(0, 1, 21)
f_score_train = []
f_score_test = []
for t in th:
  pred_train = np.zeros_like(train_prob_rf)
  pred_test = np.zeros_like(test_prob_rf)
  pred_train[train_prob_rf > t] = 1
  pred_test[test_prob_rf > t] = 1
  f_score_train.append(f1_score(y_train, pred_train))
  f_score_test.append(f1_score(y_test, pred_test))
pd.DataFrame({'threshold': th, 'f1_train': f_score_train, 'f1_test': f_score_test})

  'precision', 'predicted', average, warn_for)


Unnamed: 0,threshold,f1_train,f1_test
0,0.0,0.051371,0.051322
1,0.05,0.512672,0.468447
2,0.1,0.606107,0.548851
3,0.15,0.667531,0.602564
4,0.2,0.709708,0.643357
5,0.25,0.761062,0.693642
6,0.3,0.790206,0.736402
7,0.35,0.811869,0.747706
8,0.4,0.811594,0.764706
9,0.45,0.80914,0.782152


In [0]:
th = np.linspace(0.4, 0.6, 21)
f_score_train = []
f_score_test = []
for t in th:
  pred_train = np.zeros_like(train_prob_rf)
  pred_test = np.zeros_like(test_prob_rf)
  pred_train[train_prob_rf > t] = 1
  pred_test[test_prob_rf > t] = 1
  f_score_train.append(f1_score(y_train, pred_train))
  f_score_test.append(f1_score(y_test, pred_test))
pd.DataFrame({'threshold': th, 'f1_train': f_score_train, 'f1_test': f_score_test})

Unnamed: 0,threshold,f1_train,f1_test
0,0.4,0.811594,0.764706
1,0.41,0.813299,0.766169
2,0.42,0.817594,0.76962
3,0.43,0.811594,0.765306
4,0.44,0.812,0.770026
5,0.45,0.80914,0.782152
6,0.46,0.801365,0.784
7,0.47,0.795595,0.780488
8,0.48,0.787115,0.786885
9,0.49,0.776437,0.782369


In [0]:
#Logistic Rregression
clf_lr = LogisticRegression(random_state=0, class_weight = 'balanced')
clf_lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
train_pred_lr = clf_lr.predict(X_train)
test_pred_lr = clf_lr.predict(X_test)

In [0]:
f1_train_lr = f1_score(y_train, train_pred_lr)
f1_test_lr = f1_score(y_test, test_pred_lr)
print(f1_train_lr, f1_test_lr)

0.5588020452885318 0.5084269662921348


In [0]:
#Adaboost Classifier
clf_boost = AdaBoostClassifier(n_estimators = 200, random_state = 0, learning_rate = 0.5)
clf_boost.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5,
                   n_estimators=200, random_state=0)

In [0]:
train_pred_boost = clf_boost.predict(X_train)
test_pred_boost = clf_boost.predict(X_test)

In [0]:
f1_train_boost = f1_score(y_train, train_pred_boost)
f1_test_boost = f1_score(y_test, test_pred_boost)
print(f1_train_boost, f1_test_boost)

In [0]:
train_prob_boost = clf_boost.predict_proba(X_train)[:,1]
test_prob_boost = clf_boost.predict_proba(X_test)[:,1]

In [0]:
th = np.linspace(0, 1, 21)
f_score_train = []
f_score_test = []
for t in th:
  pred_train = np.zeros_like(train_prob_boost)
  pred_test = np.zeros_like(test_prob_boost)
  pred_train[train_prob_boost > t] = 1
  pred_test[test_prob_boost > t] = 1
  f_score_train.append(f1_score(y_train, pred_train))
  f_score_test.append(f1_score(y_test, pred_test))
pd.DataFrame({'threshold': th, 'f1_train': f_score_train, 'f1_test': f_score_test})

  'precision', 'predicted', average, warn_for)


Unnamed: 0,threshold,f1_train,f1_test
0,0.0,0.032787,0.032787
1,0.05,0.032787,0.032787
2,0.1,0.032787,0.032787
3,0.15,0.032787,0.032787
4,0.2,0.032787,0.032787
5,0.25,0.032787,0.032787
6,0.3,0.032787,0.032787
7,0.35,0.032787,0.032787
8,0.4,0.032803,0.032803
9,0.45,0.038086,0.03807


In [0]:
th = np.linspace(0.4, 0.6, 21)
f_score_train = []
f_score_test = []
for t in th:
  pred_train = np.zeros_like(train_prob_boost)
  pred_test = np.zeros_like(test_prob_boost)
  pred_train[train_prob_boost > t] = 1
  pred_test[test_prob_boost > t] = 1
  f_score_train.append(f1_score(y_train, pred_train))
  f_score_test.append(f1_score(y_test, pred_test))
pd.DataFrame({'threshold': th, 'f1_train': f_score_train, 'f1_test': f_score_test})

  'precision', 'predicted', average, warn_for)


Unnamed: 0,threshold,f1_train,f1_test
0,0.4,0.032803,0.032803
1,0.41,0.032808,0.032806
2,0.42,0.032808,0.032806
3,0.43,0.032998,0.032992
4,0.44,0.036894,0.036897
5,0.45,0.038086,0.03807
6,0.46,0.038115,0.038088
7,0.47,0.055687,0.056342
8,0.48,0.215488,0.218404
9,0.49,0.544887,0.497409


In [0]:
th

array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])

In [0]:
clf_boost_2 = AdaBoostClassifier(base_estimator = n_estimators = 100)

In [0]:
#SVM Classifier
clf_svm = SVC(C = 100, kernel = 'linear', shrinking = False, gamma= 'auto')
clf_svm.fit(X_train, y_train)

In [0]:
train_pred_svm = clf_svm.predict(X_train)
test_pred_svm = clf_svm.predict(X_test)

In [0]:
f1_train_svm = f1_score(y_train, train_pred_svm)
f1_test_svm = f1_score(y_test, test_pred_svm)
print(f1_train_svm, f1_test_svm)

0.9854153455928979 0.706766917293233


In [0]:
test = pd.read_csv('./equip_failures_test_set.csv')

In [0]:
test = test.replace('na', np.nan) 
test = test.astype(float)
    
test = test.iloc[:,1:]

test = test.fillna(mean)

test = scaler.transform(test)

In [0]:
test_p = clf_rf.predict(test)
sub1 = pd.DataFrame({'id': list(range(1,len(test_p)+1)), 'target': np.array(test_p, dtype  = np.int)})
sub1.to_csv('./submission1.csv', index = False)

In [0]:
test_p = clf_boost.predict(test)
sub2 = pd.DataFrame({'id': list(range(1,len(test_p)+1)), 'target': np.array(test_p, dtype  = np.int)})
sub2.to_csv('./submission2.csv', index = False)