In [11]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

from sklearn import preprocessing

In [8]:
def load_object(filename):
    """Return the Python object stored in the pickle file at ``filename``.

    The file is opened in binary read mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only pass paths to files
    that come from a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [3]:
# Read the raw training set into a DataFrame. The preprocessing cell below
# replaces the string 'na' with NaN and casts everything to float, so some
# columns presumably load as text at this point (TODO confirm against the CSV).
data = pd.read_csv('./data/equip_failures_training_set.csv')

## Preprocessing

In [4]:
# Turn the literal string 'na' into a real missing value, then cast every
# column to float so the whole table is numeric.
data = data.replace('na', np.nan).astype(float)

# Features are every column from position 2 onward; the label is the `target`
# column. NOTE(review): this assumes the first two columns are non-features
# (presumably an id and the target) - confirm against the CSV header.
X = data.iloc[:, 2:]
y = data['target']

## Train Test Split

In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=0,
)

In [6]:
# Column means are computed on the training part only and reused for the test
# part, so no statistic derived from the test rows is used for imputation.
mean = X_train.mean()
X_train, X_test = (part.fillna(mean) for part in (X_train, X_test))

In [7]:
# Learn the standardisation parameters from the training part only, then apply
# the same fitted scaler to both parts.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Random Forest

In [12]:
# Load a previously pickled classifier from disk via load_object.
# NOTE(review): presumably a fitted random forest trained on features that
# went through the same imputation and scaling as above - confirm against the
# training notebook. Unpickling executes code, so the file must be trusted.
clf_rf = load_object('./model/randomforest.obj')

In [13]:
# Keep only column 1 of predict_proba for each split.
# NOTE(review): presumably the probability of the positive class (the second
# entry of clf_rf.classes_) - confirm.
train_prob_rf, test_prob_rf = (
    clf_rf.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [14]:
# Coarse sweep: 21 evenly spaced decision thresholds across [0, 1] (step 0.05).
th = np.linspace(0, 1, 21)
f_score_train, f_score_test = [], []
for t in th:
    # A row is predicted positive only when its probability is strictly above
    # the threshold; the mask is cast back to the probabilities' dtype so the
    # predictions are 0/1 values of the same type as before.
    pred_train = (train_prob_rf > t).astype(train_prob_rf.dtype)
    pred_test = (test_prob_rf > t).astype(test_prob_rf.dtype)
    f_score_train.append(f1_score(y_train, pred_train))
    f_score_test.append(f1_score(y_test, pred_test))
# One row per threshold, with the F1 score on each split side by side.
pd.DataFrame(
    {
        'threshold': th,
        'f1_train': f_score_train,
        'f1_test': f_score_test,
    }
)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,threshold,f1_train,f1_test
0,0.0,0.052156,0.052097
1,0.05,0.510706,0.474201
2,0.1,0.607798,0.548851
3,0.15,0.667243,0.605475
4,0.2,0.7126,0.647887
5,0.25,0.763882,0.687023
6,0.3,0.786903,0.73029
7,0.35,0.8059,0.751708
8,0.4,0.814026,0.756098
9,0.45,0.809651,0.774869


In [15]:
# Fine sweep: 21 evenly spaced decision thresholds across [0.4, 0.6] (step 0.01).
th = np.linspace(0.4, 0.6, 21)
f_score_train, f_score_test = [], []
for t in th:
    # A row is predicted positive only when its probability is strictly above
    # the threshold; the mask is cast back to the probabilities' dtype so the
    # predictions are 0/1 values of the same type as before.
    pred_train = (train_prob_rf > t).astype(train_prob_rf.dtype)
    pred_test = (test_prob_rf > t).astype(test_prob_rf.dtype)
    f_score_train.append(f1_score(y_train, pred_train))
    f_score_test.append(f1_score(y_test, pred_test))
# One row per threshold, with the F1 score on each split side by side.
pd.DataFrame(
    {
        'threshold': th,
        'f1_train': f_score_train,
        'f1_test': f_score_test,
    }
)

Unnamed: 0,threshold,f1_train,f1_test
0,0.4,0.814026,0.756098
1,0.41,0.808889,0.758621
2,0.42,0.814239,0.764268
3,0.43,0.815738,0.766917
4,0.44,0.814864,0.768041
5,0.45,0.809651,0.774869
6,0.46,0.803789,0.781915
7,0.47,0.795876,0.780488
8,0.48,0.794711,0.782609
9,0.49,0.790402,0.781163


## AdaBoost

In [16]:
# Load a previously pickled classifier from disk via load_object.
# NOTE(review): presumably a fitted AdaBoost model trained on features that
# went through the same imputation and scaling as above - confirm against the
# training notebook. Unpickling executes code, so the file must be trusted.
clf_boost = load_object('./model/adaboost.obj')

In [18]:
# Keep only column 1 of predict_proba for each split.
# NOTE(review): presumably the probability of the positive class (the second
# entry of clf_boost.classes_) - confirm.
train_prob_boost, test_prob_boost = (
    clf_boost.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [19]:
# Coarse sweep: 21 evenly spaced decision thresholds across [0, 1] (step 0.05).
th = np.linspace(0, 1, 21)
f_score_train, f_score_test = [], []
for t in th:
    # A row is predicted positive only when its probability is strictly above
    # the threshold; the mask is cast back to the probabilities' dtype so the
    # predictions are 0/1 values of the same type as before.
    pred_train = (train_prob_boost > t).astype(train_prob_boost.dtype)
    pred_test = (test_prob_boost > t).astype(test_prob_boost.dtype)
    f_score_train.append(f1_score(y_train, pred_train))
    f_score_test.append(f1_score(y_test, pred_test))
# One row per threshold, with the F1 score on each split side by side.
pd.DataFrame(
    {
        'threshold': th,
        'f1_train': f_score_train,
        'f1_test': f_score_test,
    }
)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,threshold,f1_train,f1_test
0,0.0,0.032787,0.032787
1,0.05,0.032787,0.032787
2,0.1,0.032787,0.032787
3,0.15,0.032787,0.032787
4,0.2,0.032787,0.032787
5,0.25,0.032787,0.032787
6,0.3,0.032787,0.032787
7,0.35,0.032787,0.032787
8,0.4,0.032803,0.032803
9,0.45,0.038086,0.03807


In [20]:
# Fine sweep: 21 evenly spaced decision thresholds across [0.4, 0.6] (step 0.01).
th = np.linspace(0.4, 0.6, 21)
f_score_train, f_score_test = [], []
for t in th:
    # A row is predicted positive only when its probability is strictly above
    # the threshold; the mask is cast back to the probabilities' dtype so the
    # predictions are 0/1 values of the same type as before.
    pred_train = (train_prob_boost > t).astype(train_prob_boost.dtype)
    pred_test = (test_prob_boost > t).astype(test_prob_boost.dtype)
    f_score_train.append(f1_score(y_train, pred_train))
    f_score_test.append(f1_score(y_test, pred_test))
# One row per threshold, with the F1 score on each split side by side.
pd.DataFrame(
    {
        'threshold': th,
        'f1_train': f_score_train,
        'f1_test': f_score_test,
    }
)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,threshold,f1_train,f1_test
0,0.4,0.032803,0.032803
1,0.41,0.032808,0.032806
2,0.42,0.032808,0.032806
3,0.43,0.032998,0.032992
4,0.44,0.036894,0.036897
5,0.45,0.038086,0.03807
6,0.46,0.038115,0.038088
7,0.47,0.055687,0.056342
8,0.48,0.215488,0.218404
9,0.49,0.544887,0.497409
