In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

from sklearn import preprocessing
import pickle

### Data Importing

In [4]:
data = pd.read_csv('./data/equip_failures_training_set.csv')

In [5]:
data = data.replace('na', np.nan) 
data = data.astype(float) #convert object type to 
    
X = data.iloc[:,2:]
y = data.target

### Train Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state = 0)

### Imputation

In [7]:
mean = X_train.mean()
X_train = X_train.fillna(mean)
X_test = X_test.fillna(mean)

### Normalization

In [8]:
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression

In [12]:
#Logistic Rregression
clf_lr = LogisticRegression(random_state=0, class_weight = 'balanced')
clf_lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [13]:
train_pred_lr = clf_lr.predict(X_train)
test_pred_lr = clf_lr.predict(X_test)

In [14]:
f1_train_lr = f1_score(y_train, train_pred_lr)
f1_test_lr = f1_score(y_test, test_pred_lr)
print(f1_train_lr, f1_test_lr)

0.5588020452885318 0.5084269662921348


## Random Forest

In [9]:
#Random Forest
clf_rf = RandomForestClassifier(n_estimators = 500, min_samples_split = 100)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
train_pred_rf = clf_rf.predict(X_train)
test_pred_rf = clf_rf.predict(X_test)

In [11]:
f1_train_rf = f1_score(y_train, train_pred_rf)
f1_test_rf = f1_score(y_test, test_pred_rf)
print(f1_train_rf, f1_test_rf)

0.7756272401433693 0.7877094972067039


## Adaboost 

In [15]:
clf_boost = AdaBoostClassifier(n_estimators = 200, random_state = 0, learning_rate = 0.5)
clf_boost.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.5, n_estimators=200, random_state=0)

In [16]:
train_pred_boost = clf_boost.predict(X_train)
test_pred_boost = clf_boost.predict(X_test)

In [17]:
f1_train_boost = f1_score(y_train, train_pred_boost)
f1_test_boost = f1_score(y_test, test_pred_boost)
print(f1_train_boost, f1_test_boost)

0.7696551724137931 0.8042328042328041


## SVM

In [None]:
clf_svm = SVC(kernel="linear", shrinking=False, gamma = "auto")
clf_svm.fit(X_train, y_train)

In [None]:
train_pred_svm = clf_svm.predict(X_train)
test_pred_svm = clf_svm.predict(X_test)

In [None]:
f1_train_svm = f1_score(y_train, train_pred_svm)
f1_test_svm = f1_score(y_test, test_pred_svm)
print(f1_train_svm, f1_test_svm)

### Save models

In [18]:
def save_object(o, filename):
    with open(filename, 'wb')  as f:
        pickle.dump(o, f)

In [22]:
save_object(clf_lr, './model/lr.obj')
save_object(clf_rf, './model/randomforest.obj')
save_object(clf_boost, './model/adaboost.obj')
save_object(clf_svm, './model/svm.obj')

In [25]:
save_object(scaler, './model/scaler.obj')
save_object(mean, './model/mean.obj')