In [2]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')

from imblearn.over_sampling import SMOTE

# Suppress warnings and default INFO logging.
# NOTE(review): ignoring every warning also hides library deprecation notices;
# consider narrowing the filter while developing.
import warnings
warnings.filterwarnings('ignore')

import logging
# Raise the root logger threshold so that only CRITICAL records pass.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

%matplotlib inline


import seaborn as sns
sns.set()

In [3]:
# !pip install imblearn

In [4]:
# Path of the training CSV, relative to the notebook's working directory.
filepath = "./TrainingSet.csv"
# Load the full file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filepath)

In [5]:
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V211,V212,V213,V214,V215,V216,V217,V218,V219,Machine_State
0,,5.135988,5.0,5.28125,0.058368,2059.53125,0.109375,,5.135988,5.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,Good
1,,5.908042,5.75,6.09375,0.063232,2369.125,0.109375,,5.908042,5.75,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,Good
2,0.0,9.24487,0.1875,13.75,3.890521,3707.19305,12.91253,0.0,9.24487,0.1875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good
3,0.19375,18.247452,1.4375,20.25,3.328545,7317.22805,14.43752,0.19375,18.247452,1.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good
4,0.19375,21.64209,1.4375,25.5938,6.094741,8678.47815,21.61566,0.19375,21.64209,1.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good


# Part A: Preprocessing

In [6]:
# Replace every missing value with 0, in all columns.
# NOTE(review): this applies to 'Machine_State' as well, so rows with a missing
# label presumably become the extra class 0 seen right below — confirm against the raw file.
df = df.fillna(0)

In [7]:
df['Machine_State'].unique()

array(['Good', 'Bad', 0], dtype=object)

In [8]:
# Write the dtype of each column to disk as a one-row table.
x = df.dtypes.to_frame().T
x.to_csv('./feature_file.csv')

In [59]:
# Count the rows carrying each distinct label value in the raw frame.
print("Total number of data samples belonging to type 0 =", int((df['Machine_State'] == 0).sum()))
print("Total number of data samples belonging to type Good =",int((df['Machine_State'] == 'Good').sum()))
print("Total number of data samples belonging to type Bad =",int((df['Machine_State'] == 'Bad').sum()))

Total number of data samples belonging to type 0 = 19
Total number of data samples belonging to type Good = 3240
Total number of data samples belonging to type Bad = 463


**Clearly the given data is imbalanced.**

In [10]:
# Drop the rows that belong to the undefined class (label 0).
# .copy() gives `data` its own storage, so later modifications act on `data`
# itself rather than on a slice of `df` (avoids SettingWithCopyWarning and
# edits that silently do not stick).
data = df[df['Machine_State']!=0].copy()

In [11]:
data['Machine_State'].unique()

array(['Good', 'Bad'], dtype=object)

In [12]:
# Encode the target as integers: Good -> 0, Bad -> 1.
# The result is assigned back instead of calling replace(inplace=True) on a
# column selection, which can operate on a temporary object and leave `data`
# unchanged. astype(int) guarantees a numeric label column for the estimators.
data = data.assign(
    Machine_State=data["Machine_State"].replace({"Good": 0, "Bad": 1}).astype(int)
)

In [13]:
# Remove exact duplicate rows. Reassigning (instead of inplace=True) keeps the
# cell free of in-place mutation on a frame that may be a slice of `df`.
data = data.drop_duplicates()

In [14]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V211,V212,V213,V214,V215,V216,V217,V218,V219,Machine_State
0,0.0,5.135988,5.0,5.28125,0.058368,2059.53125,0.109375,0.0,5.135988,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,5.908042,5.75,6.09375,0.063232,2369.125,0.109375,0.0,5.908042,5.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,9.24487,0.1875,13.75,3.890521,3707.19305,12.91253,0.0,9.24487,0.1875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.19375,18.247452,1.4375,20.25,3.328545,7317.22805,14.43752,0.19375,18.247452,1.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.19375,21.64209,1.4375,25.5938,6.094741,8678.47815,21.61566,0.19375,21.64209,1.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [15]:
# Separate the target vector from the feature matrix.
y = data['Machine_State']
X = data.drop(columns='Machine_State')

In [16]:
# Standardize the features; the scaler statistics are estimated on all rows of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## 1 Upsampling of data

In [29]:
# Upsample the minority class with SMOTE (seeded for reproducibility).
# fit_resample is the supported imbalanced-learn API; the older fit_sample
# alias was deprecated and has been removed from recent releases.
sm = SMOTE(random_state=2)
X_up, y_up = sm.fit_resample(X_scaled, y)

In [64]:
# Class counts after SMOTE (label 0 = Good, label 1 = Bad); counted once and reused.
up_counts = y_up.value_counts()
print("Total number of data samples belonging to type Good =",up_counts[0])
print("Total number of data samples belonging to type Bad =",up_counts[1])

Total number of data samples belonging to type Good = 3229
Total number of data samples belonging to type Bad = 3229


In [31]:
# Hold out 20% of the SMOTE-upsampled data for validation (fixed seed, shuffled).
X_train_up, X_val_up, y_train_up, y_val_up = train_test_split(
    X_up,
    y_up,
    test_size=0.2,
    random_state=66,
    shuffle=True,
)

## 2 Random upsampling + downsampling

In [32]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [33]:
# Random oversampling: duplicate minority rows until minority/majority = 0.2.
# Both samplers are seeded so the resampled data set is identical on every run.
over_sample = RandomOverSampler(sampling_strategy=0.2, random_state=2)
X_over, y_over = over_sample.fit_resample(X, y)

# Random undersampling: drop majority rows until minority/majority = 0.5.
under_sample = RandomUnderSampler(sampling_strategy=0.5, random_state=2)
# fit and apply the transform
# NOTE(review): this pipeline starts from the unscaled X, whereas the SMOTE path
# uses X_scaled — confirm that this is intended.
X_under, y_under = under_sample.fit_resample(X_over, y_over)

**Original data distribution**

In [61]:
# Class counts of the cleaned, un-resampled labels; counted once and reused.
original_counts = y.value_counts()
print("Total number of data samples belonging to type Good =",original_counts[0])
print("Total number of data samples belonging to type Bad =",original_counts[1])

Total number of data samples belonging to type Good = 3229
Total number of data samples belonging to type Bad = 463


**Data distribution after random oversampling**

In [62]:
# Class counts after the random oversampling step; counted once and reused.
over_counts = y_over.value_counts()
print("Total number of data samples belonging to type Good =",over_counts[0])
print("Total number of data samples belonging to type Bad =",over_counts[1])

Total number of data samples belonging to type Good = 3229
Total number of data samples belonging to type Bad = 645


**Data distribution after random oversampling + random undersampling**

In [63]:
# Class counts after oversampling followed by undersampling; counted once and reused.
under_counts = y_under.value_counts()
print("Total number of data samples belonging to type Good =",under_counts[0])
print("Total number of data samples belonging to type Bad =",under_counts[1])

Total number of data samples belonging to type Good = 1290
Total number of data samples belonging to type Bad = 645


## 3 Feature selection

In [37]:
# Keep the 150 features with the highest ANOVA F-score against the label.
fs = SelectKBest(f_classif, k=150)
# fit the selector on the resampled data, then reduce that same data
X_selected = fs.fit(X_under, y_under).transform(X_under)

In [66]:
# Top 5 features

# Same selector restricted to the 5 best-scoring features.
fs_top5 = SelectKBest(f_classif, k=5)
# `top5` holds the reduced feature matrix (values only, not the column names)
top5 = fs_top5.fit(X_under, y_under).transform(X_under)

# Part B: Model selection

## 1. Logistic regression

### 1.1 With 80-20 split

In [20]:
# Logistic regression with a generous iteration budget for the lbfgs solver.
logreg = LogisticRegression(solver='lbfgs', max_iter=10000)

# Fit on the training split and predict the held-out validation split.
y_val_pred = logreg.fit(X_train_up, y_train_up).predict(X_val_up)

In [23]:
def evaluating_metrics(y_val, y_val_pred, model_name):
    """Print precision, recall and F2 score for a set of predictions.

    False negatives carry the higher penalty in this problem, so recall is the
    quantity to maximize; the F2 score (F-beta with beta=2) reflects that.

    Args:
        y_val: true labels.
        y_val_pred: predicted labels, aligned with ``y_val``.
        model_name: label inserted into the printed report line.

    Returns:
        None. The report line is written to standard output.
    """
    report = (
        model_name,
        precision_score(y_val, y_val_pred),
        recall_score(y_val, y_val_pred),
        fbeta_score(y_val, y_val_pred, beta=2.0),
    )
    print('Result for %s: precision=%.3f, recall=%.3f, f2_score=%.3f' % report)

In [24]:
# Report the hold-out metrics for the logistic regression predictions.
model_name = 'Logistic regression'
evaluating_metrics(y_val=y_val_up, y_val_pred=y_val_pred, model_name=model_name)

Result for Logistic regression: precision=0.828, recall=0.919, f2_score=0.899


### 1.2 With cross validation

In [25]:
from sklearn.metrics import make_scorer

In [26]:
def cross_validation(estimator, X, y, scoring, cv, return_train_score=False):
    """Cross-validate ``estimator`` and print its mean test F2 score.

    Thin wrapper around ``cross_validate`` that uses all available cores
    (``n_jobs=-1``).

    Args:
        estimator: the model to evaluate.
        X: feature matrix.
        y: target labels.
        scoring: scorer mapping passed to ``cross_validate``; it must contain
            the key ``'f2_score'`` because the summary reads ``test_f2_score``.
        cv: cross-validation strategy (an int or a splitter object).
        return_train_score: whether training scores are included in the result.

    Returns:
        The dictionary returned by ``cross_validate``.

    Note:
        The printed message always says "10 fold", whatever ``cv`` is.
    """
    cv_results = cross_validate(
        estimator,
        X,
        y,
        scoring=scoring,
        cv=cv,
        return_train_score=return_train_score,
        n_jobs=-1,
    )
    mean_f2 = cv_results['test_f2_score'].mean()
    print('Average test f2 score in 10 fold cross validation = %.2f'%(mean_f2))
    return cv_results

In [27]:
# 10-fold CV of the logistic regression on the SMOTE-upsampled data.
# The single scorer is F-beta with beta=2, registered under the name 'f2_score'.
scorer = {'f2_score': make_scorer(fbeta_score, beta=2)}
logreg_cv_up = cross_validation(
    logreg,
    X_up,
    y_up,
    scoring=scorer,
    cv=10,
    return_train_score=True,
)

Average test f2 score in 10 fold cross validation = 0.89


**Using only the upsampled data might overfit the model, since most of the minority-class data is repeated; hence we also use random upsampling + downsampling.**

### 1.3 With random upsample + random downsampled data 

In [40]:
# CV on the randomly oversampled + undersampled data (X_under, y_under).
# Note: the result is stored under the name logreg_cv_up again.
logreg_cv_up = cross_validation(logreg, X_under, y_under, scoring=scorer, cv=10, return_train_score=True)

Average test f2 score in 10 fold cross validation = 0.53


### 1.4 With feature selection 

In [41]:
# CV on the feature-selected version of the over/undersampled data (X_selected, y_under).
# Note: the result is stored under the name logreg_cv_up again.
logreg_cv_up = cross_validation(logreg, X_selected, y_under, scoring=scorer, cv=10, return_train_score=True)

Average test f2 score in 10 fold cross validation = 0.50


## 2. Random forest

In [42]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

### 2.1 With 80-20 split

In [43]:
# Random forest on the SMOTE-upsampled training split.
# random_state fixes the bootstrap and feature sampling so the hold-out score is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=1)
rf_model.fit(X_train_up,y_train_up)
# Predict the held-out validation split.
y_val_pred = rf_model.predict(X_val_up)

In [44]:
# Report the hold-out metrics for the random forest predictions.
model_name = 'Random Forest'
evaluating_metrics(y_val=y_val_up, y_val_pred=y_val_pred, model_name=model_name)

Result for Random Forest: precision=0.977, recall=0.988, f2_score=0.985


### 2.2 With cross validation

### i) RandomForestClassifier

In [47]:
# Random forest; random_state makes the fitted trees reproducible across runs.
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=1)
# define evaluation procedure: 10 stratified folds, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# CV with upsampling
random_forest_cv = cross_validation(rf_model, X_up, y_up, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.99


### ii) BalancedRandomForestClassifier

In [48]:
# Balanced random forest evaluated on the original (not resampled) X, y.
# random_state makes the per-tree resampling reproducible.
rf_model_2 = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest_balanced_cv = cross_validation(rf_model_2, X, y, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.78


**Using only the upsampled data might overfit the model, since most of the minority-class data is repeated; hence we also use random upsampling + downsampling.**

### 2.3 With random upsample + random downsampled data 

In [49]:
# Random forest on the randomly oversampled + undersampled data.
# random_state makes the cross-validated score reproducible.
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=1)
random_forest_cv = cross_validation(rf_model, X_under, y_under, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.86


In [50]:
# Balanced random forest on the randomly oversampled + undersampled data.
# random_state makes the cross-validated score reproducible.
rf_model_2 = BalancedRandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=1)
random_forest_balanced_cv = cross_validation(rf_model_2, X_under, y_under, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.92


### 2.4 With feature selection

In [51]:
# Random forest on the feature-selected data.
# random_state makes the cross-validated score reproducible.
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=1)
random_forest_cv = cross_validation(rf_model, X_selected, y_under, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.85


In [52]:
# Balanced random forest on the feature-selected data.
# random_state makes the cross-validated score reproducible.
rf_model_2 = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest_balanced_cv = cross_validation(rf_model_2, X_selected, y_under, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.91


## 3. Ada Boost

In [53]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [54]:
# AdaBoost with 1000 boosting rounds on the feature-selected data.
# random_state makes the cross-validated score reproducible.
ad_model = AdaBoostClassifier(n_estimators=1000, random_state=1)
ada_boost_cv = cross_validation(ad_model, X_selected, y_under, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.86


## 4. Gradient Boost

In [55]:
# Gradient boosting with 1000 boosting stages on the feature-selected data.
# random_state makes the cross-validated score reproducible.
model_GB = GradientBoostingClassifier(n_estimators=1000, random_state=1)
GB_cv = cross_validation(model_GB, X_selected, y_under, scoring=scorer, cv=cv)

Average test f2 score in 10 fold cross validation = 0.90


**BalancedRandomForestClassifier with selected features appears to perform best in terms of F2 score and training time.**

## TODO
1. ~Upsampling + downsampling~ 
2. ~Feature selection with L1 or t-test~
3. ~Standardization of data~ 
4. Visualize
5. ~Try Ada boosting and gradient boosting~
6. ~API endpoint~ 
7. Document 
8. ~Upload on git~
9. What will change in your model if the cost of testing a good device was $5000?
