### Importing the data

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from numpy.fft import fft, ifft

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [3]:
df = pd.read_csv('DF.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ps1_1,ps1_2,ps1_3,ps1_4,ps1_5,ps1_6,ps1_7,ps1_8,ps1_9,...,vs1_51,vs1_52,vs1_53,vs1_54,vs1_55,vs1_56,vs1_57,vs1_58,vs1_59,vs1_60
0,0,151.47,151.45,151.52,151.27,150.8,150.69,153.89,154.67,152.88,...,0.554,0.552,0.545,0.553,0.553,0.539,0.544,0.545,0.535,0.543
1,1,151.11,151.12,151.16,150.92,150.7,150.62,152.4,153.21,152.81,...,0.555,0.547,0.548,0.544,0.536,0.542,0.54,0.533,0.531,0.534
2,2,150.81,150.79,150.84,150.65,150.35,150.23,152.03,152.81,152.44,...,0.543,0.544,0.543,0.554,0.544,0.544,0.545,0.544,0.53,0.534
3,3,150.48,150.47,150.52,150.31,150.04,149.98,151.63,152.48,152.24,...,0.549,0.538,0.553,0.543,0.553,0.555,0.544,0.543,0.543,0.542
4,4,150.41,150.35,150.24,150.12,149.87,149.71,151.64,152.37,151.78,...,0.546,0.546,0.544,0.552,0.539,0.54,0.549,0.542,0.533,0.537


In [4]:
profile = pd.read_csv('profile.txt', delimiter = '\t', header = None)

In [5]:
profile.columns = ["Cooler Condition","Valve Condition","Internal Pump Leakage","Hydraulic Accumulator","Stable Flag"]

In [6]:
profile_stable = profile['Stable Flag']

In [9]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [10]:
#df = df.reset_index()
#df = df.rename(columns = {'index':'Cycle'})
#Final = pd.concat([df, profile], axis = 1)
#Final.head()

## test - train split

In [12]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, profile_stable, train_size=0.7, test_size=0.3, random_state=42,stratify = profile_stable)

In [13]:
y_traindf = pd.DataFrame(y_train)
y_testdf = pd.DataFrame(y_test)

print(y_train.value_counts())
print(y_test.value_counts())

0    1014
1     529
Name: Stable Flag, dtype: int64
0    435
1    227
Name: Stable Flag, dtype: int64


### Balancing class using SMOTE

In [14]:

## SMOTE method is used to balance data
from imblearn import under_sampling 
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE

smt = SMOTE(random_state=45, k_neighbors=5)
X_train, y_train = smt.fit_sample(X_train,y_train)
X_train = pd.DataFrame(X_train, columns= X_train.columns)

#y_train_smt = pd.DataFrame(y_train_smt)

### Standardisation of data

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler() ## x-mean/std

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
X_train_scaled = pd.DataFrame(X_train_scaled)
X_test_scaled = pd.DataFrame(X_test_scaled)

In [13]:
X_train_scaled.columns = df.columns 
X_test_scaled.columns = df.columns 

In [14]:
print(X_test_scaled.shape)
print(X_train_scaled.shape)

(662, 43681)
(2028, 43681)


In [15]:
X_train_scaled = X_train_scaled.drop(['Unnamed: 0'], axis = 1)
X_test_scaled = X_test_scaled.drop(['Unnamed: 0'], axis = 1)

(2028, 43681)

# Extracting features using Tsfresh

In [16]:
from tsfresh import select_features
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features

In [17]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters

In [None]:
#pip install git+https://github.com/blue-yonder/tsfresh

### Extract features from all sensors 


In [19]:
def tsfreshextract(df):
    df["id"] = df.index
    df = df.melt(id_vars="id", var_name="time").sort_values(["id", "time"]).reset_index(drop=True)
    df['T'] = df.time.str.extract('(\d+)')
    df['T'].astype(int)
    
#     y = profile['Stable Flag']
#     y = pd.DataFrame(y)
#     y = y.reset_index()
#     y = y.rename(columns = {'index':'id'})
    
#     new = pd.merge(left = df, right = y, how = 'inner', on = ['id'])
    
#     y = new.pop('Stable Flag')
    
    df = df.drop(['time'], axis = 1)
    
    X = extract_features(df, column_id="id", column_sort="T", default_fc_parameters = MinimalFCParameters(), 
                         n_jobs = 1, impute_function=impute)
   
    
    return X

In [20]:
X_trainextract = tsfreshextract(X_train_scaled)


KeyboardInterrupt: 

In [None]:
X_testextract = tsfreshextract(X_test_scaled)


In [None]:
X_trainextract.to_csv('X_trainextract.csv')

In [None]:
X_testextract.to_csv('X_testextract.csv')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import cross_validate,GridSearchCV,RandomizedSearchCV

from scipy.stats import randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
def evaluate_model(dt_classifier,y_train,X_train,y_test,X_test):
    print("Train Precision :", (precision_score(y_train, dt_classifier.predict(X_train)))*100)
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Precision :", (precision_score(y_test, dt_classifier.predict(X_test)))*100)
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

### LR

In [None]:
l_classifier = LogisticRegression(random_state=100, max_iter= 400)

In [None]:
l_classifier.fit(X_trainextract,y_train)

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = l_classifier.predict_proba(X_testextract)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
print(classification_report(y_test, l_classifier.predict(X_testextract)))

### Noise 

In [None]:
import numpy as np 
mu, sigma = 0, np.std(X_testextract)*0.20
noise = np.random.normal(mu, sigma, X_testextract.shape)

X_testnoise = X_testextract + noise

In [None]:
print(classification_report(y_test, l_classifier.predict(X_testnoise)))

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = l_classifier.predict_proba(X_testnoise)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## SVM

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
svc=SVC(kernel="rbf",probability=True)

In [None]:
svc.fit(X_trainextract,y_train)

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = svc.predict_proba(X_testextract)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
print(classification_report(y_test, svc.predict(X_testextract)))

In [None]:
print(classification_report(y_test, svc.predict(X_testnoise)))

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = svc.predict_proba(X_testnoise)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

### RF 

In [None]:
rfc=RandomForestClassifier(n_estimators=100,random_state=42)

In [None]:
rfc.fit(X_trainextract,y_train)

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = rfc.predict_proba(X_testextract)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Importing required packages for visualization
from IPython.display import Image  
#from sklearn.externals.six import StringIO  
#from sklearn.tree import export_graphviz
#import pydotplus, graphviz
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1 , class_weight = 'balanced')

In [None]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [5,9],
    'n_estimators': [10, 30, 50, 100, 200]
}

## maximum depth is 1,2,5,10,20
## minimum sample leaf is 5,10,20,50,100
## maximum no. of features is 2,3,4
## no. of trees is 10, 30,50,100,200

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier_rf, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "precision")

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "precision")

In [None]:
%%time
grid_search.fit(X_trainextract,y_train)

In [None]:
rf_best = grid_search.best_estimator_ ## best estimator is assigned to rf_best
rf_best

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = rf_best.predict_proba(X_testextract)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
print(classification_report(y_test, rf_best.predict(X_testextract)))

### Noise

In [None]:
print(classification_report(y_test, rf_best.predict(X_testnoise)))

In [None]:
import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = rf_best.predict_proba(X_testnoise)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()