In [30]:
from pyphm.datasets.milling import MillingPrepMethodA
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import scipy.io as sio # for reading matlab files
import zipfile
import gdown
import os
import pickle
import glob
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore") # supress all the matplotlib deprecation warnings
from IPython.display import clear_output, display, Image # incase you want to clear the output of a cell
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
path_colab = Path.cwd().parent.parent / 'content'

if path_colab.exists():
    proj_dir = Path.cwd()
else:
    proj_dir = Path.cwd().parent

print(proj_dir)

c:\Users\stajyer\tspipe


In [3]:
sub_dir = proj_dir / 'data' / 'processed' / 'window1024_stride64_test'

df = pd.read_csv(sub_dir / 'milling_processed.csv.gz')
df.head()

Unnamed: 0,cut_id,cut_no,case,time,ae_spindle,ae_table,vib_spindle,vib_table,smcdc,smcac,tool_class
0,0_0,0,1,0.0,0.219727,0.272827,0.733643,2.116699,6.84082,0.124512,0
1,0_0,0,1,0.004,0.246582,0.322266,0.778809,2.277832,6.660156,-0.561523,0
2,0_0,0,1,0.008,0.294189,0.283813,0.758057,2.34375,6.508789,-2.099609,0
3,0_0,0,1,0.012,0.323486,0.26001,0.726318,2.44873,6.542969,-2.731934,0
4,0_0,0,1,0.016,0.290527,0.253296,0.653076,2.546387,6.621094,-3.505859,0


In [4]:
raw_data_dir = proj_dir / 'data' / 'raw'

raw_milling_data = pd.read_csv(raw_data_dir / 'milling' / 'mill.csv')
raw_milling_data = raw_milling_data.drop(['Unnamed: 0'], axis=1)

In [5]:
path_processed_dir = proj_dir / 'data' / 'processed' / 'window1024_stride64_test'
df_features_downloaded = pd.read_csv(path_processed_dir / 'milling_features_comp_stride64_len1024.csv')
df_features_downloaded.head()

Unnamed: 0,cut_id,smcac__time_reversal_asymmetry_statistic__lag_1,smcac__time_reversal_asymmetry_statistic__lag_2,smcac__time_reversal_asymmetry_statistic__lag_3,smcac__c3__lag_1,smcac__c3__lag_2,smcac__c3__lag_3,smcac__cid_ce__normalize_True,smcac__cid_ce__normalize_False,smcac__symmetry_looking__r_0.0,...,smcdc__query_similarity_count__query_None__threshold_0.0,"smcdc__matrix_profile__feature_""min""__threshold_0.98","smcdc__matrix_profile__feature_""max""__threshold_0.98","smcdc__matrix_profile__feature_""mean""__threshold_0.98","smcdc__matrix_profile__feature_""median""__threshold_0.98","smcdc__matrix_profile__feature_""25""__threshold_0.98","smcdc__matrix_profile__feature_""75""__threshold_0.98",cut_no,case,tool_class
0,87_20,-0.009862,-0.00882,0.002745,-0.997182,-0.733741,-0.35414,11.424614,17.72574,0.0,...,,3.789384,5.941307,4.691854,4.617381,3.912163,5.372661,87,11,1
1,87_21,-0.014233,-0.00694,0.003651,-1.006841,-0.739986,-0.355168,11.403917,17.642734,0.0,...,,3.022031,5.579988,3.749164,3.582706,3.339192,4.022824,87,11,1
2,87_22,-0.023152,-0.024342,-0.000312,-1.067194,-0.764072,-0.354402,11.388923,17.604872,0.0,...,,2.896524,5.421811,3.517452,3.349038,3.180649,3.620394,87,11,1
3,87_23,-0.008329,-0.001119,0.017044,-1.070455,-0.782522,-0.37435,11.327472,17.49656,0.0,...,,2.866619,5.212993,3.357202,3.245255,3.119372,3.488481,87,11,1
4,87_24,0.006926,0.018749,0.019649,-1.014428,-0.744961,-0.359926,11.280507,17.361086,0.0,...,,2.866619,4.435201,3.249217,3.202348,3.0327,3.403513,87,11,1


In [6]:
# get the percentage of each tool_class
df_p = df_features_downloaded.groupby("tool_class").size() / df_features_downloaded.shape[0] * 100
df_p = df_p.reset_index()
df_p.columns = ["tool_class", "percentage"]

# get the count of each tool_class
df_c = df_features_downloaded.groupby("tool_class").size().to_frame().reset_index()
df_c.columns = ["tool_class", "count"]

# merge the two dataframes
df_pc = df_p.merge(df_c, on="tool_class")[["tool_class", "count", "percentage"]]
df_pc["percentage"] = df_pc["percentage"].round(2)

# Turning the results into a binary classification problem. ###

df_b = {'tool_class': [0, 1], 
        'count': [df_pc['count'][0] + df_pc['count'][1], df_pc['count'][2]], 
        'percentage': [df_pc['percentage'][0] + df_pc['percentage'][1], df_pc['percentage'][2]]}

df_b = pd.DataFrame(data=df_b)

df_features_downloaded_b = df_features_downloaded
df_features_downloaded_b = df_features_downloaded_b.replace(1, 0)
df_features_downloaded_b = df_features_downloaded_b.replace(2, 1)

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_selection import select_features
from tsfresh.feature_selection.relevance import calculate_relevance_table

imp = SimpleImputer(strategy='most_frequent')

df_features_downloaded_b = df_features_downloaded_b.dropna(axis=1, how='all')

imputed_df_tool_class = df_features_downloaded_b['tool_class']

imputed_df = imp.fit_transform(df_features_downloaded_b.drop(columns=['tool_class']))

scaler = StandardScaler()
imputed_df_sc = scaler.fit_transform(imputed_df)

imputed_df_sc = pd.DataFrame(imputed_df_sc, columns=df_features_downloaded_b.drop(columns=['tool_class']).columns)

relevance_table = calculate_relevance_table(imputed_df_sc, imputed_df_tool_class)

print(relevance_table)

relevance_table = relevance_table.sort_values("p_value", inplace=True)

                                                                                              feature  \
feature                                                                                                 
vib_spindle__quantile__q_0.7                                             vib_spindle__quantile__q_0.7   
vib_spindle__quantile__q_0.6                                             vib_spindle__quantile__q_0.6   
vib_spindle__fft_coefficient__attr_"abs"__coeff_0   vib_spindle__fft_coefficient__attr_"abs"__coeff_0   
vib_spindle__fft_coefficient__attr_"real"__coeff_0  vib_spindle__fft_coefficient__attr_"real"__coe...   
vib_spindle__quantile__q_0.8                                             vib_spindle__quantile__q_0.8   
...                                                                                               ...   
smcdc__number_crossing_m__m_0                                           smcdc__number_crossing_m__m_0   
smcdc__number_crossing_m__m_-1                         

In [8]:
df_selected = select_features(imputed_df_sc, imputed_df_tool_class)

print(df_selected)

      vib_spindle__quantile__q_0.7  vib_spindle__quantile__q_0.6  \
0                        -0.603003                     -0.603893   
1                        -0.603003                     -0.603893   
2                        -0.589502                     -0.603893   
3                        -0.589502                     -0.603893   
4                        -0.589502                     -0.603893   
...                            ...                           ...   
9035                     -0.514499                     -0.525499   
9036                     -0.514499                     -0.525499   
9037                     -0.529500                     -0.525499   
9038                     -0.529500                     -0.525499   
9039                     -0.529500                     -0.525499   

      vib_spindle__fft_coefficient__attr_"abs"__coeff_0  \
0                                             -0.609765   
1                                             -0.608637   
2     

In [9]:
#Imputed Raw Data
from sklearn.model_selection import train_test_split

#Forward selection

X = df_selected.iloc[:,:5]
y = imputed_df_tool_class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.head()

Unnamed: 0,vib_spindle__quantile__q_0.7,vib_spindle__quantile__q_0.6,"vib_spindle__fft_coefficient__attr_""abs""__coeff_0","vib_spindle__fft_coefficient__attr_""real""__coeff_0",vib_spindle__quantile__q_0.8
6354,0.927056,0.948303,0.960278,0.960278,0.923971
1541,0.237029,0.227081,0.190128,0.190128,0.247192
3133,1.150564,1.183484,1.201322,1.201322,1.125565
322,-0.01948,-0.008101,0.00153,0.00153,-0.0408
2031,-0.184487,-0.180567,-0.172324,-0.172324,-0.184795


K-Nearest Neighbours

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
scores = cross_val_score(knn, X, y, cv=5)

accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn, average='weighted')
recall = recall_score(y_test, y_pred_knn, average='weighted')
f1 = f1_score(y_test, y_pred_knn, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_knn)
class_report = classification_report(y_test, y_pred_knn)
cross_val_mean = scores.mean()
cross_val_std = scores.std()

print(f"Classifier: K-Neighbors Classifier")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"Cross Validation Mean: {cross_val_mean}")
print(f"Cross Validation Standard Deviation: {cross_val_std}")
print("----------------------------------------")

Classifier: K-Neighbors Classifier
Accuracy: 0.9605457227138643
Precision: 0.9586979840756734
Recall: 0.9605457227138643
F1 Score: 0.959369270903126
Confusion Matrix:
[[2465   41]
 [  66  140]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2506
           1       0.77      0.68      0.72       206

    accuracy                           0.96      2712
   macro avg       0.87      0.83      0.85      2712
weighted avg       0.96      0.96      0.96      2712

Cross Validation Mean: 0.8904867256637168
Cross Validation Standard Deviation: 0.02483647277344634
----------------------------------------


In [28]:
with open('knn_model.pkl', 'wb') as knn_file:
    pickle.dump(knn, knn_file)

Accuracy: 0.9605457227138643


RANDOM FOREST

In [29]:
from sklearn.ensemble import RandomForestClassifier

rff = RandomForestClassifier()
rff.fit(X_train, y_train)
y_pred_knn = rff.predict(X_test)
scores = cross_val_score(rff, X, y, cv=5)

accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn, average='weighted')
recall = recall_score(y_test, y_pred_knn, average='weighted')
f1 = f1_score(y_test, y_pred_knn, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_knn)
class_report = classification_report(y_test, y_pred_knn)
cross_val_mean = scores.mean()
cross_val_std = scores.std()

print(f"Classifier: K-Neighbors Classifier")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"Cross Validation Mean: {cross_val_mean}")
print(f"Cross Validation Standard Deviation: {cross_val_std}")
print("----------------------------------------")

Classifier: K-Neighbors Classifier
Accuracy: 0.952433628318584
Precision: 0.9511282004029681
Recall: 0.952433628318584
F1 Score: 0.9517202661918303
Confusion Matrix:
[[2448   58]
 [  71  135]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      2506
           1       0.70      0.66      0.68       206

    accuracy                           0.95      2712
   macro avg       0.84      0.82      0.83      2712
weighted avg       0.95      0.95      0.95      2712

Cross Validation Mean: 0.8929203539823009
Cross Validation Standard Deviation: 0.023509174771931103
----------------------------------------


In [31]:
with open('rf_model.pkl', 'wb') as rf_file:
    pickle.dump(rff, rf_file)

DECISION TREE

In [32]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_knn = dt.predict(X_test)
scores = cross_val_score(dt, X, y, cv=5)

accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn, average='weighted')
recall = recall_score(y_test, y_pred_knn, average='weighted')
f1 = f1_score(y_test, y_pred_knn, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_knn)
class_report = classification_report(y_test, y_pred_knn)
cross_val_mean = scores.mean()
cross_val_std = scores.std()

print(f"Classifier: K-Neighbors Classifier")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"Cross Validation Mean: {cross_val_mean}")
print(f"Cross Validation Standard Deviation: {cross_val_std}")
print("----------------------------------------")

Classifier: K-Neighbors Classifier
Accuracy: 0.946165191740413
Precision: 0.9471451061546725
Recall: 0.946165191740413
F1 Score: 0.9466348707801565
Confusion Matrix:
[[2429   77]
 [  69  137]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2506
           1       0.64      0.67      0.65       206

    accuracy                           0.95      2712
   macro avg       0.81      0.82      0.81      2712
weighted avg       0.95      0.95      0.95      2712

Cross Validation Mean: 0.8901548672566373
Cross Validation Standard Deviation: 0.02458292156949943
----------------------------------------


In [33]:
with open('dt_model.pkl', 'wb') as dt_file:
    pickle.dump(dt, dt_file)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

classifiers = [

    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree Classifier', DecisionTreeClassifier()),
    ('Random Forest Classifier', RandomForestClassifier()),
    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
    ('K-Nearest Neighbors Classifier', KNeighborsClassifier())

]

voting = VotingClassifier(estimators=classifiers, voting="soft")

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
scores = cross_val_score(voting, X, y, cv=5)

def evaluate(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print("Training Set Performance:")
    print(f"Accuracy: {accuracy_score(y_train, y_train_pred)}")
    print(f"Precision: {precision_score(y_train, y_train_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_train, y_train_pred, average='weighted')}")
    print(f"F1 Score: {f1_score(y_train, y_train_pred, average='weighted')}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_train, y_train_pred))
    print("Classification Report:")
    print(classification_report(y_train, y_train_pred))

    print("\nTest Set Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
    print(f"Precision: {precision_score(y_test, y_test_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_test, y_test_pred, average='weighted')}")
    print(f"F1 Score: {f1_score(y_test, y_test_pred, average='weighted')}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_test_pred))

    print(f"Cross Validation Mean: {scores.mean()}")
    print(f"Cross Validation Standard Deviation: {scores.std()}")

evaluate(voting, X_train, X_test, y_train, y_test)

Training Set Performance:
Accuracy: 0.9916245259165614
Precision: 0.9916994710670632
Recall: 0.9916245259165614
F1 Score: 0.9913861620943853
Confusion Matrix:
[[5870    0]
 [  53  405]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5870
           1       1.00      0.88      0.94       458

    accuracy                           0.99      6328
   macro avg       1.00      0.94      0.97      6328
weighted avg       0.99      0.99      0.99      6328


Test Set Performance:
Accuracy: 0.9553834808259587
Precision: 0.9520999056067361
Recall: 0.9553834808259587
F1 Score: 0.9529736369726993
Confusion Matrix:
[[2467   39]
 [  82  124]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2506
           1       0.76      0.60      0.67       206

    accuracy                           0.96      2712
   macro avg       0.86      0.79      0

In [35]:
with open('voting_5_soft.pkl', 'wb') as voting_file:
    pickle.dump(voting, voting_file)