In [111]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [113]:
"""load data"""
data = pd.read_csv("/Users/wangtiles/AWS_Quicksight_project/ai4i2020.csv")
# data_parquet = pd.read_parquet("/Users/wangtiles/AWS_Quicksight_project/ai4i2020.parquet", engine = "pyarrow")

In [132]:
"""exploring"""
# print(data.info())
# (data.head(0))


# (data_parquet.head())

# data_parquet
# data_parquet.head()
data.describe()
(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
 14  temperature_diff       

In [117]:
"""
GOAL:
    Predict machine failure (binary classification)
    Visualizing results in Amazon QuickSight.
"""


'\nGOAL:\n    Predict machine failure (binary classification)\n    Visualizing results in Amazon QuickSight.\n'

In [118]:
"""Feature engineering"""

data['temperature_diff'] = data['Process temperature [K]'] - data['Air temperature [K]']
data['power'] = data['Torque [Nm]'] * data['Rotational speed [rpm]'] #since power failure depends on this

#save the cleaned data
data.to_csv("/Users/wangtiles/AWS_Quicksight_project/dataset/cleaned_data.csv", index=False)




In [122]:
"""Developing the ML model"""
features = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 
            'Torque [Nm]', 'Tool wear [min]', 'temperature_diff', 'power']
X = data[features]
Y = data['Machine failure']

#splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# training a model
# Random Forest classifier
randforest = RandomForestClassifier(random_state = 42)
randforest.fit(X_train, Y_train)

#training accuracy
Y_train_pred = randforest.predict(X_train)
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print("Training Accuracy:", train_accuracy)

#testing accuracy
Y_pred = randforest.predict(X_test)
print("Testing Accuracy:", accuracy_score(Y_test, Y_pred))

#classification report with precision, recall, f1 score values of both the classes (0 and 1)
print("Classification Report:\n", classification_report(Y_test, Y_pred))

#confusion matrix
conf_mat = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", conf_mat)





Training Accuracy: 1.0
Testing Accuracy: 0.9905
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1939
           1       0.96      0.72      0.82        61

    accuracy                           0.99      2000
   macro avg       0.97      0.86      0.91      2000
weighted avg       0.99      0.99      0.99      2000

Confusion Matrix:
 [[1937    2]
 [  17   44]]


In [124]:
"""Saving predictions"""
data['predicted_failure'] = randforest.predict(X)
data.to_csv("/Users/wangtiles/AWS_Quicksight_project/dataset/predictions.csv", index=False)

In [None]:
"""Now onto AWS!!!"""