<a href="https://colab.research.google.com/github/Subrina-Sirajee/Machine-Health-Prediction/blob/main/Machine_Health_Model's_Pipeline_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary module for mounting Google Drive
from google.colab import drive

# Mounting Google Drive to access files
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# Importing necessary libraries for data manipulation, machine learning, and model evaluation
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline  # For constructing pipelines
from sklearn.compose import ColumnTransformer  # For preprocessing of specific columns
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from joblib import dump  # For model serialization
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [3]:
# Path to the CSV file containing the dataset
data_path = '/content/drive/MyDrive/Datasets/labeled_data.csv'

# Reading the dataset from the CSV file into a pandas DataFrame
df = pd.read_csv(data_path)


In [4]:
# Separating features (X) and target variable (y) from the DataFrame
X = df.drop(columns=['health_status'])  # Features
y = df['health_status']  # Target variable

# Encoding the target variable using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Splitting the dataset into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X_train.head()

Unnamed: 0,Temperature Sensor-1,Vibration Sensor-1,Vibration Sensor-2,Vibration Sensor-3
4898,33.802727,910,4611,5763
1616,34.19915,514,759,519
6075,32.221367,4094,1576,4654
7333,21.374384,4523,5462,1056
5639,44.162132,2065,475,271


In [6]:
# ColumnTransformer to apply transformations to specific columns
trf1 = ColumnTransformer(
    transformers=[
        # Applying PowerTransformer to selected numerical features
        ('power_transform', PowerTransformer(), ['Temperature Sensor-1', 'Vibration Sensor-1', 'Vibration Sensor-2', 'Vibration Sensor-3']),
        # Applying MinMaxScaler to all numerical features except the target
        ('min_max_scaler', MinMaxScaler(), slice(0, -1))
    ])

# XGBoost classifier for modeling
trf2 = XGBClassifier()

In [7]:
# Creating a pipeline to sequentially apply transformations and model fitting
pipe = make_pipeline(trf1, trf2)

# Fitting the pipeline to the training data
pipe.fit(X_train, y_train)

In [8]:
# Making predictions using the trained pipeline
y_pred = pipe.predict(X_test)

In [9]:
y_pred

array([2, 0, 1, ..., 2, 1, 1])

In [13]:
# Calculating the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

1.0


In [17]:
# Performing cross-validation using cross_val_score
cv_accuracy = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()
print(cv_accuracy)

0.9975328947368421


In [19]:
# Creating a StratifiedKFold object for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Generating cross-validated predictions using cross_val_predict
y_pred_cv = cross_val_predict(pipe, X, y, cv=cv)

# Printing the cross-validated predictions
print(y_pred_cv)

[0 0 0 ... 2 1 1]


In [20]:
# Generating the classification report
classification_rep = classification_report(y, y_pred_cv, target_names=label_encoder.classes_)

# Printing the classification report
print("Classification Report:\n", classification_rep)


Classification Report:
                  precision    recall  f1-score   support

     Bad Health       0.99      1.00      0.99      1081
    Good Health       1.00      1.00      1.00      3792
Moderate Health       1.00      0.99      1.00      2727

       accuracy                           1.00      7600
      macro avg       0.99      1.00      1.00      7600
   weighted avg       1.00      1.00      1.00      7600



In [21]:
# Defining the file path and name for the saved model
model_file_path = '/content/drive/MyDrive/Datasets/xgboost_pipeline_model.joblib'
model_name = 'xgboost_pipeline_model.joblib'

# Saving the pipeline (including preprocessing and model) to a joblib file
dump(pipe, model_file_path)

# Printing a confirmation message indicating that the model has been saved
print("Model saved as", model_name)


Model saved as xgboost_pipeline_model.joblib
