# 1. Opening everything and creating the tables

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Analysing defects seperatly
defects_df = pd.read_csv('/Users/julien/Desktop/Impact Project/all_files/data_processed/Defects.csv', delimiter = ";")
defects_df['time'] = pd.to_datetime(defects_df['time'], format='%Y-%m-%d %H:%M:%S')
defects_df

In [None]:
defects_df['TraceabilityCode'].value_counts()

In [None]:
# Set the folder path where your CSV files are located
folder_path = '/Users/julien/Desktop/Impact Project/all_files/data_processed'

# Get the list of all csv files in the folder except 'Defects.csv'
file_list = [file for file in os.listdir(folder_path) if file.endswith('.csv')and file != 'Defects.csv' and file.endswith('values.csv')]

# Order the list alphabetically
file_list.sort()

file_list

In [None]:
len(file_list)

In [None]:
# Open all files in the folder and store them as a dataframe
df_list = []
for file in file_list:
    df = pd.read_csv(folder_path+'/'+file, delimiter = ";")
    df_list.append(df)

# Concatenate all dataframes into one
df = pd.concat(df_list, axis=0, ignore_index=True)

#df

In [None]:
df_new = df.groupby('TraceabilityCode').first().reset_index()
df_new

In [None]:
df_new['TraceabilityCode'].nunique()

In [None]:
# Merge the two dataframes
df_merged = pd.merge(df_new, defects_df, on='TraceabilityCode', how='outer')
df_merged

In [None]:
# In df_merged, if in the column 'TraceabilityCode' a value appears twice, merge them together, in case of conflict in another column, pick the first value
df_merged_new = df_merged.groupby('TraceabilityCode').first().reset_index()
df_merged_new

In [None]:
# Show the number of null values in each column
df_merged_new.isnull().sum()

In [2]:
df_merged_new=pd.read_csv("C:/Users/yara-/OneDrive/Desktop/capstone/merged.csv", delimiter = ";")

  df_merged_new=pd.read_csv("C:/Users/yara-/OneDrive/Desktop/capstone/merged.csv", delimiter = ";")


# 2. Feature Engineering

In [3]:
# Drop empty columns from the DataFrame
df_merged_new.dropna(axis=1, how='all', inplace=True)

# Display the updated DataFrame
print(df_merged_new.head())

   TraceabilityCode time_x AmbientHumidity time.value AmbientTemperature  \
0  1010122318135217    NaN             NaN        NaN                NaN   
1  1010122318135259    NaN             NaN        NaN                NaN   
2  1010122318135412    NaN             NaN        NaN                NaN   
3  1010122318141922    NaN             NaN        NaN                NaN   
4  1010122319141031    NaN             NaN        NaN                NaN   

  ClutchBrakeTemperatureUnitOil ClutchBrakeWaterTemperature  \
0                           NaN                         NaN   
1                           NaN                         NaN   
2                           NaN                         NaN   
3                           NaN                         NaN   
4                           NaN                         NaN   

   CushionOilDegradation CushionPumpMaxPower1 CushionPumpMaxPower2  ...  \
0                    NaN                  NaN                  NaN  ...   
1             

In [4]:
# Add a new column "Has_Defect" with binary values
df_merged_new["Has_Defect"] = np.where(df_merged_new["Defect"].notnull(), 1, 0)
df_merged_new

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionOilDegradation,CushionPumpMaxPower1,CushionPumpMaxPower2,...,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,WorkStatusCylinder,time_y,Defect,Has_Defect
0,1010122318135217,,,,,,,,,,...,,,,,,,,2022-11-14 15:04:29,Odciśnięte miejsce (marks),1
1,1010122318135259,,,,,,,,,,...,,,,,,,,2022-11-14 15:06:25,Inne,1
2,1010122318135412,,,,,,,,,,...,,,,,,,,2022-11-14 15:11:43,Odciśnięte miejsce (marks),1
3,1010122318141922,,,,,,,,,,...,,,,,,,,2022-11-14 15:22:56,Odciśnięte miejsce (marks),1
4,1010122319141031,,,,,,,,,,...,,,,,,,,2022-11-15 15:54:59,Inne,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143989,1012623048222115,2023-02-17 23:21:18.062 CET,,2023-02-17 23:21:23.461 CET,,451,,,,,...,-23,221,-1328,2045,442,394,,,,0
143990,1012723011144348,,,,,,,,,,...,,,,,,,,2023-01-11 15:45:25,Deformacja (deformation),1
143991,1012723011145624,,,,,,,,,,...,,,,,,,,2023-01-11 16:01:25,Deformacja (deformation),1
143992,1012823068072827,,,,,,,,,,...,,,,,,,,2023-03-09 08:30:41,Deformacja (deformation),1


In [5]:
df_merged_new["Has_Defect"].value_counts()

0    143466
1       528
Name: Has_Defect, dtype: int64

In [6]:
# Count the number of null values in each column
null_counts = df_merged_new.isnull().sum()

# Display the null value counts for each column
for column, count in null_counts.items():
    print(f"Column '{column}' has {count} null value(s).")

Column 'TraceabilityCode' has 0 null value(s).
Column 'time_x' has 237 null value(s).
Column 'AmbientHumidity' has 356 null value(s).
Column 'time.value' has 237 null value(s).
Column 'AmbientTemperature' has 70084 null value(s).
Column 'ClutchBrakeTemperatureUnitOil' has 5239 null value(s).
Column 'ClutchBrakeWaterTemperature' has 90190 null value(s).
Column 'CushionOilDegradation' has 143993 null value(s).
Column 'CushionPumpMaxPower1' has 102276 null value(s).
Column 'CushionPumpMaxPower2' has 98421 null value(s).
Column 'CushionPumpMaxPower3' has 98218 null value(s).
Column 'CushionPumpMeanPower1' has 75383 null value(s).
Column 'CushionPumpMeanPower2' has 70696 null value(s).
Column 'CushionPumpMeanPower3' has 66212 null value(s).
Column 'CushionTemperatureUnitOil' has 86894 null value(s).
Column 'CushionWaterFlow' has 53306 null value(s).
Column 'CushionWaterTemperature' has 93245 null value(s).
Column 'Cylinder1MaxForce' has 58930 null value(s).
Column 'Cylinder1MaxParalelismErr

In [7]:
df_defects=df_merged_new

In [8]:
# Drop duplicate time column and the original Defect column

columns_to_drop = ['time_y', 'Defect']
df_merged_new = df_merged_new.drop(columns=columns_to_drop)

df_merged_new.head()

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionOilDegradation,CushionPumpMaxPower1,CushionPumpMaxPower2,...,SlideAccelerationUp,SlideAdjustment,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,WorkStatusCylinder,Has_Defect
0,1010122318135217,,,,,,,,,,...,,,,,,,,,,1
1,1010122318135259,,,,,,,,,,...,,,,,,,,,,1
2,1010122318135412,,,,,,,,,,...,,,,,,,,,,1
3,1010122318141922,,,,,,,,,,...,,,,,,,,,,1
4,1010122319141031,,,,,,,,,,...,,,,,,,,,,1


In [9]:
# Count the number of null values in each column
null_counts = df_merged_new.isnull().sum()

# Display the null value counts for each column
for column, count in null_counts.items():
    print(f"Column '{column}' has {count} null value(s).")

Column 'TraceabilityCode' has 0 null value(s).
Column 'time_x' has 237 null value(s).
Column 'AmbientHumidity' has 356 null value(s).
Column 'time.value' has 237 null value(s).
Column 'AmbientTemperature' has 70084 null value(s).
Column 'ClutchBrakeTemperatureUnitOil' has 5239 null value(s).
Column 'ClutchBrakeWaterTemperature' has 90190 null value(s).
Column 'CushionOilDegradation' has 143993 null value(s).
Column 'CushionPumpMaxPower1' has 102276 null value(s).
Column 'CushionPumpMaxPower2' has 98421 null value(s).
Column 'CushionPumpMaxPower3' has 98218 null value(s).
Column 'CushionPumpMeanPower1' has 75383 null value(s).
Column 'CushionPumpMeanPower2' has 70696 null value(s).
Column 'CushionPumpMeanPower3' has 66212 null value(s).
Column 'CushionTemperatureUnitOil' has 86894 null value(s).
Column 'CushionWaterFlow' has 53306 null value(s).
Column 'CushionWaterTemperature' has 93245 null value(s).
Column 'Cylinder1MaxForce' has 58930 null value(s).
Column 'Cylinder1MaxParalelismErr

In [10]:
# Some columns are not completely empty but almost, we define a threshold and drop them

# Define the threshold for null values to drop a column
null_threshold = 143993

# List to store columns to be dropped
columns_to_drop = []

# Loop through each column and check the null value count
for column, count in null_counts.items():
    print(f"Column '{column}' has {count} null value(s).")
    if count >= null_threshold:
        columns_to_drop.append(column)

# Drop columns with null values above the threshold
df_merged_new = df_merged_new.drop(columns=columns_to_drop)

Column 'TraceabilityCode' has 0 null value(s).
Column 'time_x' has 237 null value(s).
Column 'AmbientHumidity' has 356 null value(s).
Column 'time.value' has 237 null value(s).
Column 'AmbientTemperature' has 70084 null value(s).
Column 'ClutchBrakeTemperatureUnitOil' has 5239 null value(s).
Column 'ClutchBrakeWaterTemperature' has 90190 null value(s).
Column 'CushionOilDegradation' has 143993 null value(s).
Column 'CushionPumpMaxPower1' has 102276 null value(s).
Column 'CushionPumpMaxPower2' has 98421 null value(s).
Column 'CushionPumpMaxPower3' has 98218 null value(s).
Column 'CushionPumpMeanPower1' has 75383 null value(s).
Column 'CushionPumpMeanPower2' has 70696 null value(s).
Column 'CushionPumpMeanPower3' has 66212 null value(s).
Column 'CushionTemperatureUnitOil' has 86894 null value(s).
Column 'CushionWaterFlow' has 53306 null value(s).
Column 'CushionWaterTemperature' has 93245 null value(s).
Column 'Cylinder1MaxForce' has 58930 null value(s).
Column 'Cylinder1MaxParalelismErr

In [11]:
column_to_drop = 'WorkStatusCylinder'
df_merged_new = df_merged_new.drop(columns=column_to_drop)

df_merged_new.head()

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,...,SlideAccelerationDown,SlideAccelerationUp,SlideAdjustment,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Has_Defect
0,1010122318135217,,,,,,,,,,...,,,,,,,,,,1
1,1010122318135259,,,,,,,,,,...,,,,,,,,,,1
2,1010122318135412,,,,,,,,,,...,,,,,,,,,,1
3,1010122318141922,,,,,,,,,,...,,,,,,,,,,1
4,1010122319141031,,,,,,,,,,...,,,,,,,,,,1


In [12]:
df_merged_new.dtypes

TraceabilityCode         int64
time_x                  object
AmbientHumidity         object
time.value              object
AmbientTemperature      object
                         ...  
SlideSpeedDown          object
SlideSpeedUp            object
TotalMaxForce           object
TotalPressForceValue    object
Has_Defect               int32
Length: 69, dtype: object

In [13]:
df_merged_new = df_merged_new.apply(pd.to_numeric, errors='coerce')
df_merged_new.dtypes

TraceabilityCode          int64
time_x                  float64
AmbientHumidity         float64
time.value              float64
AmbientTemperature      float64
                         ...   
SlideSpeedDown          float64
SlideSpeedUp            float64
TotalMaxForce           float64
TotalPressForceValue    float64
Has_Defect                int32
Length: 69, dtype: object

In [14]:
# Fillna with median of each column
df_merged_new = df_merged_new.fillna(df_merged_new.median())
df_merged_new.describe()

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,...,SlideAccelerationDown,SlideAccelerationUp,SlideAdjustment,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Has_Defect
count,143994.0,0.0,143994.0,0.0,143994.0,143994.0,143994.0,143994.0,143994.0,143994.0,...,143994.0,143994.0,143994.0,143994.0,143994.0,143994.0,143994.0,143994.0,143994.0,143994.0
mean,1010176000000000.0,,19.011264,,25.988854,49.019112,30.007854,44.993916,47.999833,44.999083,...,232.212551,1504.685897,1096.991687,-116.612685,14.366863,-308.272678,181.093191,31.469089,30.96138,0.003667
std,95433680000.0,,0.521933,,0.263175,0.976971,0.272242,0.846831,0.851857,0.865396,...,100.837154,158.105646,4.353338,28.050935,5.437509,39.777181,30.069451,8.552816,9.610758,0.060443
min,1010122000000000.0,,11.0,,22.0,34.0,0.0,36.0,36.0,0.0,...,-115.0,610.0,0.0,-274.0,0.0,-424.0,29.0,5.0,0.0,0.0
25%,1010123000000000.0,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,0.0
50%,1010123000000000.0,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,0.0
75%,1010223000000000.0,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,0.0
max,1012823000000000.0,,30.0,,28.0,57.0,34.0,75.0,60.0,162.0,...,1011.0,2599.0,1450.0,-1.0,171.0,0.0,662.0,108.0,85.0,1.0


# 3. Models

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from scipy.stats import randint as sp_randint

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

def calculate_metrics(y_true, y_pred, y_pred_proba):
    metric_names = ['Accuracy', 'AUC-ROC', 'Precision', 'Recall', 'F1 Score', 'Gini Coefficient']
    metrics = []

    accuracy = accuracy_score(y_true, y_pred)
    auc_roc = roc_auc_score(y_true, y_pred_proba)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    gini = 2 * auc_roc - 1

    metrics.extend([accuracy, auc_roc, precision, recall, f1, gini])

    for name, metric in zip(metric_names, metrics):
        print(f"{name}: {metric}")

    #return metrics

### Basic Random Forest

In [None]:
# Basic Random Forest Classifier and prediction of probabilities

# Create the X and y variables
X = df_merged_new.drop(["Has_Defect", 'time_x', 'time.value'], axis=1)
y = df_merged_new["Has_Defect"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the model on the undersampled training data
rf_model = rf_model.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

y_pred = rf_model.predict(X_test)

In [None]:
calculate_metrics(y_test,y_pred,y_pred_proba)

### Random Forest with undersampling

In [None]:
# Create the X and y variables
X = df_merged_new.drop(["Has_Defect", 'time_x', 'time.value'], axis=1)
y = df_merged_new["Has_Defect"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Apply undersampling to the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the model on the undersampled training data
rf_model.fit(X_train_undersampled, y_train_undersampled)

# Predict probabilities on the test set
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Define the threshold value
threshold = 0.7

# Convert probabilities to binary predictions based on the threshold
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluate the ROC AUC score
#roc_auc = roc_auc_score(y_test, y_pred_proba)
#print(f"ROC AUC Score: {roc_auc}")

# Evaluate the ROC AUC score using binary predictions
roc_auc_binary = roc_auc_score(y_test, y_pred)
print(f"ROC AUC Score (Binary): {roc_auc_binary}")

In [None]:
calculate_metrics(y_test,y_pred,y_pred_proba)

#### With hyperparameter tuning

In [None]:
# Create the X and y variables
X = df_merged_new.drop(["Has_Defect", 'time_x', 'time.value'], axis=1)
y = df_merged_new["Has_Defect"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Apply undersampling to the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': sp_randint(100, 1000),
    'max_depth': sp_randint(3, 10),
    'min_samples_split': sp_randint(2, 10),
    'min_samples_leaf': sp_randint(1, 5),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Create a random forest classifier
rf_model = RandomForestClassifier(random_state=0)

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, n_iter=10, cv=5, scoring='roc_auc', random_state=0)

# Fit the RandomizedSearchCV object on the undersampled training data
random_search.fit(X_train_undersampled, y_train_undersampled)

# Use the best estimator obtained from RandomizedSearchCV for prediction on the test set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

y_pred = best_model.predict(X_test)

In [None]:
calculate_metrics(y_test,y_pred,y_pred_proba)

### Random Forest with CV

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Create the X and y variables
X = df_merged_new.drop(["Has_Defect", 'time_x', 'time.value'], axis=1)
y = df_merged_new["Has_Defect"]

# Apply undersampling to the data
undersampler = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X, y)

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# Define the scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'auc_roc': 'roc_auc',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'gini': make_scorer(lambda y_true, y_pred: 2 * roc_auc_score(y_true, y_pred) - 1)
}

# Perform cross-validation and calculate the metrics
cv_results = cross_validate(rf_model, X_undersampled, y_undersampled, cv=5, scoring=scoring)

# Print the scores for each metric and each fold
print("Accuracy Scores for each fold:", cv_results['test_accuracy'])
print("AUC-ROC Scores for each fold:", cv_results['test_auc_roc'])
print("Precision Scores for each fold:", cv_results['test_precision'])
print("Recall Scores for each fold:", cv_results['test_recall'])
print("F1 Scores for each fold:", cv_results['test_f1'])
print("Gini Coefficients for each fold:", cv_results['test_gini'])

# Calculate and print the mean and standard deviation of each metric
print("Mean Accuracy:", cv_results['test_accuracy'].mean())
print("Mean AUC-ROC Score:", cv_results['test_auc_roc'].mean())
print("Mean Precision:", cv_results['test_precision'].mean())
print("Mean Recall:", cv_results['test_recall'].mean())
print("Mean F1 Score:", cv_results['test_f1'].mean())
print("Mean Gini Coefficient:", cv_results['test_gini'].mean())
print("Standard Deviation of Accuracy Scores:", cv_results['test_accuracy'].std())
print("Standard Deviation of AUC-ROC Scores:", cv_results['test_auc_roc'].std())
print("Standard Deviation of Precision Scores:", cv_results['test_precision'].std())
print("Standard Deviation of Recall Scores:", cv_results['test_recall'].std())
print("Standard Deviation of F1 Scores:", cv_results['test_f1'].std())
print("Standard Deviation of Gini Coefficients:", cv_results['test_gini'].std())

#### With hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Create the X and y variables
X = df_merged_new.drop(["Has_Defect", 'time_x', 'time.value'], axis=1)
y = df_merged_new["Has_Defect"]

# Apply undersampling to the data
undersampler = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X, y)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': sp_randint(100, 1000),
    'max_depth': sp_randint(3, 10),
    'min_samples_split': sp_randint(2, 10),
    'min_samples_leaf': sp_randint(1, 5),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Create a random forest classifier
rf_model = RandomForestClassifier(random_state=0)

# Define the scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'auc_roc': 'roc_auc',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'gini': make_scorer(lambda y_true, y_pred: 2 * roc_auc_score(y_true, y_pred) - 1)
}

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, n_iter=10, cv=5, scoring=scoring, refit='auc_roc', random_state=0)

# Fit the RandomizedSearchCV object on the undersampled training data
random_search.fit(X_undersampled, y_undersampled)

# Print the best hyperparameters and the corresponding best score
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Score: ", random_search.best_score_)

# Use the best estimator obtained from RandomizedSearchCV for prediction and evaluation on the test set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

In [None]:
calculate_metrics(y_test,y_pred,y_pred_proba)

### Ensemble Model using bagging 

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import randint as sp_randint
from imblearn.under_sampling import RandomUnderSampler

# Create the X and y variables
X = df_merged_new.drop(["Has_Defect", 'time_x', 'time.value'], axis=1)
y = df_merged_new["Has_Defect"]

# Apply undersampling to the data
undersampler = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X, y)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'base_estimator__max_depth': sp_randint(3, 10),
    'base_estimator__min_samples_split': sp_randint(2, 10),
    'base_estimator__min_samples_leaf': sp_randint(1, 5),
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9],
    'n_estimators': sp_randint(100, 1000)
}

# Create a list to hold the ensemble models
ensemble_models = []

# Define the scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'auc_roc': 'roc_auc',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'gini': make_scorer(lambda y_true, y_pred: 2 * roc_auc_score(y_true, y_pred) - 1)
}

# Create multiple BaggingClassifier models
num_models = 5  # Define the number of models in the ensemble
for i in range(num_models):
    # Create a decision tree base estimator
    base_estimator = DecisionTreeClassifier(random_state=i)
    
    # Create a BaggingClassifier
    bagging_model = BaggingClassifier(base_estimator=base_estimator, random_state=i)
    
    # Create a RandomizedSearchCV object
    random_search = RandomizedSearchCV(estimator=bagging_model, param_distributions=param_grid, n_iter=10, cv=5, scoring=scoring, refit='auc_roc', random_state=i)
    
    # Fit the RandomizedSearchCV object on the undersampled training data
    random_search.fit(X_undersampled, y_undersampled)
    
    # Add the best model to the ensemble
    ensemble_models.append(random_search.best_estimator_)

# Use the ensemble of models for prediction and evaluation on the test set
y_pred_probas = []
for model in ensemble_models:
    y_pred_probas.append(model.predict_proba(X_test)[:, 1])

# Combine the predictions of all models
y_pred_proba_avg = sum(y_pred_probas) / num_models
y_pred_avg = (y_pred_proba_avg > 0.5).astype(int)

In [None]:
calculate_metrics(y_test,y_pred_avg,y_pred_proba_avg)

Mentor: For next session do this
- Add other metrics (done)
    - F1
    - Precision/Recall, confusion matrix
    - Roc auc
    - RMSE
    - Skewness
    - Coin skappa metric
    - Gini
- Increase training data (no)
- Create other features
    - Mix/group values, which ones? Why? etc
    - Dimensionality reduction using PCA/LDA
- Add regularisation L1/l2 (just hyperparameter tuning for random forest)
- Feature selection for RF (i tbhink walid did this before)
- Remove outliers, but RF is robust for outliers
- Ensemble models (bagging)
- Use a RNN taking into account the steps
    - Reorder the variables in P1, …, P6
    - When the df is created, if variable is in p1 add one before, or something like this
-multiclass predictive model (top 3 and other)


# Multiclass models:

In [16]:
df_defects.head()

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionOilDegradation,CushionPumpMaxPower1,CushionPumpMaxPower2,...,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,WorkStatusCylinder,time_y,Defect,Has_Defect
0,1010122318135217,,,,,,,,,,...,,,,,,,,2022-11-14 15:04:29,Odciśnięte miejsce (marks),1
1,1010122318135259,,,,,,,,,,...,,,,,,,,2022-11-14 15:06:25,Inne,1
2,1010122318135412,,,,,,,,,,...,,,,,,,,2022-11-14 15:11:43,Odciśnięte miejsce (marks),1
3,1010122318141922,,,,,,,,,,...,,,,,,,,2022-11-14 15:22:56,Odciśnięte miejsce (marks),1
4,1010122319141031,,,,,,,,,,...,,,,,,,,2022-11-15 15:54:59,Inne,1


In [17]:
df_merged_new.head()

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,...,SlideAccelerationDown,SlideAccelerationUp,SlideAdjustment,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Has_Defect
0,1010122318135217,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1
1,1010122318135259,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1
2,1010122318135412,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1
3,1010122318141922,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1
4,1010122319141031,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,229.0,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1


In [18]:
df_merged_new['Defect']=df_defects['Defect']

In [20]:
df_merged_new['Defect'].isnull().sum()

143466

In [19]:
df_merged_new

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,...,SlideAccelerationUp,SlideAdjustment,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Has_Defect,Defect
0,1010122318135217,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Odciśnięte miejsce (marks)
1,1010122318135259,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Inne
2,1010122318135412,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Odciśnięte miejsce (marks)
3,1010122318141922,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Odciśnięte miejsce (marks)
4,1010122319141031,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Inne
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143989,1012623048222115,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-23.0,14.0,-315.0,181.0,31.0,30.0,0,
143990,1012723011144348,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Deformacja (deformation)
143991,1012723011145624,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Deformacja (deformation)
143992,1012823068072827,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,1510.0,1097.0,-118.0,14.0,-315.0,181.0,31.0,30.0,1,Deformacja (deformation)


In [19]:
df_merged_new["Defect"].value_counts()

Pikos (marks/contamination)          142
Deformacja (deformation)             113
Odciśnięte miejsce (marks)            95
Pofalowanie (waves)                   43
Przeciągnięcie (thinning)             43
Błąd materiału (meterial failure)     35
Pęknięcie (cracks)                    26
Rysa (scratch)                        13
Wgniot (dent)                         10
Inne                                   6
Korozja (corrosion)                    2
Name: Defect, dtype: int64

In [21]:

# Define the values to one-hot encode
values_to_encode = ['Pikos (marks/contamination)', 'Deformacja (deformation)', 'Odciśnięte miejsce (marks)']

# Create a new column for one-hot encoding
df_merged_new['Defect_encoded'] = df_merged_new['Defect'].apply(lambda x: x if x in values_to_encode else 'other_defects')

# Perform one-hot encoding on the new column
df_encoded = pd.get_dummies(df_merged_new['Defect_encoded'], prefix='Defect', dummy_na=False)

# Concatenate the encoded features with the original DataFrame
df_multiclass = pd.concat([df_merged_new, df_encoded], axis=1)

# Drop the original "Defect" and "Defect_encoded" columns
df_multiclass.drop(['Defect', 'Defect_encoded'], axis=1, inplace=True)

df_multiclass

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,...,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Has_Defect,Defect_Deformacja (deformation),Defect_Odciśnięte miejsce (marks),Defect_Pikos (marks/contamination),Defect_other_defects
0,1010122318135217,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,1,0,0
1,1010122318135259,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,0,0,1
2,1010122318135412,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,1,0,0
3,1010122318141922,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,1,0,0
4,1010122319141031,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143989,1012623048222115,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,0,0,0,0,1
143990,1012723011144348,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,1,0,0,0
143991,1012723011145624,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,1,0,0,0
143992,1012823068072827,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,1,0,0,0


In [22]:
df_multiclass.loc[df_multiclass['Has_Defect'] == 0, 'Defect_other_defects'] = 0

In [23]:
df_multiclass

Unnamed: 0,TraceabilityCode,time_x,AmbientHumidity,time.value,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,...,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Has_Defect,Defect_Deformacja (deformation),Defect_Odciśnięte miejsce (marks),Defect_Pikos (marks/contamination),Defect_other_defects
0,1010122318135217,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,1,0,0
1,1010122318135259,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,0,0,1
2,1010122318135412,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,1,0,0
3,1010122318141922,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,1,0,0
4,1010122319141031,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143989,1012623048222115,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,0,0,0,0,0
143990,1012723011144348,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,1,0,0,0
143991,1012723011145624,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,1,0,0,0
143992,1012823068072827,,19.0,,26.0,49.0,30.0,45.0,48.0,45.0,...,14.0,-315.0,181.0,31.0,30.0,1,1,0,0,0


In [26]:
df_multiclass.isna().sum()

TraceabilityCode                      0
AmbientHumidity                       0
AmbientTemperature                    0
ClutchBrakeTemperatureUnitOil         0
ClutchBrakeWaterTemperature           0
                                     ..
TotalPressForceValue                  0
Defect_Deformacja (deformation)       0
Defect_Odciśnięte miejsce (marks)     0
Defect_Pikos (marks/contamination)    0
Defect_other_defects                  0
Length: 70, dtype: int64

In [24]:
#Drop has_defect
df_multiclass = df_multiclass.drop(['Has_Defect'], axis=1)

In [25]:
#drop time_x and time.value
df_multiclass = df_multiclass.drop(['time_x','time.value'], axis=1)

In [26]:
df_multiclass

Unnamed: 0,TraceabilityCode,AmbientHumidity,AmbientTemperature,ClutchBrakeTemperatureUnitOil,ClutchBrakeWaterTemperature,CushionPumpMaxPower1,CushionPumpMaxPower2,CushionPumpMaxPower3,CushionPumpMeanPower1,CushionPumpMeanPower2,...,SlidePositionDown,SlidePositionUp,SlideSpeedDown,SlideSpeedUp,TotalMaxForce,TotalPressForceValue,Defect_Deformacja (deformation),Defect_Odciśnięte miejsce (marks),Defect_Pikos (marks/contamination),Defect_other_defects
0,1010122318135217,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,0,1,0,0
1,1010122318135259,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,0,0,0,1
2,1010122318135412,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,0,1,0,0
3,1010122318141922,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,0,1,0,0
4,1010122319141031,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143989,1012623048222115,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-23.0,14.0,-315.0,181.0,31.0,30.0,0,0,0,0
143990,1012723011144348,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,1,0,0,0
143991,1012723011145624,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,1,0,0,0
143992,1012823068072827,19.0,26.0,49.0,30.0,45.0,48.0,45.0,47.0,50.0,...,-118.0,14.0,-315.0,181.0,31.0,30.0,1,0,0,0


In [27]:
# Define y (target variable)
y = df_multiclass.filter(regex='^Defect')

# Define X (feature matrix)
X = df_multiclass.drop(y.columns, axis=1)

In [28]:
y

Unnamed: 0,Defect_Deformacja (deformation),Defect_Odciśnięte miejsce (marks),Defect_Pikos (marks/contamination),Defect_other_defects
0,0,1,0,0
1,0,0,0,1
2,0,1,0,0
3,0,1,0,0
4,0,0,0,1
...,...,...,...,...
143989,0,0,0,0
143990,1,0,0,0
143991,1,0,0,0
143992,1,0,0,0


## Random Forest:

In [29]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

In [30]:
from sklearn.metrics import classification_report

In [31]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.50      0.07      0.12        15
           2       0.62      0.16      0.26        31
           3       0.75      0.07      0.14        40

   micro avg       0.60      0.08      0.14       115
   macro avg       0.47      0.08      0.13       115
weighted avg       0.49      0.08      0.13       115
 samples avg       0.00      0.00      0.00       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Weighted Average: The weighted average takes into account both the performance of each class and the class distribution in the test set. If your classes are imbalanced (i.e., different classes have different numbers of samples), the weighted average provides a more representative overall performance metric. It gives more weight to classes with a larger number of samples.

Based on the provided classification report, we can interpret the performance of the model as follows:

Precision: Precision measures the proportion of correctly predicted instances for each class. In this case, the precision for class 0 is 0.00, meaning that no instances were correctly predicted as class 0. The precision for class 1 is 0.50, indicating that 50% of the instances predicted as class 1 were correct. The precision for class 2 is 0.62, indicating that 62% of the instances predicted as class 2 were correct. The precision for class 3 is 0.75, suggesting that 75% of the instances predicted as class 3 were correct.

Recall: Recall, also known as sensitivity, measures the proportion of true instances correctly predicted for each class. The recall for class 0 is 0.00, indicating that no instances of class 0 were correctly predicted. The recall for class 1 is 0.07, meaning that only 7% of the true instances of class 1 were correctly predicted. The recall for class 2 is 0.16, suggesting that 16% of the true instances of class 2 were correctly predicted. The recall for class 3 is 0.07, indicating that only 7% of the true instances of class 3 were correctly predicted.

F1-score: The F1-score is the harmonic mean of precision and recall, providing a balanced measure of a model's performance. The F1-score for class 0 is 0.00, as there were no instances correctly predicted for this class. The F1-score for class 1 is 0.12, representing a balance between precision and recall for this class. The F1-score for class 2 is 0.26, indicating a relatively better balance between precision and recall compared to class 1. The F1-score for class 3 is 0.14, also indicating a balance between precision and recall, although the values are relatively low.

Support: Support refers to the number of instances in each class. The support for class 0 is 29, for class 1 is 15, for class 2 is 31, and for class 3 is 40.

Micro Average: The micro average calculates the metrics globally by considering all instances together. The micro average precision is 0.60, micro average recall is 0.08, and micro average F1-score is 0.14. These values indicate the overall performance of the model across all classes.

Macro Average: The macro average calculates the metrics independently for each class and then takes the average. The macro average precision is 0.47, macro average recall is 0.08, and macro average F1-score is 0.13. These values provide an average performance measure across all classes, giving equal importance to each class.

Weighted Average: The weighted average calculates the metrics based on the support (number of instances) for each class. The weighted average precision is 0.49, weighted average recall is 0.08, and weighted average F1-score is 0.13. These values provide an average performance measure, with higher weight given to classes with more instances.

Samples Average: The samples average calculates the metrics by considering each instance individually rather than aggregating at the class level. The samples average precision, recall, and F1-score are all 0.00, indicating that none of the instances were correctly predicted across all classes.

In summary, the classification report shows that the model's performance is generally low, with low precision, recall, and F1-scores for most classes. It suggests that the model is struggling to accurately predict the different

### With hyperparameter tuning using RandomizedSearch: 

In [32]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Create a Random Forest classifier
rf_model = RandomForestClassifier()

# Perform randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_grid, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Create a new Random Forest classifier with the best hyperparameters
best_rf_model = RandomForestClassifier(**best_params)

# Train the model with the best hyperparameters
best_rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_rf_model.predict(X_test)
y_pred_proba = best_rf_model.predict_proba(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.50      0.07      0.12        15
           2       0.62      0.16      0.26        31
           3       0.75      0.07      0.14        40

   micro avg       0.64      0.08      0.14       115
   macro avg       0.47      0.08      0.13       115
weighted avg       0.49      0.08      0.13       115
 samples avg       0.00      0.00      0.00       115



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## XGBoost:

In [None]:
import xgboost as xgb

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))

### With hyperparametertuning using RandomizedSearch:

In [24]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
import xgboost as xgb

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}

# Create an XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Create a new XGBoost classifier with the best hyperparameters
best_xgb_model = xgb.XGBClassifier(**best_params)

# Train the model with the best hyperparameters
best_xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_xgb_model.predict(X_test)
y_pred_proba = best_xgb_model.predict_proba(X_test)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

#### The multiclass models seem to perform in general poorly probably because of insufficient training data, therefore it's probably better in this case to stick to predicting for each row if it has a defect (any type) or not only. 