In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [81]:
# Import the excel spreadsheets in the data folder
installer_df = pd.read_excel('../data/Installer.xlsx')
involver_df = pd.read_excel('../data/Involver.xlsx')

  warn(msg)
  warn(msg)


In [82]:
# Attempt to merge using a subset of key columns
merge_on_columns = ['Site', 'Vessel_Name', 'Wo_No']

df = pd.concat([installer_df, involver_df], axis=0)

# feature_columns = ['Object', 'Group', 'Symptom', 'Error_Cause', 'Cause_Details', 'Error_Class', 'Discovery', 'Completion_Note', 'Action_Taken', 'Work_Description', 'Directive']
feature_columns = ['Object', 'Group', 'Object_Type','Completion_Note', 'Work_Description', 'Directive']
target_column = 'EBS1'

# Filter the dataframe for the selected columns
df = df[feature_columns + [target_column]].dropna()

In [83]:
# Encode the target column
label_encoder = LabelEncoder()
df[target_column] = label_encoder.fit_transform(df[target_column].astype(str))

In [84]:
# Remove classes with less than 2 items
class_counts = df[target_column].value_counts()
df = df[df[target_column].isin(class_counts[class_counts >= 2].index)]

# Create the vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=20000, stop_words='english')

# Split the data into its constituent parts (description and status) and then create a prediction
X = df[feature_columns]
y = df[target_column]

# Transform the features using CountVectorizer
X_vec = vectorizer.fit_transform(X.apply(lambda x: ' '.join(x), axis=1))

# Train test split with stratification to ensure all classes are represented
X_train, X_valid, y_train, y_valid = train_test_split(X_vec, y, test_size=0.2, random_state=30, stratify=y)

# Re-encode the target variable after splitting
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_valid = label_encoder.transform(y_valid)

# Create XGBoost Model with initial parameters
xgb_model = XGBClassifier(
    objective='multi:softmax',
    use_label_encoder=False,  # Avoid using label encoder inside XGBoost
    eval_metric='mlogloss',  # Specify evaluation metric
    random_state=42
)

# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5, 7, 9]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(xgb_model, param_grid, n_jobs=-1, cv=3, scoring='accuracy', verbose=2, n_iter=50, random_state=42)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.



In [85]:
# Print the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation accuracy: ", random_search.best_score_)

# Evaluate on the test set with the best model
best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_valid)
test_accuracy_best = accuracy_score(y_valid, y_pred_best)
print("Test accuracy with best model: ", test_accuracy_best)

# Print the classification report for the best model
print("Classification Report with best model:")
print(classification_report(y_valid, y_pred_best))

Best parameters found:  {'subsample': 0.6, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.2, 'colsample_bytree': 1.0}
Best cross-validation accuracy:  0.8838509316770186
Test accuracy with best model:  0.8925619834710744
Classification Report with best model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.00      0.00      0.00         1
           2       0.50      0.33      0.40         3
           3       0.96      1.00      0.98        27
           5       1.00      1.00      1.00         1
           6       0.50      1.00      0.67         2
           7       1.00      0.33      0.50         3
           8       1.00      1.00      1.00         3
           9       1.00      0.50      0.67         2
          10       0.92      1.00      0.96        34
          12       1.00      0.50      0.67         2
          13       1.00      1.00      1.00         1
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
