In [None]:
# Import necessary libraries
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Train and evaluate a maintenance-needed classifier on vehicle telemetry.
#
# Steps: load CSV -> engineer ratio feature -> hold out a test set ->
# tune a (SMOTE -> scaler -> PCA -> XGBoost) pipeline with randomized search ->
# report metrics on the untouched test set -> persist the fitted pipeline.

# Load the telemetry data from CSV
data = pd.read_csv('vehicle_telemetry.csv')

# Feature Engineering: Adding new feature (mileage per wear_tear ratio)
data['mileage_wear_ratio'] = data['mileage'] / data['wear_tear']

# Feature selection
feature_cols = ['mileage', 'engine_temp', 'oil_pressure', 'wear_tear', 'mileage_wear_ratio']

# A wear_tear of 0 turns the ratio into inf (or NaN for 0/0), which the
# resampling and PCA steps cannot handle. Keep such rows out of the modelling
# set and report how many were excluded instead of failing later on.
valid_rows = np.isfinite(data['mileage_wear_ratio'])
n_dropped = int((~valid_rows).sum())
if n_dropped:
    print(f'Dropping {n_dropped} row(s) with a non-finite mileage_wear_ratio')

X = data.loc[valid_rows, feature_cols]
y = data.loc[valid_rows, 'maintenance_needed']

# Split the data into training and testing sets (80% train, 20% test).
# The split happens BEFORE any resampling so the test set contains only real
# observations at their real class ratio; stratify keeps that ratio equal in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing + model steps. They are combined in an imblearn Pipeline so
# that, during cross-validation, SMOTE / scaler / PCA are fitted on the
# training part of each fold only. Oversampling before splitting would leak
# information about held-out rows into training and inflate the scores.
# NOTE: SMOTE needs more minority-class rows than its k_neighbors (default 5)
# in every training fold; lower k_neighbors if the minority class is tiny.
smote = SMOTE(random_state=42)
scaler = StandardScaler()
# Optional: PCA for dimensionality reduction (can experiment with the number of
# components; 5 components on 5 features only rotates the data)
pca = PCA(n_components=5)
# XGBoost model (you can try RandomForestClassifier or others too)
model = XGBClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('smote', smote),    # applied while fitting only, skipped at predict time
    ('scaler', scaler),
    ('pca', pca),
    ('model', model),
])

# Set up hyperparameter tuning (RandomizedSearch for faster exploration).
# The 'model__' prefix routes each parameter to the XGBoost step.
param_dist = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7, 9],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}

# Perform RandomizedSearchCV for faster hyperparameter tuning
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=50,
                                   cv=5, random_state=42, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)

# Best pipeline after tuning (refitted on the whole training set)
best_model = random_search.best_estimator_

# Make predictions on the test set (raw features; the pipeline transforms them)
y_pred = best_model.predict(X_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
# Strip the pipeline step prefix so the report shows plain XGBoost names
best_params = {name.split('__', 1)[1]: value
               for name, value in random_search.best_params_.items()}
print(f'Accuracy: {accuracy:.2f}')
print('Best Parameters:', best_params)
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Save the trained pipeline for future predictions. The fitted scaler and PCA
# are stored together with the model; the model alone would be unusable because
# it was trained on scaled, PCA-transformed inputs.
joblib.dump(best_model, 'vehicle_maintenance_improved_model_xgboost.pkl')

# To load and use the model later (new_data holds the raw, unscaled columns
# listed in feature_cols, including the engineered mileage_wear_ratio):
# loaded_model = joblib.load('vehicle_maintenance_improved_model_xgboost.pkl')
# predictions = loaded_model.predict(new_data[feature_cols])


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy: 0.71
Best Parameters: {'subsample': 0.9, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2, 'colsample_bytree': 1.0}
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.80      0.73        10
           1       0.78      0.64      0.70        11

    accuracy                           0.71        21
   macro avg       0.72      0.72      0.71        21
weighted avg       0.72      0.71      0.71        21

Confusion Matrix:
[[8 2]
 [4 7]]


['vehicle_maintenance_improved_model_xgboost.pkl']