In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### ----------------------------
##### 📅 2. Load Datasets
##### ----------------------------

In [None]:
sensor_df = pd.read_csv('../data/Raw/camper_sensor_logs.csv', parse_dates=['timestamp'])
maint_df = pd.read_csv('../data/Raw/maintenance_logs.csv', parse_dates=['service_date'])

In [4]:
sensor_df.head()

Unnamed: 0,log_id,camper_id,timestamp,engine_temp_c,oil_level_pct,tire_pressure_psi,battery_voltage
0,SNS000000,RV00302,2024-09-24 23:55:43,89.53,90.7,34.2,13.13
1,SNS000001,RV00376,2025-01-20 21:45:24,88.18,46.4,40.9,12.71
2,SNS000002,RV00260,2023-10-25 19:04:43,90.88,54.8,37.2,12.49
3,SNS000003,RV00937,2023-07-05 21:31:53,90.85,75.9,32.2,13.95
4,SNS000004,RV00791,2024-07-10 17:40:14,98.29,82.2,37.5,12.78


In [5]:
maint_df.head()

Unnamed: 0,maintenance_id,camper_id,service_date,service_type,cost,location_id
0,MNT00000,RV00893,2023-08-01,Battery Check,889,LOC052
1,MNT00001,RV00024,2025-03-05,Battery Check,542,LOC032
2,MNT00002,RV00091,2024-02-22,Tire Replacement,322,LOC076
3,MNT00003,RV00205,2023-06-18,Brake Check,798,LOC017
4,MNT00004,RV00127,2024-04-09,Oil Change,463,LOC059


# ----------------------------
##### 🛠️ 3. Feature Engineering
# ----------------------------

In [6]:
# Prepare recent maintenance labels --using latest sensor logs[last 180 days]

cutoff_date = sensor_df['timestamp'].max() - pd.Timedelta(days=180)
maint_recent = maint_df[maint_df['service_date'] >= cutoff_date].copy()
maint_recent.loc[:, 'maintenance_flag'] = 1

In [7]:
# latest sensor logs
latest_sensor = sensor_df.sort_values('timestamp').groupby('camper_id').tail(1).copy()

In [8]:
# Merge to recent maintenence
labeled = latest_sensor.merge(maint_recent[['camper_id', 'maintenance_flag']], on='camper_id', how='left')
#--  0 label --means campers that did not undergo maintenance in last 180 days
labeled['maintenance_flag'] = labeled['maintenance_flag'].fillna(0).astype(int)

In [9]:
labeled['maintenance_flag'].value_counts()

Unnamed: 0_level_0,count
maintenance_flag,Unnamed: 1_level_1
1,1592
0,201


In [10]:
# Balance the dataset
maint_count = labeled['maintenance_flag'].sum()
unserviced_pool = labeled[labeled['maintenance_flag'] == 0]


In [11]:
# Adjust sample size if needed
sample_size = min(maint_count, len(unserviced_pool))
unserviced_sample = unserviced_pool.sample(n=sample_size, random_state=42)
maintained_sample = labeled[labeled['maintenance_flag'] == 1]
balanced_df = pd.concat([unserviced_sample, maintained_sample])

##### ----------------------------
##### 📊 4. Exploratory Analysis
##### ----------------------------

In [12]:

for col in ['engine_temp_c', 'oil_level_pct','tire_pressure_psi', 'battery_voltage']:
  fig = px.box(balanced_df, x='maintenance_flag', y='engine_temp_c',
             title=f'{col} vs Maintenance Requirement')
  fig.update_layout(template='plotly_white')
  fig.show()

In [13]:
for col in ['engine_temp_c', 'oil_level_pct','tire_pressure_psi', 'battery_voltage']:
  fig = px.histogram(balanced_df, x=col, color='maintenance_flag',marginal='box',
                     title=f'Distribution of {col} by Maintenance Flag')
  fig.update_layout(template='plotly_white')
  fig.show()

In [14]:
# Train test split

features = ['engine_temp_c', 'oil_level_pct','tire_pressure_psi', 'battery_voltage']
X = balanced_df[features]
y = balanced_df['maintenance_flag']

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, stratify=y, random_state=42)

In [15]:
# 6. Train models & select best via ROC AUC
models = {
    'Logistic Regression': Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())]),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42)
}

best_model_name, best_auc, best_model = None, 0, None
for name, model in models.items():
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(f"{name} ROC AUC: {auc:.4f}")
    if auc > best_auc:
        best_model_name, best_auc, best_model = name, auc, model

print(f"\n✅ Best model selected: {best_model_name} with ROC AUC = {best_auc:.4f}")


Logistic Regression ROC AUC: 0.5105
Random Forest ROC AUC: 0.7864
XGBoost ROC AUC: 0.6371

✅ Best model selected: Random Forest with ROC AUC = 0.7864


In [16]:
# Feature Importance -- towards maintanance prediction
if best_model_name:
  print(f"✅ Best model selected: {best_model_name} with ROC AUC = {best_auc:.4f}")

  # Feature Importance
  if hasattr(best_model, 'feature_importances_'):
    feat_imp = pd.Series(best_model.feature_importances_, index=X.columns).sort_values()
    fig = px.bar(feat_imp, x=feat_imp.values, y=feat_imp.index)
    fig.update_layout(
        template='plotly_white',
        xaxis_title='Importance',
        yaxis_title=''
    )
    fig.show()

✅ Best model selected: Random Forest with ROC AUC = 0.7864


In [17]:
# Predict risk across All latest sensor logs
latest_logs = latest_sensor.copy()
latest_logs['breakdown_risk'] = best_model.predict_proba(latest_logs[features])[:, 1]


In [18]:
# Filter for at-risk unserviced/unmaintained campers[last 3 months]
at_risk = latest_logs[~latest_logs['camper_id'].isin(maint_recent['camper_id'])]

# predicted similar to ones that underwent service -- means sensors log[s] are of breakdown
at_risk = at_risk = at_risk[at_risk['breakdown_risk'] > 0.8].copy()
at_risk = at_risk[['camper_id', 'breakdown_risk']+ features]
at_risk.sort_values(by='breakdown_risk', ascending=False, inplace=True)


In [19]:
# Campers -> To be Scheduled for maintenance
print('🚨 Campers at Risk of Breakdown:')
at_risk.head()

🚨 Campers at Risk of Breakdown:


Unnamed: 0,camper_id,breakdown_risk,engine_temp_c,oil_level_pct,tire_pressure_psi,battery_voltage
95376,RV00216,0.97,97.26,65.8,36.5,12.84
43828,RV00661,0.97,94.84,32.8,32.1,12.14
33427,RV00365,0.96,86.52,50.9,31.4,12.48
65551,RV00445,0.96,88.03,75.1,35.8,12.42
57853,RV00757,0.95,89.51,50.8,40.5,12.85


In [None]:
#  Send To maintenance Team

at_risk.to_csv('../data/at_risk_breakdown_campers.csv', index=False)
print(f"✅ Successfully saved breakdown risk campers for Maintenance team: {at_risk.shape[0]} campers")

✅ Successfully saved breakdown risk campers for Maintenance team: 24 campers
