In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as cb

In [2]:
# Load the failure data and correctly identify 'not failed' months by filling NaN values
failure_data = pd.read_csv('failure_data_masked.csv')
failure_data['recordmonth'] = pd.to_datetime(failure_data['recordmonth'])
failure_data['failurereason'] = failure_data['failurereason'].fillna('not failed')

In [3]:
def create_and_baseline_features(file_path, id_cols, failure_data, chunksize):
    """
    Loads sensor data in chunks, merges with failure data, computes a per-wheel baseline,
    and returns the full DataFrame. This is memory-efficient.
    """
    print(f"Processing {file_path} in chunks for merge and baseline...")
    
    lean_failure_data = failure_data[id_cols + ['recordmonth', 'failurereason']]
    merged_chunks = []
    
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        chunk['recordmonth'] = pd.to_datetime(chunk['traindate']).dt.to_period('M').dt.to_timestamp()
        merged_chunk = pd.merge(chunk, lean_failure_data, on=id_cols + ['recordmonth'], how='left')
        merged_chunks.append(merged_chunk)

    df = pd.concat(merged_chunks, ignore_index=True)
    numeric_cols = df.select_dtypes(include=np.number).columns
    numeric_features = [col for col in numeric_cols if col not in id_cols and col != 'traindate']
    baseline_df = df[df['failurereason'] == 'not failed'].copy()
    baseline_means = baseline_df.groupby(id_cols)[numeric_features].mean()
    df = df.drop(columns=['failurereason'])
    
    return df, baseline_means

In [4]:

def create_anomaly_features(df, baseline_df, id_cols):
    """
    Computes anomaly features by comparing monthly readings to a per-wheel baseline.
    """
    print(f"Creating anomaly features for {id_cols} data...")
    df_with_baseline = pd.merge(df, baseline_df, on=id_cols, how='left', suffixes=('', '_baseline'))
    numeric_features = [col for col in baseline_df.columns if col not in id_cols]
    anomaly_values = []
    
    for name, group in df_with_baseline.groupby(id_cols):
        group_anomalies = {}
        for col in numeric_features:
            baseline_col = f"{col}_baseline"
            if baseline_col in group.columns and not group[baseline_col].isnull().all():
                deviation = (group[col] - group[baseline_col]).abs().dropna()
                
                if not deviation.empty:
                    monthly_anomalies_idx = deviation.groupby(group['recordmonth']).idxmax().dropna()
                    
                    if not monthly_anomalies_idx.empty:
                        monthly_anomalies = group.loc[monthly_anomalies_idx]
                        group_anomalies[f"{col}_anomaly"] = monthly_anomalies.set_index('recordmonth')[col].abs()
        
        if group_anomalies:
            anomalies_df = pd.DataFrame(group_anomalies).reset_index()
            for col in id_cols:
                anomalies_df[col] = name[id_cols.index(col)]
            anomaly_values.append(anomalies_df)

    if anomaly_values:
        return pd.concat(anomaly_values, ignore_index=True)
    else:
        return pd.DataFrame()


In [5]:
def create_aggregated_features(file_path, id_cols, agg_config, chunksize):
    """
    Loads a sensor data file in chunks and creates aggregated features.
    """
    print(f"Processing {file_path} in chunks...")
    all_chunks_list = []
    
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        all_chunks_list.append(chunk)

    df = pd.concat(all_chunks_list, ignore_index=True)
    df['recordmonth'] = pd.to_datetime(df['traindate']).dt.to_period('M').dt.to_timestamp()
    result = df.groupby(id_cols + ['recordmonth']).agg(agg_config)
    result.columns = ['_'.join(col).strip() for col in result.columns.values]
    
    return result.reset_index()


In [6]:
def create_lag_features(df, id_cols, numeric_cols, n_lags=6):
    """
    Creates lag features by building a list of new columns and concatenating at the end.
    This avoids DataFrame fragmentation and improves performance.
    """
    print("Creating lag features...")
    df = df.sort_values(by=id_cols + ['recordmonth'])
    
    new_lag_cols = []
    
    for col in numeric_cols:
        for lag in range(1, n_lags + 1):
            lag_col_name = f"{col}_lag{lag}"
            shifted_series = df.groupby(id_cols)[col].shift(lag)
            new_lag_cols.append(shifted_series.rename(lag_col_name))
    
    if new_lag_cols:
        lags_df = pd.concat(new_lag_cols, axis=1)
        df = pd.concat([df, lags_df], axis=1)
    
    return df


In [7]:

# Define the unique identifiers for each dataset and chunk size
wild_id = ['equipmentnumber', 'truck', 'axle', 'side']
wpd_id = ['equipmentnumber', 'truck', 'axle', 'side']
thd_id = ['equipmentnumber', 'truck']
CHUNK_SIZE = 100000

# --- Feature Engineering Step 1: Anomaly Features ---
wild_df, wild_baseline = create_and_baseline_features('wild_data_masked.csv', wild_id, failure_data, CHUNK_SIZE)
wpd_df, wpd_baseline = create_and_baseline_features('wpd_data_masked.csv', wpd_id, failure_data, CHUNK_SIZE)
thd_df, thd_baseline = create_and_baseline_features('thd_data_masked.csv', thd_id, failure_data, CHUNK_SIZE)

wild_anomalies = create_anomaly_features(wild_df, wild_baseline.reset_index(), wild_id)
wpd_anomalies = create_anomaly_features(wpd_df, wpd_baseline.reset_index(), wpd_id)
thd_anomalies = create_anomaly_features(thd_df, thd_baseline.reset_index(), thd_id)

all_anomalies = pd.merge(wpd_anomalies, thd_anomalies, on=['recordmonth', 'equipmentnumber', 'truck'], how='outer')
all_anomalies = pd.merge(all_anomalies, wild_anomalies, on=['recordmonth', 'equipmentnumber', 'truck', 'axle', 'side'], how='outer')

Processing wild_data_masked.csv in chunks for merge and baseline...
Processing wpd_data_masked.csv in chunks for merge and baseline...
Processing thd_data_masked.csv in chunks for merge and baseline...
Creating anomaly features for ['equipmentnumber', 'truck', 'axle', 'side'] data...
Creating anomaly features for ['equipmentnumber', 'truck', 'axle', 'side'] data...
Creating anomaly features for ['equipmentnumber', 'truck'] data...


In [8]:

# --- Feature Engineering Step 2: Aggregated Features ---
best_features = ['dynamicvertical', 'flangethickness', 'dynamicratio', 'flangeheight', 'rimthickness', 'huntingindex']
wild_features = [f for f in ['dynamicvertical', 'dynamicratio'] if f in best_features]
wpd_features = [f for f in ['flangethickness', 'flangeheight', 'rimthickness'] if f in best_features]
thd_features = [f for f in ['huntingindex'] if f in best_features]

wild_agg_config = {f: ['mean', 'max', 'std', 'min', 'median'] for f in wild_features}
wpd_agg_config = {f: ['mean', 'max', 'std', 'min', 'median'] for f in wpd_features}
thd_agg_config = {f: ['mean', 'max', 'std', 'min', 'median'] for f in thd_features}

wild_agg = create_aggregated_features('wild_data_masked.csv', wild_id, wild_agg_config, CHUNK_SIZE)
wpd_agg = create_aggregated_features('wpd_data_masked.csv', wpd_id, wpd_agg_config, CHUNK_SIZE)
thd_agg = create_aggregated_features('thd_data_masked.csv', thd_id, thd_agg_config, CHUNK_SIZE)

historical_features = pd.merge(wpd_agg, thd_agg, on=['recordmonth', 'equipmentnumber', 'truck'], how='outer')
historical_features = pd.merge(historical_features, wild_agg, on=['recordmonth', 'equipmentnumber', 'truck', 'axle', 'side'], how='outer')

Processing wild_data_masked.csv in chunks...
Processing wpd_data_masked.csv in chunks...
Processing thd_data_masked.csv in chunks...


In [9]:

# --- Data Merging and Preparation ---
targets_df = pd.concat([pd.read_csv('failure_data_masked.csv'), pd.read_csv('failure_data_masked_test.csv')], ignore_index=True)
targets_df['recordmonth'] = pd.to_datetime(targets_df['recordmonth'])
targets_df['id'] = targets_df.index
targets_df['source'] = ['train'] * len(pd.read_csv('failure_data_masked.csv')) + ['test'] * len(pd.read_csv('failure_data_masked_test.csv'))

# Merge anomalies and aggregated features with targets_df
merged_df = pd.merge(targets_df, all_anomalies, on=['recordmonth', 'equipmentnumber', 'truck', 'axle', 'side'], how='left')
merged_df = pd.merge(merged_df, historical_features, on=['recordmonth', 'equipmentnumber', 'truck', 'axle', 'side'], how='left')
print("Merged all datasets.")


Merged all datasets.


In [10]:

# --- Lag Feature Creation ---
numeric_feature_cols = [col for col in merged_df.columns if any(f in col for f in ['_anomaly', '_mean', '_max', '_std', '_min', '_median'])]
id_cols_for_lags = ['equipmentnumber', 'truck', 'axle', 'side']
merged_df = create_lag_features(merged_df, id_cols_for_lags, numeric_feature_cols, n_lags=6)

# Fix for DataFrame fragmentation before preparing for modeling
merged_df = merged_df.copy()


Creating lag features...


In [11]:
# Prepare data for modeling
merged_df['failurereason'] = merged_df['failurereason'].fillna('not failed')
label_map = {'high flange': 0, 'high impact': 1, 'thin flange': 2, 'other': 3, 'not failed': 4}
merged_df['target'] = merged_df['failurereason'].map(label_map)

train_df = merged_df[merged_df['source'] == 'train'].copy()
test_df = merged_df[merged_df['source'] == 'test'].copy()

all_feature_cols = [col for col in merged_df.columns if '_anomaly' in col or any(f in col for f in ['_mean', '_max', '_std', '_min', '_median']) or '_lag' in col or col == 'partmileage']

X = train_df[all_feature_cols].fillna(0)
y = train_df['target']
X_test = test_df[all_feature_cols].fillna(0)

print(f"Final data shape for training: {X.shape}")

Final data shape for training: (1550826, 400)


In [12]:
# --- Model Training and Prediction ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nStarting LightGBM training...")
model = lgb.LGBMClassifier(
    objective='multiclass',
    metric='multi_logloss',
    num_class=5,
    n_estimators=3000,
    learning_rate=0.01,
    num_leaves = 64,
    seed=42,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_child_samples=20,
    reg_alpha=1.0,
    reg_lambda=1.0,
    n_jobs=-1,
    verbose=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(100, verbose=True)]
)



🚀 Starting LightGBM training...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[831]	valid_0's multi_logloss: 0.0282583


In [13]:
print("--- Model Evaluation ---")
val_preds = model.predict_proba(X_val)
loss = log_loss(y_val, val_preds)
print(f"Model training complete.")
print(f"Best Validation Log Loss: {loss:.4f}")
print(f"Best Iteration: {model.best_iteration_}")



--- Model Evaluation ---
Model training complete.
Best Validation Log Loss: 0.0283
Best Iteration: 831


In [14]:
print("\n Generating predictions on the test set...")
predictions = model.predict_proba(X_test)

label_map = {'high flange': 0, 'high impact': 1, 'thin flange': 2, 'other': 3, 'not failed': 4}
inv_map = {v: k for k, v in label_map.items()}
submission_cols = [inv_map[i] for i in range(5)]

submission_df = pd.DataFrame(predictions, columns=submission_cols)
submission_df.insert(0, "ID", test_df['id'].values)

submission_df = submission_df.sort_values(by='ID')
#Initially when defining this id column I have directly used concat so id will start after no of training rows which are 1550826
submission_df["ID"] = submission_df["ID"] - 1550826
submission_df.to_csv("finalsubmission1.csv", index=False)
print("\nfinalsubmission1.csv' created and sorted by ID.")


🧠 Generating predictions on the test set...

finalsubmission1.csv' created and sorted by ID.
