In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [31]:
df = pd.read_csv('train.csv')

In [32]:
df.drop(columns=['id', 'Unnamed: 0'], inplace=True)

In [33]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Type of Travel', 'Gender', 'Customer Type', 'Class', 'satisfaction']
encoding_dict = {}
le = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])
    encoding_dict[col] = dict(zip(le.classes_, range(len(le.classes_))))
print("\nEncoding Mappings:")
for col, mapping in encoding_dict.items():
    print(f"{col}: {mapping}")


Encoding Mappings:
Type of Travel: {'Business travel': 0, 'Personal Travel': 1}
Gender: {'Female': 0, 'Male': 1}
Customer Type: {'Loyal Customer': 0, 'disloyal Customer': 1}
Class: {'Business': 0, 'Eco': 1, 'Eco Plus': 2}
satisfaction: {'neutral or dissatisfied': 0, 'satisfied': 1}


In [34]:
df.dropna(axis=0, inplace=True)

In [35]:
from itertools import combinations

def create_interactions(df, feature_list):
    for feat1, feat2, feat3 in combinations(feature_list, 3):
        df[f'{feat1}_x_{feat2}'] = (df[feat1] ** 2) * (df[feat2] ** 4)
        df[f'{feat1}_x_{feat2}_x_{feat3}'] = (df[feat1] ** 4) * (df[feat2] ** 4) * (df[feat3] ** 3)
    return df

In [36]:
def remove_collinearity_with_target(df, target_col, threshold=0.90):
    X = df.drop(columns=[target_col])
    target_corr = abs(X.corrwith(df[target_col]))
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = []
    
    for col in upper.columns:
        correlated_features = upper[col][upper[col] > threshold].index
        for feat in correlated_features:
            if target_corr[col] < target_corr[feat]:
                to_drop.append(col)
                break
    
    return df.drop(columns=to_drop)

In [37]:
features = ['Class', 'Type of Travel', 'Seat comfort', 'Inflight entertainment', 'Online boarding', 'Customer Type']
df = create_interactions(df, features)

In [38]:
df = remove_collinearity_with_target(df, 'satisfaction', 0.80)

In [39]:
def remove_outliers(df, method="zscore", threshold=3):
    df_clean = df.copy()
    
    if method == "zscore":
        z_scores = np.abs((df_clean - df_clean.mean()) / df_clean.std())
        df_clean = df_clean[(z_scores < threshold).all(axis=1)]
    
    elif method == "iqr":
        Q1 = df_clean.quantile(0.25)
        Q3 = df_clean.quantile(0.75)
        IQR = Q3 - Q1
        df_clean = df_clean[~((df_clean < (Q1 - 1.5 * IQR)) | (df_clean > (Q3 + 1.5 * IQR))).any(axis=1)]
    
    elif method == "percentile":
        lower, upper = df_clean.quantile(0.01), df_clean.quantile(0.99)
        df_clean = df_clean[(df_clean >= lower) & (df_clean <= upper)].dropna()

    return df_clean


df = remove_outliers(df, method="iqr")


In [40]:
X = df.drop(columns=['satisfaction'])
y = df['satisfaction']

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the baseline model
baseline_model = LogisticRegression(random_state=42, max_iter=100000)
baseline_model.fit(X_train, y_train)

# Make predictions
y_pred = baseline_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, baseline_model.predict_proba(X_test)[:, 1])
report = classification_report(y_test, y_pred)

print(f"Baseline Model Accuracy: {accuracy:.4f}")
print(f"Baseline Model ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(report)

Baseline Model Accuracy: 0.9145
Baseline Model ROC AUC: 0.9646

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1556
           1       0.95      0.93      0.94      3588

    accuracy                           0.91      5144
   macro avg       0.90      0.90      0.90      5144
weighted avg       0.92      0.91      0.91      5144



In [42]:
import os
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Limit CPU usage
os.environ["LOKY_MAX_CPU_COUNT"] = "4"

# Define models and their parameters
lgbm_params = {'n_estimators': 291, 'learning_rate': 0.033236472434062025, 'max_depth': 14, 'num_leaves': 73, 'min_child_samples': 15, 'subsample': 0.6452608030046519, 'colsample_bytree': 0.7351330578836698}
rf_params = {'n_estimators': 383, 'max_depth': 15, 'min_samples_split': 5, 'max_features': 'sqrt', 'criterion': 'entropy'}
cb_params = {'iterations': 485, 'learning_rate': 0.02451154055716572, 'depth': 8, 'l2_leaf_reg': 1.312879755477539}
xgb_params = {'max_depth': 15, 'learning_rate': 0.12204550084705108, 'n_estimators': 61, 'min_child_weight': 1.9190931783540948, 'subsample': 0.9329682809203258, 'colsample_bytree': 0.7196095681095573, 'gamma': 2.3019670579460234}

models = {
    'XGBoost': xgb.XGBClassifier(**xgb_params, random_state=42),
    'Random Forest': RandomForestClassifier(**rf_params, random_state=42),
    'LightGBM': lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1),
    'CatBoost': CatBoostClassifier(**cb_params, random_seed=42, verbose=False)
}

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Dictionary to store results
results = {'Model': [], 'Accuracy': [], 'ROC AUC': [], 'R2 Score': []}

# Perform cross-validation
for name, model in models.items():
    print(f"\n{'-'*50}")
    print(f"{name} Cross-Validation Results:")
    
    acc_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    roc_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    r2_scores = cross_val_score(model, X, y, cv=cv, scoring='r2', n_jobs=-1)
    
    results['Model'].append(name)
    results['Accuracy'].append(np.mean(acc_scores))
    results['ROC AUC'].append(np.mean(roc_scores))
    results['R2 Score'].append(np.mean(r2_scores))
    
    print(f"Accuracy: {np.mean(acc_scores):.4f} (+/- {np.std(acc_scores):.4f})")
    print(f"ROC AUC: {np.mean(roc_scores):.4f} (+/- {np.std(roc_scores):.4f})")
    print(f"R2 Score: {np.mean(r2_scores):.4f} (+/- {np.std(r2_scores):.4f})")

# Create comparison DataFrame
models_comparison = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

print("\n" + "="*50)
print("Models Comparison:")
print(models_comparison.to_string(index=False))


--------------------------------------------------
XGBoost Cross-Validation Results:
Accuracy: 0.9851 (+/- 0.0020)
ROC AUC: 0.9982 (+/- 0.0002)
R2 Score: 0.9292 (+/- 0.0093)

--------------------------------------------------
Random Forest Cross-Validation Results:
Accuracy: 0.9794 (+/- 0.0040)
ROC AUC: 0.9970 (+/- 0.0005)
R2 Score: 0.9022 (+/- 0.0190)

--------------------------------------------------
LightGBM Cross-Validation Results:
Accuracy: 0.9860 (+/- 0.0023)
ROC AUC: 0.9983 (+/- 0.0001)
R2 Score: 0.9339 (+/- 0.0108)

--------------------------------------------------
CatBoost Cross-Validation Results:
Accuracy: 0.9865 (+/- 0.0028)
ROC AUC: 0.9987 (+/- 0.0002)
R2 Score: 0.9362 (+/- 0.0133)

Models Comparison:
        Model  Accuracy  ROC AUC  R2 Score
     CatBoost  0.986547 0.998748  0.936247
     LightGBM  0.986041 0.998316  0.933852
      XGBoost  0.985069 0.998163  0.929246
Random Forest  0.979354 0.996961  0.902159
