In [None]:
# 🔧 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score

# 🧼 Load Dataset
# `df` keeps the raw file contents untouched; every later step works on `data`.
df = pd.read_csv("kiranaRO_train.csv")

# 🧹 Drop Irrelevant Columns
# drop() without inplace returns a new frame, so `df` is not modified.
data = df.drop(columns=['InvoiceNo', 'StockCode', 'Description'])

# 🗓️ Feature Engineering on InvoiceDate
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
# Split the timestamp into month / day-of-month / hour columns, then discard
# the raw datetime column itself.
data = data.assign(
    InvoiceMonth=data['InvoiceDate'].dt.month,
    InvoiceDay=data['InvoiceDate'].dt.day,
    InvoiceHour=data['InvoiceDate'].dt.hour,
).drop(columns='InvoiceDate')

# 🛠️ Missing Value Imputation
# Columns are grouped by dtype: int64/float64 get the median, object columns
# get the most frequent value.
# NOTE(review): select_dtypes matches these exact dtypes only, so numeric
# columns of another width (e.g. int32) land in neither list — confirm that
# the engineered Invoice* columns are covered on the pandas version in use.
num_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = data.select_dtypes(include=['object']).columns.tolist()

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# SimpleImputer rejects input with zero features, so skip a group that has
# no columns instead of crashing.
if num_cols:
    data[num_cols] = num_imputer.fit_transform(data[num_cols])
if cat_cols:
    data[cat_cols] = cat_imputer.fit_transform(data[cat_cols])

# 🔤 Encode Categorical Variables
# One encoder per column, kept in `label_encoders`, so every integer code can
# still be mapped back to its original label later (a single shared encoder
# would only remember the last column it was fit on).
le = LabelEncoder()  # keeps `le` defined even when there are no categorical columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 🚫 Remove Outliers with Isolation Forest
# fit_predict labels each row 1 (inlier) or -1 (outlier); with
# contamination=0.02 roughly 2% of the rows are flagged and discarded.
# NOTE(review): 'Quantity' is presumably still part of num_cols at this point,
# so the column the target is derived from also drives outlier removal —
# confirm this is intended.
iso = IsolationForest(contamination=0.02, random_state=42)
outlier_pred = iso.fit_predict(data[num_cols])
# Take an explicit copy: the filtered frame is modified below (new column and
# in-place drop), which otherwise raises SettingWithCopyWarning on pandas
# versions without Copy-on-Write.
data = data[outlier_pred == 1].copy()

# 🎯 Create Target Variable
# target = 1 when Quantity is above the 95th percentile of the remaining rows.
threshold = data['Quantity'].quantile(0.95)
data['target'] = (data['Quantity'] > threshold).astype(int)
# Quantity defines the label, so it must not stay among the features.
data.drop(columns=['Quantity'], inplace=True)

# 🧪 Train-Test Split
X = data.drop(columns='target')
y = data['target']

# Split BEFORE scaling so that the scaler's mean/std are learned from the
# training rows only; fitting on the full data leaks test-set statistics into
# training. The split itself depends only on the row count, y and random_state,
# so the same rows end up in train/test as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that later code referring to `X_scaled` keeps working; it is now the
# full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# 🤖 Models
# Display name -> unfitted estimator; each one is fit and scored in turn below.
# NOTE(review): the target is the top 5% of Quantity, so classes are heavily
# imbalanced and a model may predict the majority class only — consider
# class_weight='balanced' (where supported) or a tuned decision threshold.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# 📊 Train & Evaluate
for name, model in models.items():
    print(f"\n🚀 Model: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the positive class (column 1), needed for AUC-ROC.
    y_prob = model.predict_proba(X_test)[:, 1]

    # zero_division=0 reports 0 for metrics that are undefined when a class is
    # never predicted — the same value as the default, but without emitting
    # an UndefinedMetricWarning for every affected metric.
    print("F1 Score:", f1_score(y_test, y_pred, zero_division=0))
    print("AUC-ROC:", roc_auc_score(y_test, y_prob))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))






🚀 Model: Logistic Regression
F1 Score: 0.0
AUC-ROC: 0.7495481345244169
Confusion Matrix:
 [[80957     0]
 [ 4016     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98     80957
           1       0.00      0.00      0.00      4016

    accuracy                           0.95     84973
   macro avg       0.48      0.50      0.49     84973
weighted avg       0.91      0.95      0.93     84973


🚀 Model: Random Forest


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Score: 0.4949308755760368
AUC-ROC: 0.9063962906480234
Confusion Matrix:
 [[80074   883]
 [ 2405  1611]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     80957
           1       0.65      0.40      0.49      4016

    accuracy                           0.96     84973
   macro avg       0.81      0.70      0.74     84973
weighted avg       0.96      0.96      0.96     84973


🚀 Model: Gradient Boosting
