# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    f1_score,
    roc_auc_score
)

# Load Dataset

In [2]:
# Read the preprocessed project-risk table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where the file exists at exactly this location.
dataset = pd.read_csv(
    filepath_or_buffer=r"D:\AI course Tamil\CapstoneProject\2.Data Preprocessing\Preprocessed_project_risk.csv",
)

# Separate target and features

In [3]:
# Split the table into the label column and everything else:
# `y` holds the "Risk_Level" labels, `X` holds all remaining columns.
y = dataset["Risk_Level"]
X = dataset.drop(labels=["Risk_Level"], axis="columns")

# Encode categorical features

In [4]:
# One-hot encode the non-numeric feature columns; `drop_first=True` omits the
# first level of each encoded column so the indicators are not redundant.
X_encoded = pd.get_dummies(X, drop_first=True)

# Cast boolean columns to integer 0/1 explicitly. The previous
# `replace({True: 1, False: 0})` only produced integers because pandas silently
# downcast the replaced columns, a behaviour that is deprecated and emits a
# FutureWarning on recent pandas releases. When there are no boolean columns
# this is a no-op that leaves the frame unchanged.
bool_columns = X_encoded.select_dtypes(include="bool").columns
X_encoded = X_encoded.astype({column: "int64" for column in bool_columns})

# Train test split

In [7]:
# Hold out one third of the rows for testing. Stratifying on `y` is intended
# to keep the class proportions similar in both partitions, and the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=1 / 3, stratify=y, random_state=42
)

# Build the PIPELINE

In [8]:
# Chain preprocessing and the classifier into one estimator so the same
# transformations are applied at fit time and at predict time.
# The feature counts in the step comments are the original author's notes.
pipe = Pipeline(
    steps=[
        # Remove low-variance features (threshold 0.01): 50 → 48
        ("variance", VarianceThreshold(threshold=0.01)),
        # Standardise the remaining features
        ("scaler", StandardScaler()),
        # Project onto 18 principal components: 48 → 18
        ("pca", PCA(n_components=18)),
        # Final classifier, seeded for repeatability
        ("lr", LogisticRegression(random_state=0)),
    ]
)

# Train the model

In [9]:
# Fit every pipeline step on the training partition only; as the last
# expression of the cell, the fitted pipeline is also displayed.
pipe.fit(X=X_train, y=y_train)

# Prediction

In [10]:
# Score the held-out partition: per-class probabilities (used for ROC AUC)
# and hard class labels (used for the confusion matrix / report / F1).
y_prob = pipe.predict_proba(X_test)
y_pred = pipe.predict(X_test)

# Model Evaluation

In [12]:
# --- Compute the metrics -----------------------------------------------------
cm = confusion_matrix(y_test, y_pred)
# Weighted F1: per-class F1 averaged by class support.
f1 = f1_score(y_test, y_pred, average="weighted")
# Multiclass ROC AUC from the probability matrix, one-vs-one, weighted average.
roc_auc = roc_auc_score(y_test, y_prob, average="weighted", multi_class="ovo")

# --- Report them -------------------------------------------------------------
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("Weighted F1 Score:", f1)
print("ROC AUC:", roc_auc)


Confusion Matrix:
 [[133  64   1  56]
 [ 74 102  13 156]
 [  3   6 121 139]
 [ 28  62  61 315]]

Classification Report:

              precision    recall  f1-score   support

    Critical       0.56      0.52      0.54       254
        High       0.44      0.30      0.35       345
         Low       0.62      0.45      0.52       269
      Medium       0.47      0.68      0.56       466

    accuracy                           0.50      1334
   macro avg       0.52      0.49      0.49      1334
weighted avg       0.51      0.50      0.49      1334

Weighted F1 Score: 0.4934194873122126
ROC AUC: 0.7660696277096924


# Save the ENTIRE PIPELINE

In [15]:
# Persist the fitted pipeline (preprocessing steps + classifier) as one pickle.
# The context manager guarantees the file is flushed and closed as soon as the
# dump finishes, instead of leaving an open handle for the garbage collector.
with open("final_pipeline_logistic_pca.sav", "wb") as model_file:
    pickle.dump(pipe, model_file)

# Load model

In [16]:
# Restore the pickled pipeline from disk; the context manager closes the file
# handle deterministically once loading is done.
# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open("final_pipeline_logistic_pca.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [17]:
# One hand-written observation for a smoke test of the reloaded pipeline.
# The 50 values are positional: the i-th value is assigned to the i-th column
# of X_encoded, so the order must match the training feature order.
sample_input = pd.DataFrame(
    data=[[
        32, 1526276.55, 32, 9.7, 16, 3, 3, 1.05, 0.16, 0.84,
        2, 1.9, 4, 0, 0.82, 0.55, 2.66, 0.98, 0.29, 6,
        0.8, 0, 10, 5, 1, 0, 0, 4, 3, 4,
        0, 3, 0, 3, 0, 1, 3, 2, 2, 2,
        2, 0, 3, 0, 0, 0, 2, 2, 0, 2,
    ]],
    columns=X_encoded.columns,
)

In [21]:
# Display the class labels known to the reloaded pipeline; the next cell pairs
# these labels position-by-position with the predict_proba output.
loaded_model.classes_

array(['Critical', 'High', 'Low', 'Medium'], dtype=object)

In [22]:
# Pair each class label with its predicted probability for the sample row.
# predict_proba returns one row per input row; [0] selects the only row.
labels = loaded_model.classes_
probs = loaded_model.predict_proba(sample_input)[0]

prob_with_labels = {label: prob for label, prob in zip(labels, probs)}
prob_with_labels

{'Critical': 0.1721917054825568,
 'High': 0.30753947527861497,
 'Low': 0.09635186995117563,
 'Medium': 0.4239169492876526}

In [23]:
# Hard class prediction for the sample row (an array with a single label).
prediction = loaded_model.predict(sample_input)
print(f"Predicted Class: {prediction}")

Predicted Class: ['Medium']
