<a href="https://colab.research.google.com/github/Siva1202/Cgpa_calculator/blob/master/Anomaly_detection_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Step 1: Read the raw training data
file_path = "final_tamilnadu_train_dataset.csv"  # Change this if your file name is different
df = pd.read_csv(file_path)

# Step 2: Integer-encode each categorical column, keeping one fitted encoder per column
categorical_features = ["Payment_Method", "Device_Used", "Delivery_Location"]
label_encoders = {name: LabelEncoder() for name in categorical_features}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Persist the fitted encoders so the same category -> integer mapping can be reapplied later
with open("label_encoders.pkl", "wb") as encoders_file:
    pickle.dump(label_encoders, encoders_file)

# Step 3: Rescale Transaction_Amount with a MinMaxScaler fitted on this dataset
scaler = MinMaxScaler()
scaled_amount = scaler.fit_transform(df[["Transaction_Amount"]])
df["Transaction_Amount"] = scaled_amount

# Persist the fitted scaler alongside the encoders
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

# Step 4: Write the encoded and scaled frame for the next stage
df.to_csv("preprocessed_train_dataset.csv", index=False)

# Report what this cell produced
for message in (
    "✅ Preprocessing Completed!",
    "📂 Files Saved:",
    "- preprocessed_train_dataset.csv",
    "- label_encoders.pkl",
    "- scaler.pkl",
):
    print(message)


✅ Preprocessing Completed!
📂 Files Saved:
- preprocessed_train_dataset.csv
- label_encoders.pkl
- scaler.pkl


In [2]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Step 1: Load the preprocessed dataset written by the preprocessing cell
file_path = "preprocessed_train_dataset.csv"
df = pd.read_csv(file_path)

# Step 2: Encode User_ID (the other categorical columns were encoded in the previous stage)
user_id_encoder = LabelEncoder()
df["User_ID"] = user_id_encoder.fit_transform(df["User_ID"])

# Save User_ID Encoder
with open("user_id_encoder.pkl", "wb") as f:
    pickle.dump(user_id_encoder, f)

# Step 3: Feature Engineering - Compute Aggregates
# Every entry of feature_stats is a Series indexed by the encoded User_ID.
# The per-user "most frequent" payment method and device are stored as well, so
# the full behavioural baseline of each user is persisted and not just the
# amount statistics. The two extra keys are additive; existing keys are unchanged.
# mode()[0] takes the first value returned by Series.mode() when several tie.
user_groups = df.groupby("User_ID")
feature_stats = {
    "avg_transaction_amount": user_groups["Transaction_Amount"].mean(),
    "transaction_frequency": user_groups["User_ID"].count(),
    "most_frequent_payment_method": user_groups["Payment_Method"].agg(lambda x: x.mode()[0]),
    "most_frequent_device": user_groups["Device_Used"].agg(lambda x: x.mode()[0]),
}

# Apply feature engineering: broadcast each user's statistics back onto their rows
df["Avg_Transaction_Amount"] = df["User_ID"].map(feature_stats["avg_transaction_amount"])
df["Transaction_Frequency"] = df["User_ID"].map(feature_stats["transaction_frequency"])
df["Transaction_Deviation"] = (df["Transaction_Amount"] - df["Avg_Transaction_Amount"]).abs()

# Detect Unusual Behavior: flag rows whose value differs from the user's most frequent one.
# The baseline is compared directly, so no temporary columns are added to df.
usual_payment_method = df["User_ID"].map(feature_stats["most_frequent_payment_method"])
df["Unusual_Payment_Method"] = (df["Payment_Method"] != usual_payment_method).astype(int)

usual_device = df["User_ID"].map(feature_stats["most_frequent_device"])
df["Unusual_Device"] = (df["Device_Used"] != usual_device).astype(int)

# Time-Based Features
# NOTE(review): diff() follows the current row order within each user; the frame is
# not sorted by time here - confirm the CSV is already ordered chronologically.
df["Time_Since_Last_Transaction"] = df.groupby("User_ID")["Order_Time"].diff().fillna(0)
# Presumably Order_Time is an hour of day (values >= 21 or < 6 are treated as night) - TODO confirm
df["Order_Time_Range"] = df["Order_Time"].apply(lambda x: 1 if x >= 21 or x < 6 else 0)  # Night (1), Day (0)

# Save Feature Engineering Statistics for Later Use
with open("feature_engineering.pkl", "wb") as f:
    pickle.dump(feature_stats, f)

# Step 4: Compute Risk Score
def calculate_risk_score(row):
    """Score one transaction row and label its risk.

    Reads "Transaction_Deviation", "Unusual_Payment_Method", "Unusual_Device",
    "Order_Time_Range" and "Time_Since_Last_Transaction" from ``row`` (any
    mapping-like object indexable by those keys, e.g. a DataFrame row).

    Returns a ``pd.Series`` of three values, in order:
    the numeric risk score, the risk category string, and a comma-separated
    list of the triggered anomaly names (or "None" when nothing triggered).
    """
    # Each rule pairs the outcome of its test with the name reported for it.
    rule_outcomes = (
        (row["Transaction_Deviation"] > 0.5, "High Transaction Deviation"),
        (row["Unusual_Payment_Method"] == 1, "Unusual Payment Method"),
        (row["Unusual_Device"] == 1, "Unusual Device"),
        (row["Order_Time_Range"] == 1, "Night Order Time"),
    )
    triggered = [label for fired, label in rule_outcomes if fired]
    flag_count = len(triggered)

    # Weighted sum: number of triggered rules, amount deviation, and time gap.
    score = flag_count * 0.15 + row["Transaction_Deviation"] * 0.3 + row["Time_Since_Last_Transaction"] * 0.2

    # 0, 1 and 2 triggered rules have their own tier; three or more is "High Risk".
    tier_by_count = {0: "No Risk", 1: "Low Risk", 2: "Medium Risk"}
    tier = tier_by_count.get(flag_count, "High Risk")

    if triggered:
        anomaly_summary = ", ".join(triggered)
    else:
        anomaly_summary = "None"

    return pd.Series([score, tier, anomaly_summary])

# Score every row; the three values returned per row become three new columns
risk_columns = ["Risk_Score", "Risk_Category", "Anomaly_Type"]
df[risk_columns] = df.apply(calculate_risk_score, axis=1)

# Write the enriched training data for the next stage
df.to_csv("enhanced_train_dataset_with_risk.csv", index=False)

# Report what this cell produced
for message in (
    "✅ Feature Engineering & Risk-Based Anomaly Detection Completed!",
    "📂 Files Saved:",
    "- enhanced_train_dataset_with_risk.csv",
    "- feature_engineering.pkl",
    "- user_id_encoder.pkl",
):
    print(message)


✅ Feature Engineering & Risk-Based Anomaly Detection Completed!
📂 Files Saved:
- enhanced_train_dataset_with_risk.csv
- feature_engineering.pkl
- user_id_encoder.pkl


In [3]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Load the enhanced dataset with risk scores
df = pd.read_csv("enhanced_train_dataset_with_risk.csv")

# Rows without any anomaly were written with the literal string "None". Recent
# pandas versions list "None" among the default NA tokens of read_csv, so that
# label can come back as NaN. Restore it so the encoder below sees a real class
# and not a missing value. This is a no-op when the string survived the round trip.
df["Anomaly_Type"] = df["Anomaly_Type"].fillna("None")

# Ensure only the known risk categories remain; anything else (including
# missing values) is coerced to "No Risk"
expected_risk_categories = ["No Risk", "Low Risk", "Medium Risk", "High Risk"]
is_known_category = df["Risk_Category"].isin(expected_risk_categories)
df["Risk_Category"] = df["Risk_Category"].where(is_known_category, "No Risk")

# Manually encode Risk_Category so the integer codes follow the severity order
risk_mapping = {
    "No Risk": 0,
    "Low Risk": 1,
    "Medium Risk": 2,
    "High Risk": 3
}
df["Risk_Category"] = df["Risk_Category"].map(risk_mapping)

# Encode Anomaly_Type using LabelEncoder
le = LabelEncoder()
df["Anomaly_Type"] = le.fit_transform(df["Anomaly_Type"])

# Save label encoder for Anomaly_Type
with open("anomaly_type_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Save the final dataset with numerical categorical features
df.to_csv("final_train_dataset.csv", index=False)

# Print confirmation
print("✅ Categorical features converted to numerical values!")
print("📂 Files Saved:")
print("- final_train_dataset.csv")
print("- anomaly_type_encoder.pkl")


✅ Categorical features converted to numerical values!
📂 Files Saved:
- final_train_dataset.csv
- anomaly_type_encoder.pkl


In [4]:
# Step 1: Import Libraries
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report

# Step 2: Load the dataset
file_path = "final_train_dataset.csv"  # Change if needed
df = pd.read_csv(file_path)

# Step 3: Separate features from targets (User_ID is kept as a feature)
features = df.drop(columns=["Risk_Score", "Risk_Category", "Anomaly", "Order_Time","Anomaly_Type"])  # Remove targets and unnecessary columns
target_regression = df["Risk_Score"]
target_classification = df["Risk_Category"]

# NOTE(review): Risk_Score and Risk_Category were produced by a fixed rule over
# columns that are still present in `features` (Transaction_Deviation,
# Unusual_Payment_Method, Unusual_Device, Order_Time_Range,
# Time_Since_Last_Transaction). Near-perfect test metrics therefore mostly show
# that the models reproduce that rule, not that they generalise to unseen fraud.

# Step 4: Train-test split (80% training, 20% testing)
# Features and both targets are split in ONE call so the three stay row-aligned
# by construction, rather than relying on two calls sharing a random_state.
X_train, X_test, y_train_reg, y_test_reg, y_train_cls, y_test_cls = train_test_split(
    features, target_regression, target_classification, test_size=0.2, random_state=42
)

# Step 5: Train Regression Model (Risk Score Prediction)
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train_reg)

# Step 6: Train Classification Model (Risk Category Prediction)
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train_cls)

# Step 7: Make Predictions on the held-out 20%
y_pred_reg = regressor.predict(X_test)
y_pred_cls = classifier.predict(X_test)

# Step 8: Evaluate Regression Model
mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = mse ** 0.5
r2 = r2_score(y_test_reg, y_pred_reg)

# Step 9: Evaluate Classification Model
accuracy = accuracy_score(y_test_cls, y_pred_cls)
classification_rep = classification_report(y_test_cls, y_pred_cls)

# Step 10: Save Models
with open("risk_score_regressor.pkl", "wb") as f:
    pickle.dump(regressor, f)

with open("risk_category_classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)

# Step 11: Display Results
print("✅ Model Training Completed!")
print("\n📊 Regression Model (Risk Score Prediction):")
print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R² Score: {r2:.4f}")

print("\n📊 Classification Model (Risk Category Prediction):")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_rep)

print("\n📂 Models Saved:")
print("- risk_score_regressor.pkl")
print("- risk_category_classifier.pkl")


✅ Model Training Completed!

📊 Regression Model (Risk Score Prediction):
MAE: 0.0032, RMSE: 0.0142, R² Score: 0.9999

📊 Classification Model (Risk Category Prediction):
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2938
           1       1.00      1.00      1.00      3416
           2       1.00      1.00      1.00      1430
           3       1.00      1.00      1.00       218

    accuracy                           1.00      8002
   macro avg       1.00      1.00      1.00      8002
weighted avg       1.00      1.00      1.00      8002


📂 Models Saved:
- risk_score_regressor.pkl
- risk_category_classifier.pkl


In [5]:
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

# Load dataset
df = pd.read_csv("final_train_dataset.csv")

# Define features and targets
X = df.drop(columns=["Risk_Score", "Risk_Category", "Anomaly_Type","Anomaly", "Order_Time"])  # Features
y_reg = df["Risk_Score"]  # Regression target
y_cls = df["Risk_Category"]  # Classification target

# Split features and both targets in one call so all three stay row-aligned
X_train, X_test, y_reg_train, y_reg_test, y_cls_train, y_cls_test = train_test_split(
    X, y_reg, y_cls, test_size=0.2, random_state=42
)

# Train XGBoost Regression Model
xgb_reg = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_reg.fit(X_train, y_reg_train)

# Train XGBoost Classification Model
xgb_cls = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_cls.fit(X_train, y_cls_train)

# Evaluate on the held-out split
reg_preds = xgb_reg.predict(X_test)
cls_preds = xgb_cls.predict(X_test)

print("XGBoost Regression MAE:", mean_absolute_error(y_reg_test, reg_preds))
print("XGBoost Classification Accuracy:", accuracy_score(y_cls_test, cls_preds))

# Save models. Context managers guarantee the files are flushed and closed,
# where a bare open() handle would be left to the garbage collector.
with open("xgb_reg_model.pkl", "wb") as f:
    pickle.dump(xgb_reg, f)

with open("xgb_cls_model.pkl", "wb") as f:
    pickle.dump(xgb_cls, f)


XGBoost Regression MAE: 0.004663468593855192
XGBoost Classification Accuracy: 0.9976255936015996


In [7]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler

# This cell pickles the scaler, so import pickle here and do not rely on an earlier cell.
import pickle

# NOTE(review): X_train, X_test, y_reg_train and y_cls_train are not defined in this
# cell; they are taken from kernel state (presumably the XGBoost cell's split - confirm).

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling are
# repeatable, matching the fixed random_state=42 used for the other models.
keras.utils.set_random_seed(42)

# Drop unnecessary columns before training (no-op if they were already removed)
X_train_filtered = X_train.drop(columns=["Anomaly", "Order_Time"], errors="ignore")
X_test_filtered = X_test.drop(columns=["Anomaly", "Order_Time"], errors="ignore")

# Normalize data. The scaler is fitted on the FILTERED training frame so the dropped
# columns can never leak into the network inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

# Save the scaler (context manager closes the file handle deterministically)
with open("mlp_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Build Regression Model (Risk Score)
mlp_reg = keras.Sequential([
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1)  # Output layer for regression
])
mlp_reg.compile(loss="mse", optimizer="adam")
mlp_reg.fit(X_train_scaled, y_reg_train, epochs=35, batch_size=32, verbose=1)

# Save model
mlp_reg.save("mlp_reg_model.keras")

# Build Classification Model (Risk Category)
mlp_cls = keras.Sequential([
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(4, activation="softmax")  # Output layer for 4 classes
])
# sparse_categorical_crossentropy takes the integer class codes directly as targets
mlp_cls.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
mlp_cls.fit(X_train_scaled, y_cls_train, epochs=20, batch_size=32, verbose=1)

# Save model
mlp_cls.save("mlp_cls_model.keras")


Epoch 1/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.4101
Epoch 2/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0077
Epoch 3/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0035
Epoch 4/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0020
Epoch 5/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0014
Epoch 6/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0010
Epoch 7/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 8.3977e-04
Epoch 8/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 7.7678e-04
Epoch 9/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 7.4457e-04
Epoch 10/35
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [8]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Read the raw dataset that is going to be scored
new_df = pd.read_csv("final_tamilnadu_prediction_dataset.csv")  # Change filename if needed

# Restore the encoders and the scaler that were fitted on the training data
with open("label_encoders.pkl", "rb") as encoders_file:
    label_encoders = pickle.load(encoders_file)

with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Re-apply the training-time label encoding to whichever categorical columns are present
categorical_features = ["Payment_Method", "Device_Used", "Delivery_Location"]
present_categoricals = [name for name in categorical_features if name in new_df.columns]
for name in present_categoricals:
    encoder = label_encoders[name]
    new_df[name] = encoder.transform(new_df[name])

# Re-apply the training-time scaling to the transaction amount, when the column exists
has_amount = "Transaction_Amount" in new_df.columns
if has_amount:
    scaled_amount = scaler.transform(new_df[["Transaction_Amount"]])
    new_df["Transaction_Amount"] = scaled_amount

# Write the preprocessed frame for the feature-engineering stage
new_df.to_csv("preprocessed_test_dataset.csv", index=False)

print("✅ Step 1: Preprocessing Completed! Saved as 'preprocessed_test_dataset.csv'")


✅ Step 1: Preprocessing Completed! Saved as 'preprocessed_test_dataset.csv'


In [9]:
import numpy as np

# Load preprocessed dataset
new_df = pd.read_csv("preprocessed_test_dataset.csv")

# Load user_id encoder and feature stats (both pickled by the training pipeline)
with open("user_id_encoder.pkl", "rb") as f:
    user_id_encoder = pickle.load(f)

with open("feature_engineering.pkl", "rb") as f:
    feature_stats = pickle.load(f)

# Encode User_ID with the training-time encoder (raises on user IDs it has never seen)
new_df["User_ID"] = user_id_encoder.transform(new_df["User_ID"])

# Map the per-user statistics learned on the training data onto the new rows
new_df["Avg_Transaction_Amount"] = new_df["User_ID"].map(feature_stats["avg_transaction_amount"])
new_df["Transaction_Frequency"] = new_df["User_ID"].map(feature_stats["transaction_frequency"])
new_df["Transaction_Deviation"] = (new_df["Transaction_Amount"] - new_df["Avg_Transaction_Amount"]).abs()


def _usual_value_per_user(frame, column, saved_mode):
    """Return, for every row of ``frame``, the user's usual value of ``column``.

    ``saved_mode`` is an optional Series indexed by encoded User_ID holding a
    previously computed per-user most-frequent value. When it is given, it is
    the baseline; users missing from it fall back to the most frequent value
    inside ``frame`` itself. When it is ``None`` the baseline is computed from
    ``frame`` alone.
    """
    batch_mode = frame.groupby("User_ID")[column].transform(lambda x: x.mode()[0])
    if saved_mode is None:
        return batch_mode
    return frame["User_ID"].map(saved_mode).fillna(batch_mode)


# Detect unusual behaviors.
# If the loaded statistics carry a per-user baseline under these keys, compare
# against it, the same way the amount average above comes from the saved
# statistics. Otherwise the baseline is derived from the batch being scored.
usual_payment_method = _usual_value_per_user(
    new_df, "Payment_Method", feature_stats.get("most_frequent_payment_method")
)
new_df["Unusual_Payment_Method"] = (new_df["Payment_Method"] != usual_payment_method).astype(int)

usual_device = _usual_value_per_user(
    new_df, "Device_Used", feature_stats.get("most_frequent_device")
)
new_df["Unusual_Device"] = (new_df["Device_Used"] != usual_device).astype(int)

# Time-Based Features
# NOTE(review): diff() follows the current row order within each user - confirm the
# input is ordered chronologically.
new_df["Time_Since_Last_Transaction"] = new_df.groupby("User_ID")["Order_Time"].diff().fillna(0)
new_df["Order_Time_Range"] = new_df["Order_Time"].apply(lambda x: 1 if x >= 21 or x < 6 else 0)

# Save enhanced dataset
new_df.to_csv("enhanced_test_dataset.csv", index=False)

print("✅ Step 2: Feature Engineering Completed! Saved as 'enhanced_test_dataset.csv'")


✅ Step 2: Feature Engineering Completed! Saved as 'enhanced_test_dataset.csv'


In [10]:
import pandas as pd
import pickle
import xgboost as xgb
import tensorflow as tf
import numpy as np
from scipy.stats import mode

# Load the feature-engineered test dataset
test_df = pd.read_csv("enhanced_test_dataset.csv")

# Load models
# NOTE: pickle.load executes arbitrary code from the file; only load artefacts
# produced by this notebook or another trusted source.
with open("risk_score_regressor.pkl", "rb") as f:
    rf_regressor = pickle.load(f)

with open("risk_category_classifier.pkl", "rb") as f:
    rf_classifier = pickle.load(f)

with open("xgb_reg_model.pkl", "rb") as f:
    xgb_regressor = pickle.load(f)

with open("xgb_cls_model.pkl", "rb") as f:
    xgb_classifier = pickle.load(f)

mlp_reg = tf.keras.models.load_model("mlp_reg_model.keras")
mlp_cls = tf.keras.models.load_model("mlp_cls_model.keras")

# Load scaler for MLP
with open("mlp_scaler.pkl", "rb") as f:
    mlp_scaler = pickle.load(f)

# Load LabelEncoders from a single file
with open("label_encoders.pkl", "rb") as f:
    label_encoders = pickle.load(f)

# Select features for prediction (drop unnecessary columns)
features = test_df.drop(columns=["Order_Time", "Anomaly"], errors="ignore")

# Align the feature frame with the exact columns, in the exact order, that the
# random forest was fitted on. scikit-learn records them in `feature_names_in_`
# when it is fitted on a DataFrame. Extra columns in the prediction file are
# dropped here, and a missing column fails immediately with a KeyError naming it.
trained_columns = getattr(rf_regressor, "feature_names_in_", None)
if trained_columns is not None:
    features = features[list(trained_columns)]

# Apply models
rf_reg_pred = rf_regressor.predict(features)
rf_cls_pred = rf_classifier.predict(features)

xgb_reg_pred = xgb_regressor.predict(features)
xgb_cls_pred = xgb_classifier.predict(features)

# Normalize test data for MLP
test_scaled = mlp_scaler.transform(features)

mlp_reg_pred = mlp_reg.predict(test_scaled).flatten()
mlp_cls_pred = mlp_cls.predict(test_scaled).argmax(axis=1)  # most probable class per row

# Ensemble Model (Final Risk Score = Average of Predictions)
test_df["Risk_Score_Final"] = (rf_reg_pred + xgb_reg_pred + mlp_reg_pred) / 3

# Ensemble Model (Final Risk Category = Majority Vote)
# NOTE(review): when all three models disagree, scipy's mode returns a single value
# (documented as the smallest), i.e. the lowest risk code wins - confirm that is intended.
test_df["Risk_Category_Final"] = mode(
    np.column_stack([rf_cls_pred, xgb_cls_pred, mlp_cls_pred]), axis=1
)[0].flatten()

# Map numeric Risk Category to labels
# NOTE(review): these labels ("normal", "low risk", ...) are spelled differently from the
# "No Risk" / "Low Risk" / ... names used when the categories were encoded for training.
risk_category_map = {0: "normal", 1: "low risk", 2: "medium risk", 3: "high risk"}
test_df["Risk_Category_Final"] = test_df["Risk_Category_Final"].map(risk_category_map)

# Drop individual model predictions and engineered helper columns, if present
columns_to_remove = [
    "Risk_Score_RF", "Risk_Score_XGB", "Risk_Score_MLP",
    "Risk_Category_RF", "Risk_Category_XGB", "Risk_Category_MLP",
    "Avg_Transaction_Amount", "Transaction_Frequency", "Transaction_Deviation",
    "Unusual_Payment_Method", "Unusual_Device", "Time_Since_Last_Transaction", "Order_Time_Range"
]
test_df = test_df.drop(columns=[col for col in columns_to_remove if col in test_df.columns])

# Reverse the encoding for categorical columns
def reverse_encoding(df, encoders):
    """Decode encoded columns of ``df`` back to their original values.

    ``encoders`` maps a column name to an object exposing
    ``inverse_transform``. Columns named in ``encoders`` but absent from
    ``df`` are skipped. ``df`` is modified in place and also returned.
    """
    decodable = [name for name in encoders if name in df.columns]
    for name in decodable:
        df[name] = encoders[name].inverse_transform(df[name])
    return df

# Decode every label-encoded column back to its original category values
test_df = reverse_encoding(df=test_df, encoders=label_encoders)

# Write the final predictions, now carrying the original categorical values
final_output_path = "final_predictions_with_original_values.csv"
test_df.to_csv(final_output_path, index=False)

print("✅ Step 4: Predictions Converted to Original Values and Saved as 'final_predictions_with_original_values.csv'")


[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
✅ Step 4: Predictions Converted to Original Values and Saved as 'final_predictions_with_original_values.csv'
