In [3]:
import pandas as pd

In [4]:
pip install pandas openpyxl

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Read the AnomaData workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path — the cell only
# runs on a machine where this exact location exists.
file_path = r"D:\Study Materials\Project\AnomaData_Capstone_Project\data\AnomaData.xlsx"
df = pd.read_excel(io=file_path)

In [None]:
# Display basic information and first few rows.
# df.info() prints its report and returns None, so it gets its own statement instead of
# being packed into a tuple (which rendered as "(None, <plain-text frame>)").
df.info()
# Last expression of the cell -> rendered with the notebook's rich DataFrame display.
df.head()

In [None]:
# Step 1: Data Cleaning & Preprocessing

# Remove fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")

In [None]:
# Count the missing entries in every column (one total per column)
missing_values = df.isna().sum()

In [None]:
# Drop any columns with excessive missing values (more than 30% missing).
# `threshold` is the largest number of missing entries a column may have and still be kept.
threshold = 0.3 * len(df)
# Select columns by their missing count directly. The previous dropna(thresh=threshold)
# call did not implement this rule: `thresh` is the minimum number of NON-missing values,
# so it only removed columns that were more than 70% empty.
df = df.loc[:, df.isnull().sum() <= threshold]

In [None]:
# Fill remaining missing values with column median.
# numeric_only=True restricts the medians to numeric columns; on pandas >= 2.0 a plain
# df.median() raises TypeError as soon as the frame holds a non-numeric column.
# Non-numeric columns (e.g. 'time') are therefore left untouched by this fill.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Parse the 'time' column into pandas datetime values (column position is unchanged)
df = df.assign(time=pd.to_datetime(df['time']))

In [None]:
# Remove 'y.1' when present (it looks like a duplicate of the target 'y');
# errors='ignore' makes this a no-op when the column does not exist.
df = df.drop(columns=['y.1'], errors='ignore')

In [None]:
# Final dataset shape after preprocessing, followed by two missing-value totals:
#   1. missing_values.sum()    - total counted BEFORE the sparse-column drop and median fill
#   2. df.isnull().sum().sum() - total still missing in the cleaned frame
# (`missing_values` alone is stale at this point, so the current count is shown next to it.)
df.shape, missing_values.sum(), df.isnull().sum().sum()

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Use seaborn's "whitegrid" look for the figures drawn below
sns.set_style(style="whitegrid")

In [None]:
# Bar chart of how many rows carry each value of the target 'y'
fig, ax = plt.subplots(figsize=(6, 4))
# hue mirrors x so the palette is applied per class; the redundant legend is switched off
sns.countplot(data=df, x='y', hue='y', palette="viridis", legend=False, ax=ax)
ax.set_title("Distribution of Anomalies (Target Variable)")
ax.set_xlabel("Anomaly (1 = Yes, 0 = No)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Correlation heatmap (Top 10 correlated features with 'y')
plt.figure(figsize=(10, 6))
# numeric_only=True keeps non-numeric columns such as the 'time' timestamp out of the
# correlation; pandas >= 2.0 no longer drops them silently and fails on them instead.
corr_matrix = df.corr(numeric_only=True)
# head(11) rather than 10: the ranking includes 'y' itself (self-correlation of 1.0).
top_corr_features = corr_matrix['y'].abs().sort_values(ascending=False).head(11).index
# Slice the matrix that is already computed instead of correlating the subset a second time.
top_corr = corr_matrix.loc[top_corr_features, top_corr_features]
sns.heatmap(top_corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap (Top Features Related to Anomaly)")
plt.show()

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Step 1: Prepare Data — separate the label from the model inputs
y = df['y']  # Target variable
X = df.drop(['y', 'time'], axis=1)  # Features: every column except the target and the timestamp

In [None]:
# Step 2: Train-Test Split — hold out 20% of the rows, stratified on `y`,
# with a fixed random_state so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Step 3: Train Isolation Forest Model
# Fitted on the training features only; the labels are not passed to fit().
model = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,
)
model.fit(X_train)

In [None]:
# Step 4: Predict Anomalies — score the held-out features with the fitted model
y_pred = model.predict(X=X_test)

In [None]:
# Convert Isolation Forest output (-1 for anomaly, 1 for normal) to match target labels (1 for anomaly, 0 for normal).
# The raw predictions are taken from the model again rather than from `y_pred`: remapping
# `y_pred` onto itself made this cell non-idempotent — on a second run no value equals -1
# any more, so every prediction silently became 0.
y_pred = [1 if pred == -1 else 0 for pred in model.predict(X_test)]

In [None]:
# Step 5: Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# classification_report returns one multi-line string. Displaying it inside a tuple shows
# its repr with escaped "\n" characters, so it is printed to keep the table layout readable.
print(f"Accuracy: {accuracy:.4f}")
print(report)

In [None]:
from flask import Flask, request, jsonify
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

In [None]:
# Create the Flask application object that the /predict route is registered on
app = Flask(__name__)

# Unfitted Isolation Forest placeholder: nothing is loaded here. Further down it is either
# replaced by the model unpickled from model.pkl or fitted by train_model().
model = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,
)

In [None]:
def train_model(file_path="AnomaData.xlsx"):
    """Fit the module-level Isolation Forest on the workbook and pickle it.

    Reads the Excel file, removes the label and timestamp columns, applies the
    same clean-up as the exploratory cells (drop the duplicate target 'y.1',
    median-fill numeric gaps), fits the global ``model`` on the result and
    writes the fitted model to ``model.pkl`` in the current working directory.

    Args:
        file_path: Path of the Excel workbook to train on. Defaults to
            "AnomaData.xlsx", resolved against the current working directory.

    Raises:
        FileNotFoundError: If the workbook cannot be found (raised by
            ``pd.read_excel``).
        KeyError: If the 'y' or 'time' column is missing from the workbook.
    """
    df = pd.read_excel(file_path)
    features = df.drop(columns=['y', 'time'])  # Drop target and timestamp
    # 'y.1' is treated elsewhere in this file as a duplicate of the target; leaving it in
    # would train on the label itself. errors='ignore' keeps workbooks without it working.
    features = features.drop(columns=['y.1'], errors='ignore')
    # Same imputation as the exploratory preprocessing, so the served model sees data
    # prepared the same way.
    features = features.fillna(features.median(numeric_only=True))
    model.fit(features)
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)

In [None]:
# Train model if not already trained: reuse the pickled model when model.pkl can be opened,
# otherwise fall back to train_model(), which fits the model and writes model.pkl.
# NOTE(review): pickle.load can execute arbitrary code contained in the file — only load a
# model.pkl that was produced by this project.
try:
    with open("model.pkl", "rb") as f:
        model = pickle.load(f)
except FileNotFoundError:
    # No saved model yet -> train one now
    train_model()

@app.route('/predict', methods=['POST'])
def predict():
    """Classify a single feature vector posted as JSON.

    Expects a JSON object of the form ``{"features": [v1, v2, ...]}``; the
    values are reshaped into one row and passed to the module-level ``model``.

    Returns:
        * 200 with ``{"prediction": "Anomaly"}`` or ``{"prediction": "Normal"}``
          ("Anomaly" when the model outputs -1).
        * 400 with ``{"error": ...}`` when the body is missing, is not a JSON
          object with a "features" entry, or the values cannot be used as one
          numeric row by the model.
        * 500 with ``{"error": ...}`` for any other failure.

    Error responses previously carried HTTP 200, so clients could not tell a
    failure from a result by status code; the JSON shape is unchanged.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'features' not in data:
        return jsonify({"error": "Request body must be a JSON object with a 'features' list"}), 400

    try:
        # Convert input to a single-row numeric numpy array
        features = np.array(data['features'], dtype=float).reshape(1, -1)
        prediction = model.predict(features)
    except (TypeError, ValueError) as e:
        # Non-numeric or ragged values, or a row the model rejects (e.g. wrong feature count)
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        # Anything else is a server-side problem, not a client mistake
        return jsonify({"error": str(e)}), 500

    result = "Anomaly" if prediction[0] == -1 else "Normal"
    return jsonify({"prediction": result})

if __name__ == '__main__':
    # Start the development server only when this code runs as the main module.
    # debug stays off: debug=True enables the Werkzeug interactive debugger, which lets
    # anyone who can reach the port execute arbitrary Python.
    # NOTE(review): turn it on locally only if needed, never on a reachable host.
    app.run(debug=False)