# **Anomaly Detection Using Sensor Reading**

## **Import Libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn utilities
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score

# Classical Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Advanced Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Visualization settings: one matplotlib style sheet and one seaborn palette for every figure below
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory recursively and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load sample_submission only to ensure correct output format
#sample_submission = pd.read_parquet('/kaggle/input/ana-verse-2-0-h/sample_submission.parquet')
#sample_submission

# **1️⃣ Data Loading & Initial Exploration**

In [None]:
# Read the competition's train and test tables from the read-only Kaggle input directory
train = pd.read_parquet('/kaggle/input/ana-verse-2-0-h/train.parquet')
test = pd.read_parquet('/kaggle/input/ana-verse-2-0-h/test.parquet')

### Train Data

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training data
train.info()

In [None]:
# (rows, columns) of the training data
train.shape

In [None]:
# Summary statistics of the training columns
train.describe()

### Test Data

In [None]:
# Preview the first rows of the test data
test.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the test data
test.info()

In [None]:
# (rows, columns) of the test data
test.shape

In [None]:
# Cast the target column to int in place so later cells can compare it with 0/1
# NOTE(review): the original dtype of `target` is not visible here — confirm every value casts cleanly
train['target'] = train['target'].astype(int)

In [None]:
# Re-check the dtypes after the target cast
train.info()

# **2️⃣ Exploratory Data Analysis**

### Missing Value Check

In [None]:
# Number of missing values per training column
train.isnull().sum()

In [None]:
# Number of missing values per test column
test.isnull().sum()

### Duplicate Check

In [None]:
# Count rows that exactly repeat an earlier row and express them as a share of the training set
n_rows = len(train)
duplicate_count = train.duplicated().sum()
duplicate_pct = duplicate_count / n_rows * 100

duplicate_summary = [
    ('Total Rows', f'{n_rows:,}'),
    ('Duplicate Rows', f'{duplicate_count:,}'),
    ('Percentage', f'{duplicate_pct:.2f}%'),
]
display(pd.DataFrame(duplicate_summary, columns=['Metric', 'Value']))

> **Keep duplicates as they represent valid repeated sensor readings in time-series data**

In [None]:
## Remove exact duplicates (keep first occurrence)
#train_clean = train.drop_duplicates(keep='first').reset_index(drop=True)
#print(f"Rows removed: {len(train) - len(train_clean):,}")
#print(f"Remaining rows: {len(train_clean):,}")

### Target Distribution

In [None]:
# Absolute number of rows per target class
target_counts = train["target"].value_counts()
target_counts

In [None]:
# Share of rows per target class (fractions summing to 1)
target_ratio = train["target"].value_counts(normalize=True)
target_ratio

In [None]:
# Same computation as `target_ratio` in the previous cell, stored under a second name
Class_balance = train['target'].value_counts(normalize=True)

### Target Distribution Analysis Using Plots

In [None]:
# Bar chart of the class counts, annotated with absolute numbers and percentages
fig, ax = plt.subplots(figsize=(10, 6))

target_counts = train['target'].value_counts().sort_index()
bars = ax.bar(
    ['Normal (0)', 'Anomaly (1)'],
    target_counts.values,
    color=['#2ecc71', '#e74c3c'],
    alpha=0.7,
    edgecolor='black',
)

ax.set_title('Target Distribution - Highly Imbalanced', fontweight='bold', fontsize=12)
ax.set_ylabel('Count', fontsize=11)

# Add value labels on bars (fixed vertical offset of 20,000 counts above each bar)
total_rows = len(train)
for i, v in enumerate(target_counts.values):
    bar_label = f'{v:,}\n({v/total_rows*100:.2f}%)'
    ax.text(i, v + 20000, bar_label, ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

### Feature Distributions by Target

In [None]:
# One histogram panel per sensor, overlaying normal (target == 0) and anomalous (target == 1) rows
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
features = ['X1', 'X2', 'X3', 'X4', 'X5']

normal_mask = train['target'] == 0
anomaly_mask = train['target'] == 1
panels = axes.ravel()  # row-major, so panels[idx] is axes[idx // 3, idx % 3]

for idx, col in enumerate(features):
    ax = panels[idx]
    train.loc[normal_mask, col].hist(bins=50, alpha=0.5, label='Normal (0)', ax=ax)
    train.loc[anomaly_mask, col].hist(bins=50, alpha=0.5, label='Anomaly (1)', ax=ax)
    ax.set_title(f'{col} by Target')
    ax.set_xlabel(col)
    ax.legend()

# The 2x3 grid has more panels than features; hide the unused ones instead of
# leaving an empty set of axes in the figure.
for unused_ax in panels[len(features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

> * **This is a highly imbalanced dataset: 99.14% Normal vs 0.86% Anomaly**
> * **No missing values are found in either dataset**

### Outlier Analysis: X3 and X4

In [None]:
# For X3 (top row) and X4 (bottom row): overall histogram on the left, per-class boxplot
# on the right, both with a logarithmic y-axis.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

panel_specs = [('X3', 'skyblue'), ('X4', 'coral')]
for row, (col, color) in enumerate(panel_specs):
    hist_ax = axes[row, 0]
    box_ax = axes[row, 1]

    # Distribution of the sensor over all rows
    hist_ax.hist(train[col], bins=100, color=color, edgecolor='black', alpha=0.7)
    hist_ax.set_yscale('log')
    hist_ax.set_title(f'{col} Distribution (Log Scale - Extreme Outliers Present)',
                      fontweight='bold', fontsize=11)
    hist_ax.set_xlabel(f'{col} Values')
    hist_ax.set_ylabel('Frequency (Log Scale)')

    # Same sensor split by target class
    normal_values = train[train['target']==0][col]
    anomaly_values = train[train['target']==1][col]
    box_ax.boxplot([normal_values, anomaly_values], labels=['Normal', 'Anomaly'])
    box_ax.set_yscale('log')
    box_ax.set_title(f'{col}: Normal vs Anomaly', fontweight='bold', fontsize=11)
    box_ax.set_ylabel(f'{col} Values (Log Scale)')

plt.tight_layout()
plt.show()



> * **X3 and X4 have extreme outliers (values up to 10^38)**
> * **These extreme values are STRONGLY correlated with anomalies**



# **3️⃣ Feature Engineering**
* Eight groups of features are created for anomaly detection (29 new columns in total)

In [None]:
def create_features(df, is_train=True, cap_quantile=0.99):
    """Return a copy of ``df`` with 29 engineered columns appended.

    The frame must contain a datetime ``Date`` column (accessed through ``.dt``)
    and the sensor columns ``X1`` .. ``X5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
    is_train : bool, default True
        When True, the upper clipping thresholds for ``X3``/``X4`` are computed
        from ``df`` and stored in the module-level globals ``x3_cap`` and
        ``x4_cap``. When False, the previously stored thresholds are reused, so
        another frame (validation, test) is clipped with the values fitted on
        the training data.
    cap_quantile : float, default 0.99
        Quantile of ``X3``/``X4`` used as the clipping threshold when
        ``is_train`` is True.

    Returns
    -------
    pandas.DataFrame
        The copy of ``df`` with the new columns.

    Raises
    ------
    NameError
        If ``is_train`` is False and no thresholds have been fitted yet.
    """
    global x3_cap, x4_cap
    df = df.copy()

    # 1. TEMPORAL FEATURES - calendar parts of the Date column
    date_parts = df['Date'].dt
    df['year'] = date_parts.year
    df['month'] = date_parts.month
    df['day'] = date_parts.day
    df['dayofweek'] = date_parts.dayofweek
    df['quarter'] = date_parts.quarter
    df['dayofyear'] = date_parts.dayofyear
    df['weekofyear'] = date_parts.isocalendar().week

    # Cyclical encoding of the month (12-step period)
    month_angle = 2 * np.pi * df['month'] / 12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    # 2. CAPPED FEATURES - clip extreme X3/X4 values at a fitted quantile (rows are kept)
    if is_train:
        x3_cap = df['X3'].quantile(cap_quantile)
        x4_cap = df['X4'].quantile(cap_quantile)
    elif 'x3_cap' not in globals() or 'x4_cap' not in globals():
        # Same exception type as the bare global lookup would raise, with an actionable message.
        raise NameError(
            "x3_cap/x4_cap are not defined: call create_features(train_df, is_train=True) "
            "before calling it with is_train=False"
        )

    df['X3_capped'] = df['X3'].clip(upper=x3_cap)
    df['X4_capped'] = df['X4'].clip(upper=x4_cap)

    # 3. LOG TRANSFORMATIONS - log(1 + x) of the capped values
    df['X3_log'] = np.log1p(df['X3_capped'])
    df['X4_log'] = np.log1p(df['X4_capped'])

    # 4. INTERACTION FEATURES - pairwise sensor products and a quotient
    df['X1_X2'] = df['X1'] * df['X2']
    df['X1_X5'] = df['X1'] * df['X5']
    df['X3_X4'] = df['X3_capped'] * df['X4_capped']
    df['X1_div_X2'] = df['X1'] / (df['X2'] + 1e-5)  # 1e-5 avoids division by exactly zero

    # 5. RATIO FEATURES
    # NOTE: X1_X2_ratio is computed with the same expression as X1_div_X2 above; both
    # columns are kept so the output column set stays unchanged.
    df['X1_X2_ratio'] = df['X1'] / (df['X2'] + 1e-5)
    df['X3_X4_ratio'] = df['X3_capped'] / (df['X4_capped'] + 1e-5)

    # 6. STATISTICAL FEATURES - row-wise aggregates over the five (capped) sensors
    sensor_cols = ['X1', 'X2', 'X3_capped', 'X4_capped', 'X5']
    df['X_sum'] = df['X1'] + df['X2'] + df['X3_capped'] + df['X4_capped'] + df['X5']
    sensors = df[sensor_cols]
    df['X_mean'] = sensors.mean(axis=1)
    df['X_std'] = sensors.std(axis=1)
    df['X_min'] = sensors.min(axis=1)
    df['X_max'] = sensors.max(axis=1)

    # 7. POLYNOMIAL FEATURES
    df['X1_squared'] = df['X1'] ** 2
    df['X5_squared'] = df['X5'] ** 2

    # 8. BINARY FLAGS - exact-value indicators computed on the raw (uncapped) columns
    df['X5_is_zero'] = (df['X5'] == 0).astype(int)
    df['X3_is_one'] = (df['X3'] == 1.0).astype(int)
    df['X4_is_one'] = (df['X4'] == 1.0).astype(int)

    return df

# Fit the X3/X4 clipping caps on the training data only, then reuse those caps for the
# test set (is_train=False) so both frames are clipped with identical thresholds.
train_processed = create_features(train, is_train=True)
test_processed = create_features(test, is_train=False)

# Side-by-side size of the two processed frames and the number of columns added to each
display(pd.DataFrame({
    'Dataset': ['Train (Processed)', 'Test (Processed)'],
    'Rows': [train_processed.shape[0], test_processed.shape[0]],
    'Columns': [train_processed.shape[1], test_processed.shape[1]],
    'New Features': [train_processed.shape[1] - train.shape[1], test_processed.shape[1] - test.shape[1]]
}))

> **Both datasets started with 7 columns. Feature engineering added 29 new columns, so each dataset now has 36 columns.**

In [None]:
# (rows, columns) of the processed training frame
train_processed.shape

In [None]:
# (rows, columns) of the processed test frame
test_processed.shape

# **4️⃣ Correlation Analysis**

In [None]:
# Pairwise correlation of the raw sensors and the target, drawn as an annotated heatmap
corr_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'target']
correlation_matrix = train[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Correlation of each raw column with the target, largest first
print("\nCorrelation with target:")
print(correlation_matrix['target'].sort_values(ascending=False))

# **5️⃣ Data Preparation for Modeling**

In [None]:
# Feature matrix = processed training frame without the date (and the label when present);
# the label itself is taken from the `target` column.
drop_cols = ['Date']
if 'target' in train_processed.columns:
    drop_cols = ['Date', 'target']

y = train_processed['target']
X = train_processed.drop(columns=drop_cols)

In [None]:
# Keep the test identifiers aside
test_ids = test_processed['ID']
# Drop the non-feature columns, then select exactly the training columns in the training order
X_test = test_processed.drop(columns=['Date', 'ID'])
X_test = X_test[X.columns]

In [None]:
# Train–validation split: hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

## Data Preprocessing

In [None]:
# Fit the scaler on the training split only, then apply the same transform to the
# validation and test features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# **6️⃣ Model Training and Evaluation**

In [None]:
def evaluate_model(model, model_name, X_val, y_val):
    """Score a fitted classifier on a validation set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally, ``predict_proba``
    model_name : str
        Label stored under the ``'Model'`` key of the returned metrics.
    X_val, y_val : validation features and true labels

    Returns
    -------
    (dict, array-like)
        The metrics dictionary (Accuracy, Precision, Recall, F1 Score, ROC-AUC)
        and the hard predictions. ROC-AUC is ``None`` when the model has no
        ``predict_proba``.
    """
    y_pred = model.predict(X_val)

    # ROC-AUC needs a score for the positive class: column 1 of predict_proba
    y_pred_proba = None
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_val)[:, 1]

    metrics = {'Model': model_name}
    metrics['Accuracy'] = accuracy_score(y_val, y_pred)
    metrics['Precision'] = precision_score(y_val, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_val, y_pred)
    metrics['F1 Score'] = f1_score(y_val, y_pred)
    if y_pred_proba is None:
        metrics['ROC-AUC'] = None
    else:
        metrics['ROC-AUC'] = roc_auc_score(y_val, y_pred_proba)

    return metrics, y_pred

    # Confusion Matrix
def plot_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
    plt.title(f'Confusion Matrix - {model_name}', fontweight='bold', fontsize=12)
    plt.ylabel('True Label', fontsize=11)
    plt.xlabel('Predicted Label', fontsize=11)
    plt.tight_layout()
    plt.show()

    
# Collects one metrics dict per model; every model cell below appends to it
results = []

# **7️⃣ Classical Models**

## **Model 1: Logistic Regression**
> * Fast baseline, interpretable coefficients, handles linear relationships
> * Use class_weight='balanced' to handle 116:1 imbalance
> * **Expected Performance**: Good, baseline

In [None]:
# Logistic regression with balanced class weights, up to 1000 solver iterations and a fixed seed
lr_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42
)

In [None]:
# Fit on the standardized training features
lr_model.fit(X_train_scaled, y_train)

In [None]:
# Score on the standardized validation features and record the metrics
lr_metrics, lr_pred = evaluate_model(lr_model, 'Logistic Regression', X_val_scaled, y_val)
results.append(lr_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([lr_metrics]))
plot_confusion_matrix(y_val, lr_pred, 'Logistic Regression')

## **Model 2: Decision Tree**

> * Handles non-linear relationships, no scaling needed, interpretable
> * **Expected Performance**: Slightly better than Logistic Regression

In [None]:
# Depth- and leaf-size-limited decision tree with balanced class weights and a fixed seed
dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=100,
    min_samples_leaf=50,
    class_weight='balanced'
)

In [None]:
# Fit on the unscaled training features
dt_model.fit(X_train, y_train)

In [None]:
# Score on the unscaled validation features and record the metrics
dt_metrics, dt_pred = evaluate_model(dt_model, 'Decision Tree', X_val, y_val)
results.append(dt_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([dt_metrics]))
plot_confusion_matrix(y_val, dt_pred, 'Decision Tree')

## **Model 3: K-Nearest Neighbors**

> * Non-parametric, captures local patterns, good for anomaly detection
> * Small k value, distance-based, so requires scaling
> * KNN is computationally expensive on 1.6M rows



In [None]:
# KNN is fitted on a random subsample of the standardized training rows.
# A seeded generator keeps the subsample (and therefore the KNN metrics) reproducible
# across kernel restarts; the size is clamped so small datasets do not raise.
sample_size = min(50000, X_train_scaled.shape[0])
rng = np.random.default_rng(42)
sample_indices = rng.choice(X_train_scaled.shape[0], size=sample_size, replace=False)
X_train_sample = X_train_scaled[sample_indices]
y_train_sample = y_train.iloc[sample_indices]

In [None]:
# 5-nearest-neighbours classifier with distance-weighted votes, using all CPU cores
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    n_jobs=-1
)

In [None]:
# Fit on the subsample, score on the full standardized validation set, record the metrics
knn_model.fit(X_train_sample, y_train_sample)
knn_metrics, knn_pred = evaluate_model(knn_model, 'KNN', X_val_scaled, y_val)
results.append(knn_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([knn_metrics]))
plot_confusion_matrix(y_val, knn_pred, 'KNN')

# **8️⃣ Advanced Models**

## **Model 4: Random Forest**

> * An ensemble of trees, robust to outliers, handles imbalance well
> * Multiple estimators for stability, class_weight for imbalance
> *  **Expected Performance**: Strong baseline, often top performer for tabular data

In [None]:
# Random forest of 300 depth-limited trees.
# class_weight='balanced' applies the imbalance handling this section describes and
# matches the Logistic Regression and Decision Tree models above.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

In [None]:
# Fit on the unscaled training features, score on the validation set, record the metrics
rf_model.fit(X_train, y_train)
rf_metrics, rf_pred = evaluate_model(rf_model, 'Random Forest', X_val, y_val)
results.append(rf_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([rf_metrics]))
plot_confusion_matrix(y_val, rf_pred, 'Random Forest')

In [None]:
# Pair each training column with the forest's importance score and keep the 15 largest
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False).head(15)

In [None]:
# Horizontal bar chart of the top features, most important at the top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color='#3498db',
    edgecolor='black',
    alpha=0.7,
)
ax.set_xlabel('Importance', fontsize=11)
ax.set_title('Top 15 Features - Random Forest', fontweight='bold', fontsize=12)
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## **Model 5: XGBoost**

> * Industry standard for tabular data, handles imbalance excellently
> * Use scale_pos_weight (ratio of negative to positive = 116)
> * **Expected Performance**: Top tier, likely best F1 score

In [None]:
# Ratio of negative (0) to positive (1) labels in the training split
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

In [None]:
# Gradient-boosted trees. The negative/positive ratio computed in the previous cell is
# passed as scale_pos_weight so the positive class is up-weighted as this section describes.
XGBoost_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss'
)

In [None]:
# Fit on the unscaled training features, score on the validation set, record the metrics
XGBoost_model.fit(X_train, y_train)
xgb_metrics, xgb_pred = evaluate_model(XGBoost_model, 'XGBoost', X_val, y_val)
results.append(xgb_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([xgb_metrics]))
plot_confusion_matrix(y_val, xgb_pred, 'XGBoost')

## **Model 6: LightGBM**

> * Faster than XGBoost, excellent with large datasets (1.6M rows)
> * **Expected Performance**: Similar or better than XGBoost, much faster training

In [None]:
# LightGBM gradient boosting.
# Row subsampling (subsample, i.e. bagging_fraction) only takes effect when
# subsample_freq (bagging_freq) is non-zero, so it is enabled for every iteration here.
lgbm_model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    min_child_samples=20,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    verbose=-1
)

In [None]:
# Fit on the unscaled training features, score on the validation set, record the metrics
lgbm_model.fit(X_train, y_train)
lgbm_metrics, lgbm_pred = evaluate_model(lgbm_model, 'LightGBM', X_val, y_val)
results.append(lgbm_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([lgbm_metrics]))
plot_confusion_matrix(y_val, lgbm_pred, 'LightGBM')

## **Model 7: CatBoost**
* Robust to overfitting, minimal hyperparameter tuning needed
* Competitive with XGBoost/LightGBM

In [None]:
# CatBoost gradient boosting: 1000 iterations, depth-7 trees, silent training, fixed seed
# NOTE(review): no class weighting is configured for this model — confirm that is intended
catboost_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=7,
    l2_leaf_reg=3,
    subsample=0.8,
    random_state=42,
    verbose=0
)

In [None]:
# Fit on the unscaled training features, score on the validation set, record the metrics
catboost_model.fit(X_train, y_train)
catboost_metrics, catboost_pred = evaluate_model(catboost_model, 'CatBoost', X_val, y_val)
results.append(catboost_metrics)

In [None]:
# Show the metrics as a one-row table, then the confusion matrix
display(pd.DataFrame([catboost_metrics]))
plot_confusion_matrix(y_val, catboost_pred, 'CatBoost')

# **9️⃣ Model Evaluation & Comparison**

## Performance Summary

In [None]:
# Build the comparison table, best F1 first.
# `results` is appended to by every model cell, so re-running a model cell adds a second
# row for that model; keep only the most recent row per model.
results_df = (
    pd.DataFrame(results)
    .drop_duplicates(subset='Model', keep='last')
    .sort_values('F1 Score', ascending=False)
)

display(results_df.style.highlight_max(subset=['F1 Score'], color='lightgreen'))

## Model Comparison

In [None]:
# Left: F1 score per model. Right: accuracy, precision, recall and F1 side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
f1_ax, metrics_ax = axes

# F1 Score comparison
results_df.plot(
    x='Model',
    y='F1 Score',
    kind='barh',
    ax=f1_ax,
    color='#e74c3c',
    edgecolor='black',
    alpha=0.7,
    legend=False,
)
f1_ax.set_xlabel('F1 Score', fontsize=11)
f1_ax.set_title('Model Performance - F1 Score (Primary Metric)', fontweight='bold', fontsize=12)
f1_ax.grid(axis='x', alpha=0.3)

# All metrics comparison
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
metrics_by_model = results_df.set_index('Model')[metrics_to_plot]
metrics_by_model.plot(kind='bar', ax=metrics_ax, edgecolor='black', alpha=0.7)
metrics_ax.set_xlabel('Model', fontsize=11)
metrics_ax.set_ylabel('Score', fontsize=11)
metrics_ax.set_title('Comprehensive Model Comparison', fontweight='bold', fontsize=12)
metrics_ax.legend(loc='lower right')
metrics_ax.set_xticklabels(metrics_ax.get_xticklabels(), rotation=45, ha='right')
metrics_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Model Selection

In [None]:
# Select the best model by validation F1 score.
# results_df is sorted by F1 in descending order, so the first row is the winner.
best_row = results_df.iloc[0]
best_model_name = best_row['Model']
best_f1_score = best_row['F1 Score']

# Map the names used in the results table back to the fitted estimator objects
model_mapping = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
    'KNN': knn_model,
    'Random Forest': rf_model,
    'XGBoost': XGBoost_model,
    'LightGBM': lgbm_model,
    'CatBoost': catboost_model
}

best_model = model_mapping[best_model_name]

# The status string previously contained a mis-encoded check mark; it is restored here.
display(pd.DataFrame({
    'Metric': ['Best Model', 'Validation F1 Score', 'Status'],
    'Value': [best_model_name, f'{best_f1_score:.4f}', 'Selected ✓']
}))

# **🔟 Final Submission**

In [None]:
# Predict the test set with the selected model (`final_predictions` was never defined before).
# Logistic Regression and KNN were fitted on standardized features, so they must be given
# the standardized test matrix; every other model was fitted on the unscaled frame.
models_fit_on_scaled = {'Logistic Regression', 'KNN'}
X_test_final = X_test_scaled if best_model_name in models_fit_on_scaled else X_test
final_predictions = np.asarray(best_model.predict(X_test_final)).ravel().astype(int)

# NOTE(review): the target is written as int 0/1 — confirm against the expected submission format
submission = pd.DataFrame({
    'ID': test_processed['ID'],
    'target': final_predictions
})

In [None]:
# Preview the first 20 submission rows
submission.head(20)

In [None]:
# Summarize the predicted class balance of the submission
n_predictions = len(submission)
n_normal = (submission['target']==0).sum()
n_anomaly = (submission['target']==1).sum()
anomaly_rate = f"{n_anomaly/n_predictions*100:.2f}%"

submission_summary = pd.DataFrame({
    'Metric': ['Total Predictions', 'Normal (0)', 'Anomaly (1)', 'Anomaly Rate'],
    'Value': [n_predictions, n_normal, n_anomaly, anomaly_rate],
})
display(submission_summary)

In [None]:
# (rows, columns) of the submission frame
submission.shape

In [None]:
# Save the submission as a parquet file (without the index) in the Kaggle working directory

submission.to_parquet('/kaggle/working/submission.parquet', index=False)

print("Submission file created successfully!")
