In [None]:
# Anomaly Detection from Sensor Data - Celebal AnaVerse_B

# Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:


# Machine learning tools and metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import lightgbm as lgb


In [None]:

# Deep learning library
from keras.models import Sequential
from keras.layers import Dense

# Suppress warnings
# NOTE(review): called with only the "ignore" action (no category, message or
# module restriction), this filter silences every warning issued after this
# point until the filters are changed, so library notices such as deprecation
# or solver-convergence warnings are not shown either.
import warnings
warnings.filterwarnings("ignore")


In [None]:

# === Load Full Dataset ===
# Function to reduce memory usage by downcasting numerical columns
def reduce_memory(df):
    """Downcast the 64-bit numeric columns of ``df`` in place.

    ``int64`` columns are downcast to the smallest signed integer dtype that
    can hold their values, and ``float64`` columns are downcast towards
    ``float32``. Integer columns are deliberately kept integral: sending them
    through ``downcast='float'`` turns labels and IDs into floats
    (``1`` -> ``1.0``) and never reaches the 1- or 2-byte integer dtypes.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # (source dtype selected from the frame, pandas `downcast` mode to apply)
    downcast_plan = (("int64", "integer"), ("float64", "float"))
    for source_dtype, downcast_mode in downcast_plan:
        # select_dtypes returns a new frame, so iterating its column index is
        # safe while the columns of `df` are being reassigned.
        for col in df.select_dtypes(include=[source_dtype]).columns:
            df[col] = pd.to_numeric(df[col], downcast=downcast_mode)
    return df

print("🔄 Loading datasets...")
# Single place to repoint the notebook at another copy of the data. The default
# is the Google Drive folder the notebook was written against.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm.
DATA_DIR = "/content/drive/MyDrive/new"

# Load training and test datasets.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv", low_memory=False)
test_df = pd.read_csv(f"{DATA_DIR}/test.csv", low_memory=False)
# The sample submission is read as-is; it later supplies the 'ID' column.
sample_submission = pd.read_csv(f"{DATA_DIR}/sample-submission.csv")


In [None]:

# Shrink the numeric columns of both frames (training frame first, then test)
train_df, test_df = (reduce_memory(frame) for frame in (train_df, test_df))
print("✅ Data loaded.")

# === Preprocessing ===
# Drop columns that are completely empty. The frame is reassigned rather than
# mutated in place so the cell stays easy to reason about when re-run.
train_df = train_df.dropna(axis=1, how='all')

# Fill missing values with the TRAINING column means. The test frame is imputed
# with the same training statistics so both frames go through an identical
# transform and no statistic is derived from the test data.
# NOTE(review): if 'target' is numeric and has gaps, it is mean-filled too.
train_means = train_df.mean(numeric_only=True)
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means)

# Identify and drop datetime columns: an object column counts as a datetime
# column when pandas can parse all of it.
datetime_cols = []
for col in train_df.columns:
    if train_df[col].dtype != 'object':
        continue
    try:
        pd.to_datetime(train_df[col])
    except Exception:
        # Not parseable as dates: keep the column. `Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt/SystemExit propagate.
        continue
    datetime_cols.append(col)

# errors='ignore' tolerates a detected column that the test frame lacks.
train_df = train_df.drop(columns=datetime_cols, errors='ignore')
test_df = test_df.drop(columns=datetime_cols, errors='ignore')


In [None]:


# === Feature Engineering ===
# Heatmap of the pairwise correlations between the numeric training columns
plt.figure(figsize=(12, 8))
correlations = train_df.corr(numeric_only=True)
heatmap_options = {"cmap": 'coolwarm', "annot": False}
sns.heatmap(correlations, **heatmap_options)
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:

# Features are the training columns that the test frame also provides; the
# label column "target" is never a feature.
common_columns = [
    name
    for name in train_df.columns
    if name != "target" and name in test_df.columns
]

# Label first, then the two feature frames built from the same column list
y = train_df["target"]
X, X_test = train_df[common_columns], test_df[common_columns]


In [None]:

# === Scaling ===
# Standardize the features: the scaler is fit on the training features and the
# same fitted scaler is then applied to the test features.
# NOTE(review): X is split into train/validation parts only after this step,
# so the validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X, X_test = scaler.fit_transform(X), scaler.transform(X_test)


In [None]:

# === Train/Validation Split ===
# Hold out 20% of the rows for validation. The split is stratified on the label
# so both parts keep the same class proportions; with an unstratified split the
# share of each class in the validation part can drift from the training part.
# NOTE(review): anomaly labels are presumably a small minority — confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:

# === Logistic Regression ===
# fit() hands back the estimator itself, so construction and training chain.
log_model = LogisticRegression(max_iter=200).fit(X_train, y_train)
log_pred = log_model.predict(X_val)
print("\n📊 Logistic Regression:")
log_report = classification_report(y_val, log_pred)
print(log_report)


In [None]:

# === Support Vector Machine ===
# SVC with its default hyperparameters, scored on the held-out validation rows
svm_model = SVC().fit(X_train, y_train)
svm_pred = svm_model.predict(X_val)
print("\n📊 Support Vector Machine:")
svm_report = classification_report(y_val, svm_pred)
print(svm_report)


In [None]:

# === Random Forest ===
# 100-tree forest with a fixed seed; n_jobs=-1 asks for all available cores
rf_settings = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_val)
print("\n📊 Random Forest:")
rf_report = classification_report(y_val, rf_pred)
print(rf_report)


In [None]:

# === LightGBM ===
# Gradient-boosted trees: 100 estimators, learning rate 0.05, fixed seed
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_val)
print("\n📊 LightGBM:")
lgb_report = classification_report(y_val, lgb_pred)
print(lgb_report)


In [None]:

# === Neural Network ===
# Small fully connected network: two ReLU hidden layers (64 and 32 units) and a
# single sigmoid output unit. The layer stack is passed to Sequential in one go.
nn_model = Sequential([
    Dense(64, input_dim=X.shape[1], activation='relu'),  # input width = feature count
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])


In [None]:

# Configure training (Adam optimizer, binary cross-entropy loss, accuracy
# metric), then train silently for 5 epochs in mini-batches of 64.
nn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
nn_model.fit(X_train, y_train, batch_size=64, epochs=5, verbose=0)


In [None]:

# Score the validation rows: flatten the network output to one value per row,
# then label a row 1 when its output exceeds 0.5 and 0 otherwise.
nn_pred = nn_model.predict(X_val).ravel()
above_threshold = nn_pred > 0.5
nn_pred_class = above_threshold.astype(int)
print("\n📊 Neural Network:")
nn_report = classification_report(y_val, nn_pred_class)
print(nn_report)


In [None]:

# === Final Model for Submission ===
# Retrain LightGBM on full data and predict test labels
print("\n📈 Retraining on full dataset with LightGBM...")
lgb_model.fit(X, y)
test_preds = lgb_model.predict(X_test)

# Predictions are paired with the sample-submission IDs purely by row position,
# so a row-count mismatch is reported explicitly.
# NOTE(review): this also assumes test.csv and the sample submission list rows
# in the same order — confirm.
if len(test_preds) != len(sample_submission):
    raise ValueError(
        f"Got {len(test_preds)} predictions for "
        f"{len(sample_submission)} submission IDs."
    )

# Predicted labels carry the dtype of `y`. When the label column was stored as
# floats, integer-valued predictions would be written as 0.0/1.0, so they are
# cast back to plain integers. Anything else is written unchanged.
submission_labels = test_preds
if test_preds.dtype.kind == "f" and (test_preds == test_preds.astype(int)).all():
    submission_labels = test_preds.astype(int)

# Prepare the submission file
submission = pd.DataFrame({
    'ID': sample_submission['ID'],
    'target': submission_labels
})
submission.to_csv("my_submission.csv", index=False)
print("\n✅ Final submission saved as 'my_submission.csv' with", len(submission), "rows.")
