### DOWNLOADING AND WORKING WITH DIRECTORIES



In [None]:
!pip install scipy
!pip install catboost
import scipy
import catboost

In [None]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from scipy import stats
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau



In [None]:
# Read the dataset CSV from local disk into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer=r"D:\Projects\Minor\test\final_insect_env_merged.csv")


### EXPLORING DATASET

In [None]:
# First 5 rows -- a quick peek at what the data looks like
df.head(n=5)

In [None]:
# Last 5 rows of the dataset
df.tail(n=5)

In [None]:
# Descriptive statistics of the dataset, as produced by DataFrame.describe()
df.describe()

In [None]:
# Column names of the dataset
df.columns

## DATA PREPROCESSING

### CHECKING FOR COLUMNS WITH STRING DATATYPE

In [None]:
# Print the name of every column whose dtype is 'object'
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    print(col)

### REMOVING COLUMNS THAT DO NOT HAVE AN IMPACT

In [None]:
# The 'file' column is dropped because it has no bearing on the prediction
df = df.drop(columns=['file'])

### CHECKING FOR MISSING VALUES

In [None]:
# List the columns that contain at least one missing value
has_missing = df.isnull().any()
cols_with_missing = df.columns[has_missing].tolist()
print(cols_with_missing)

In [None]:
# Drop the stray 'Unnamed: 165' column if it exists (errors='ignore' makes this a no-op otherwise)
df = df.drop(columns=['Unnamed: 165'], errors='ignore')

# Turn +/-inf into NaN so they are removed together with the other missing values
df = df.replace([np.inf, -np.inf], np.nan)
numeric_cols = df.select_dtypes(include=np.number).columns
df = df.dropna(subset=numeric_cols).reset_index(drop=True)

# Remove numeric columns whose standard deviation is exactly zero (constant columns)
zero_var_cols = [c for c in numeric_cols if df[c].std() == 0]
df = df.drop(columns=zero_var_cols)
print("Dropped zero-variance columns:", zero_var_cols)

# Refresh the numeric column list: the earlier one still names the columns that were
# just dropped, and indexing df with it would raise a KeyError.
numeric_cols = df.select_dtypes(include=np.number).columns

# Clip every numeric column to its own 0.1% / 99.9% quantiles to tame extreme values
lower_bounds = df[numeric_cols].quantile(0.001)
upper_bounds = df[numeric_cols].quantile(0.999)
df[numeric_cols] = df[numeric_cols].clip(lower=lower_bounds,
                                         upper=upper_bounds,
                                         axis=1)


In [None]:
# Total number of missing cells across the whole DataFrame
df.isna().sum().sum()

### CHECK FOR DUPLICATE VALUES

In [None]:
# Number of rows that repeat an earlier row
df.duplicated(keep='first').sum()

In [None]:
# Remove duplicated rows, renumber the index, then confirm none are left
df = df.drop_duplicates(ignore_index=True)
df.duplicated().sum()

### CHECKING FOR OUTLIERS

In [None]:

# Inter-quartile-range check, restricted to the numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns

outliers = {}

for col in numeric_cols:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
    # values lying outside the [lower, upper] fences count as outliers
    outside = (df[col] < lower) | (df[col] > upper)
    outliers[col] = df.loc[outside, col].count()

# only the columns that have at least one outlier
{k: v for k, v in outliers.items() if v > 0}

In [None]:
# Z-score outlier detection: a row is flagged when any numeric feature has |z| > 3
from scipy import stats
z_scores = np.abs(stats.zscore(df[numeric_cols]))
# np.where yields one row position per outlying *cell*, so a row with several extreme
# features would be listed several times; deduplicate so each row is counted once.
outlier_rows = np.unique(np.where(z_scores > 3)[0])
len(outlier_rows)


### CHECKING CLASS DISTRIBUTION


In [None]:
# Number of samples per class label (missing labels are not counted)
counts = df['label'].value_counts(dropna=True)
counts



In [None]:
# Label counts with missing labels included. Printed explicitly because a notebook
# cell only displays its last expression, so the bare call was silently discarded.
print(df['label'].value_counts(dropna=False))
# First few rows whose label is missing
df[df['label'].isna()].head()


In [None]:
# Bar chart of the number of samples per class in the original dataset
counts = (
    df['label']
    .value_counts()
    .reset_index()
    .set_axis(['label', 'count'], axis=1)
)

plt.figure(figsize=(12, 6))
sns.barplot(x='label', y='count', data=counts, palette='viridis')

# titles and axis labels
plt.title("Original Dataset Class Distribution", fontsize=18)
plt.xlabel("Class Label", fontsize=14)
plt.ylabel("Count", fontsize=14)
# tilt the class names so long labels do not overlap
plt.xticks(rotation=20, ha='right', fontsize=12)

plt.tight_layout()
plt.show()


## DATA BALANCING

In [None]:
# Separate the rows into three groups according to their label
is_others = df['label'] == "Others"
is_env = df['label'] == "Env"

others_df = df[is_others].copy()
env_df = df[is_env].copy()
# everything that is neither "Others" nor "Env"
species_df = df[~(is_others | is_env)].copy()

### CLUSTERING BASED SAMPLING FOR 'OTHERS' AND 'ENV'

In [None]:
# Feature columns: every column of df except the target 'label'
feat_cols = [c for c in df.columns if c != 'label']

In [None]:
# Standardise the features (scaler fitted on the full df) before clustering
scaler = StandardScaler().fit(df[feat_cols])
X_others, X_env = (
    scaler.transform(subset[feat_cols]) for subset in (others_df, env_df)
)


In [None]:
# Cluster the 'Others' and 'Env' rows separately with MiniBatchKMeans
RANDOM_STATE = 42
n_clusters_others = 35
n_clusters_env = 25

# one estimator per group; both share batch size and seed
kmeans_others, kmeans_env = (
    MiniBatchKMeans(n_clusters=k, batch_size=1024, random_state=RANDOM_STATE)
    for k in (n_clusters_others, n_clusters_env)
)

others_clusters = kmeans_others.fit_predict(X_others)
env_clusters = kmeans_env.fit_predict(X_env)

# Store each row's cluster id next to it (arrays are in the same row order as the frames)
others_df['cluster'] = others_clusters
env_df['cluster'] = env_clusters

In [None]:
# Sampling individual categories
def proportional_sample(df_sub, cluster_col, target_n, random_state=42):
    """Sample about ``target_n`` rows from ``df_sub``, proportionally per cluster.

    Each cluster contributes ``round(cluster_size / total * target_n)`` rows
    (at least one, never more than it holds), drawn without replacement.
    Rounding can make the total overshoot or undershoot ``target_n``; an
    overshoot is trimmed by random sub-sampling and an undershoot is topped
    up with rows that have not been selected yet.

    Args:
        df_sub: DataFrame to sample from. Its index labels are assumed to be
            unique, since they are used to tell selected rows from unselected ones.
        cluster_col: Name of the column holding the cluster id of each row.
        target_n: Desired number of rows in the result.
        random_state: Seed passed to every ``DataFrame.sample`` call.

    Returns:
        A DataFrame with a fresh 0..n-1 index and no repeated rows. It has
        ``target_n`` rows unless a per-cluster minimum or the size of
        ``df_sub`` makes that impossible without replacement (for example,
        fewer than ``target_n`` rows are available).
    """
    total = len(df_sub)
    if total == 0:
        # Nothing to sample from; return an empty frame with the same columns.
        return df_sub.iloc[0:0].reset_index(drop=True)

    samples = []
    for cl in df_sub[cluster_col].unique():
        cluster_df = df_sub[df_sub[cluster_col] == cl]
        size = len(cluster_df)

        # proportional allocation, with at least one row per cluster
        n = max(1, int(round(size / total * target_n)))
        samples.append(
            cluster_df.sample(n=min(n, size), replace=False, random_state=random_state)
        )

    # Keep the original index labels for now: they are needed below to work out
    # which rows of df_sub have not been picked yet.
    sampled = pd.concat(samples)

    if len(sampled) > target_n:
        # Rounding overshoot: trim back down to the target.
        sampled = sampled.sample(target_n, random_state=random_state)
    elif len(sampled) < target_n:
        # Rounding undershoot: top up from rows that are not already selected.
        remaining = df_sub.loc[~df_sub.index.isin(sampled.index)]
        needed = min(target_n - len(sampled), len(remaining))
        if needed > 0:
            extra = remaining.sample(needed, random_state=random_state)
            sampled = pd.concat([sampled, extra])

    return sampled.reset_index(drop=True)
# Target sizes for the two over-represented groups
target_others = 3000
target_env = 2500

# Sample proportionally per cluster, then discard the temporary 'cluster' column
others_sample = (
    proportional_sample(others_df, 'cluster', target_others)
    .drop(columns=['cluster'], errors='ignore')
)
env_sample = (
    proportional_sample(env_df, 'cluster', target_env)
    .drop(columns=['cluster'], errors='ignore')
)





In [None]:
# Build the balanced dataset used from here on: stack the three groups,
# then shuffle the rows with a fixed seed and renumber them
balanced_parts = [species_df, others_sample, env_sample]
balanced_df = (
    pd.concat(balanced_parts, ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

### SIZE OF THE COMBINED BALANCED DATASET

In [None]:
# number of rows in the combined, balanced dataset
balanced_df.shape[0]

### CHECKING IF ALL WORK DONE CORRECTLY

In [None]:
# Row count is printed explicitly: a notebook cell only displays its last
# expression, so a bare len(...) on the first line would be silently discarded.
print(len(balanced_df))
# Total number of missing cells in the balanced dataset
balanced_df.isnull().sum().sum()



In [None]:
# Samples per class after balancing
balanced_df['label'].value_counts(dropna=True)

In [None]:
# Count plot of the class labels in the balanced dataset
plt.figure(figsize=(12, 6))
sns.countplot(x='label', data=balanced_df, palette='viridis')

plt.title("Balanced Dataset Class Distribution")
plt.xlabel("Class Label")
plt.ylabel("Count")
# rotate the tick labels and right-align them
plt.xticks(rotation=30, ha='right')

plt.tight_layout()
plt.show()


## DATA SCALING AND ENCODING

In [None]:
# Target column
y = balanced_df['label']

# Features: everything except the target
X = balanced_df.drop(columns=['label'])

# 80/20 train/test split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# RobustScaler: learn the scaling parameters from the training split only
scaler = RobustScaler().fit(X_train)

# apply the same fitted scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# 1. Create a DataFrame for scaled features
# -----------------------------
# X_train_scaled is wrapped in a DataFrame so the original feature names can be reused
scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

print("\n===== SUMMARY STATISTICS (SCALED DATA) =====")
display(scaled_df.describe().T)


# -----------------------------
# 2. Check for NaN / Inf values
# -----------------------------
print("\n===== NaN / Inf Check =====")
print("NaN count:", scaled_df.isna().sum().sum())
print("Inf count:", np.isinf(scaled_df).sum().sum())


# -----------------------------
# 3. Compare ONE FEATURE before & after scaling
# -----------------------------
feature = X_train.columns[0]  # choose first feature

plt.figure(figsize=(12,5))

plt.subplot(1,2,1)
plt.hist(X_train[feature], bins=50)
plt.title(f"Original: {feature}")

plt.subplot(1,2,2)
plt.hist(scaled_df[feature], bins=50)
plt.title(f"Scaled: {feature}")

plt.tight_layout()
plt.show()


# -----------------------------
# 4. Boxplot of first 50 scaled features
# -----------------------------
plt.figure(figsize=(14,6))
plt.boxplot(scaled_df.iloc[:, :50], vert=False)
plt.title("Scaled Feature Distribution (First 50 Features)")
plt.xlabel("Value")
plt.show()


# -----------------------------
# 5. Inspect RobustScaler params
# -----------------------------
print("\n===== RobustScaler Center (Median of Each Feature) =====")
center_series = pd.Series(scaler.center_, index=X_train.columns)
display(center_series)

print("\n===== RobustScaler Scale (IQR of Each Feature) =====")
scale_series = pd.Series(scaler.scale_, index=X_train.columns)
display(scale_series)


# -----------------------------
# 6. Compare full feature vector for one sample
# -----------------------------
idx = 0  # choose first sample

plt.figure(figsize=(16,5))
plt.plot(X_train.iloc[idx], label="Original")
plt.plot(scaled_df.iloc[idx], label="Scaled")
plt.legend()
# The title previously contained a mis-encoded em dash ("â€”"); restored to the real character.
plt.title("Full Feature Vector — Before vs After Scaling")
plt.show()


In [None]:
import pandas as pd
import re

# RobustScaler centers as a Series keyed by feature name
center_series = pd.Series(scaler.center_, index=X_train.columns)
feature_names = center_series.index

# Pick out feature groups by a substring pattern in their names
mfcc_centers = center_series[feature_names.str.contains(r"mfcc_", regex=True)]
delta_centers = center_series[feature_names.str.contains(r"delta_mean_|delta_std_", regex=True)]
delta2_centers = center_series[feature_names.str.contains(r"delta2_mean_|delta2_std_", regex=True)]

# Show each group under its own heading
for heading, subset in (
    ("==== MFCC CENTERS (mean + std) ====", mfcc_centers),
    ("\n==== DELTA CENTERS (mean + std) ====", delta_centers),
    ("\n==== DELTA2 CENTERS (mean + std) ====", delta2_centers),
):
    print(heading)
    display(subset)


In [None]:
# Wrap the scaled arrays in DataFrames again so the feature names are kept
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)


In [None]:
# Integer-encoded labels, for the models that need them;
# the encoder is fitted on the training labels only
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
# Persist the fitted scaler to disk
joblib.dump(value=scaler, filename="robust_scaler.pkl")
# joblib.dump(scaler, "/content/drive/MyDrive/Insectra/robust_scaler.pkl")

In [None]:
import json

# Record the order of the training columns in a JSON file
feature_order = list(X_train.columns)
with open("feature_order.json", "w") as f:
    f.write(json.dumps(feature_order))


## MODEL TRAINING AND TESTING

In [None]:
# All classical models to be compared, keyed by display name.
# Every estimator that accepts a seed gets random_state=42 so repeated runs give the
# same results; previously SVC, RandomForest, XGBoost and LogisticRegression were
# unseeded while the rest of the dict was seeded.
models={
    "SVM-RBF": SVC(kernel="rbf", probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, random_state=42),
    # "LightGBM": LGBMClassifier(n_estimators=300, learning_rate=0.05),
    "LogisticRegression": LogisticRegression(max_iter=500, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7),  # takes no random_state parameter
    "CatBoost": CatBoostClassifier(iterations=300, learning_rate=0.05, depth=8, verbose=False, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=400, random_state=42, n_jobs=-1),
    "HistGradientBoosting": HistGradientBoostingClassifier(learning_rate=0.05, max_depth=8, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=300, learning_rate=0.05, random_state=42)

}

In [None]:
# accuracy per model name
results = {}
# one row per model holding its per-class F1 scores
f1_results = []

# models in this group are trained on integer-encoded labels and plain numpy arrays
encoded_label_models = ('XGBoost', 'LightGBM', 'HistGradientBoosting', 'AdaBoost')
# report entries that are aggregates rather than classes
summary_keys = ("accuracy", "macro avg", "weighted avg")

for name, model in models.items():
    print(f"\n----------------------")
    print(f"Training {name}...")

    if name not in encoded_label_models:
        # trained directly on the string labels
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
    else:
        model.fit(X_train_scaled.values, y_train_enc)
        # map the integer predictions back to the string labels
        preds = le.inverse_transform(model.predict(X_test_scaled.values))

    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))

    # collect the F1 score of every individual class
    report = classification_report(y_test, preds, output_dict=True)
    f1_row = {}
    for cls, metrics in report.items():
        if cls in summary_keys:
            continue
        f1_row[cls] = metrics["f1-score"]
    f1_row["model"] = name
    f1_results.append(f1_row)

    # remember the accuracy of this model
    results[name] = acc

    # confusion matrix heatmap
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_test, preds)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


## DEEP LEARNING MODELS

In [None]:
# Upgrades pip, pins TensorFlow to 2.12.0 and installs the tensorflow-directml-plugin package.
# NOTE(review): tensorflow is already imported earlier in this notebook; presumably the
# kernel has to be restarted after this install for the pinned version to be used -- confirm.
!pip install --upgrade pip
!pip install tensorflow==2.12.0
!pip install tensorflow-directml-plugin -f https://aka.ms/tensorflow-directml-plugin


In [None]:
# CNN reshape: add a trailing axis of size 1 so each sample is (n_features, 1),
# matching the input_shape later handed to the Conv1D model
X_train_np = X_train_scaled.to_numpy()
X_test_np  = X_test_scaled.to_numpy()

X_train_cnn = X_train_np.reshape(X_train_np.shape[0], X_train_np.shape[1], 1)
X_test_cnn  = X_test_np.reshape(X_test_np.shape[0], X_test_np.shape[1], 1)

# One-hot
# Take the class count from the encoder that is already fitted instead of refitting it
# on the full label column: refitting mutated the shared `le` after y_train_enc /
# y_test_enc had been produced with it, and that same object is pickled later on.
num_classes = len(le.classes_)
y_train_cat = tf.keras.utils.to_categorical(y_train_enc, num_classes)
y_test_cat  = tf.keras.utils.to_categorical(y_test_enc, num_classes)

# Building the 1D CNN
def build_1d_cnn(input_shape, num_classes):
    """Build and compile a 1D convolutional classifier.

    The network stacks three Conv1D -> BatchNormalization -> MaxPooling1D
    stages (64, 128 and 256 filters), flattens the result, and finishes with
    two Dense + Dropout stages and a softmax output layer.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv1D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The Sequential model, compiled with the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    # (filters, kernel_size) for each convolutional stage
    conv_stages = ((64, 5), (128, 5), (256, 3))
    # (units, dropout rate) for each fully connected stage
    dense_stages = ((256, 0.4), (128, 0.3))

    layers = []
    for position, (filters, kernel) in enumerate(conv_stages):
        # only the very first layer is told the input shape
        first_layer_args = {"input_shape": input_shape} if position == 0 else {}
        layers.append(Conv1D(filters, kernel_size=kernel, activation='relu', **first_layer_args))
        layers.append(BatchNormalization())
        layers.append(MaxPooling1D(pool_size=2))

    layers.append(Flatten())
    for units, rate in dense_stages:
        layers.append(Dense(units, activation='relu'))
        layers.append(Dropout(rate))
    layers.append(Dense(num_classes, activation='softmax'))

    model = Sequential(layers)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

input_shape = (X_train_cnn.shape[1], 1)
cnn = build_1d_cnn(input_shape, num_classes)
cnn.summary()

# Stop when validation accuracy has not improved for 10 epochs (best weights are restored);
# halve the learning rate when validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# Training
history = cnn.fit(
    X_train_cnn, y_train_cat,
    epochs=60,
    batch_size=32,
    validation_split=0.15,
    callbacks=[es, rlr],
    verbose=1
)

#Testing
test_loss, test_acc = cnn.evaluate(X_test_cnn, y_test_cat, verbose=0)
# These values used to be computed and then dropped; report them and store the
# accuracy next to the other models' accuracies.
print(f"1D CNN Accuracy: {test_acc:.4f} (loss: {test_loss:.4f})")
results["1D CNN"] = test_acc

# Predictions: most probable class index per sample, mapped back to the label names
preds_proba = cnn.predict(X_test_cnn)
preds = np.argmax(preds_proba, axis=1)
predicted_labels = le.inverse_transform(preds)
true_labels = le.inverse_transform(y_test_enc)

# Classification report
print("\nClassification Report:")
# Print the formatted text report (the raw dict was printed before, which is hard to
# read); the dict form is still built below to extract the per-class F1 scores.
print(classification_report(true_labels, predicted_labels))
cnn_report = classification_report(true_labels, predicted_labels, output_dict=True)
f1_row = {cls: cnn_report[cls]["f1-score"]
          for cls in cnn_report.keys()
          if cls not in ["accuracy", "macro avg", "weighted avg"]}
f1_row["model"] = "1D CNN"
f1_results.append(f1_row)

# Confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)
plt.figure(figsize=(7,5))
sns.heatmap(cm, annot=True, cmap="Greens", fmt="d")
plt.title("Confusion Matrix - 1D CNN")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Per-class F1 scores of every model, one row per model
f1_frame = pd.DataFrame(f1_results)
final_f1_table = f1_frame.set_index("model")
display(final_f1_table)


## FINAL MODEL SAVING


In [None]:
# Persist the label encoder that goes with the XGBoost model
joblib.dump(value=le, filename="label_encoder.pkl")
# XGBoost is kept as the final model because it gave the best results
joblib.dump(value=models["XGBoost"], filename="xgboost_model.pkl")
