In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.decomposition import PCA
from pathlib import Path

In [None]:
# Project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent

# Input data and generated artefacts live in sibling folders under the root.
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Create the output folder on first run; this is a no-op when it already exists.
OUTPUT_DIR.mkdir(exist_ok=True)

# Removing constant, near-zero-variance (NZV), and ID-like columns

In [None]:
# Read the raw training table, then split it into the label column and the predictors.
target_column = "Class"
data_file = DATA_DIR / "train_data.csv"
data = pd.read_csv(data_file, low_memory=False)

features = data.drop(columns=[target_column])
target = data[target_column]

In [None]:
# Overview of every predictor column present in the raw training data.
features = data.drop(columns=[target_column])
feature_names = list(features.columns)

print(f"total features: {len(feature_names)}")

# One bar of length 1 per feature: the chart works as a readable list of column names.
feature_df = pd.DataFrame(
    {"Feature": feature_names, "Count": [1 for _ in feature_names]}
)

fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(data=feature_df, x="Count", y="Feature", palette="crest", ax=ax)
ax.set_title("all features in train_data.csv")
ax.set_xlabel("count (each=1 feature)")
ax.set_ylabel("feature name")
fig.tight_layout()
plt.show()

In [None]:
# Baseline dimensions of the feature matrix, before any column filtering.
original_shape = features.shape
print(f"Shape of original dataset: {original_shape}")

In [None]:
# A column carries no information when it has at most one distinct non-missing value.
# `<= 1` (rather than `== 1`) also catches columns that are entirely NaN: nunique()
# ignores missing values by default, so it returns 0 for them and `== 1` let them through.
unique_counts = features.nunique()
constant_features = unique_counts[unique_counts <= 1].index.tolist()

In [None]:
def _is_near_zero_variance(column):
    """Return True when one value dominates a low-cardinality column.

    A column qualifies when, counting NaN as a value, it has more than one
    distinct entry, its most frequent entry covers over 95% of the rows, and
    it has fewer than 10 distinct non-missing values.
    """
    shares = column.value_counts(normalize=True, dropna=False)
    # value_counts sorts by frequency (descending), so position 0 is the dominant value.
    return len(shares) > 1 and shares.iloc[0] > 0.95 and column.nunique() < 10


nzv_features = [
    name for name in features.columns if _is_near_zero_variance(features[name])
]

In [None]:
# A column whose number of distinct non-missing values equals the row count behaves like
# a row identifier.
# NOTE(review): any column with no repeated values matches this rule, including
# continuous measurements — confirm that dropping those is intended.
n_rows = len(features)
distinct_per_column = features.nunique()
id_like_features = distinct_per_column[distinct_per_column == n_rows].index.tolist()

In [None]:
# A column can satisfy more than one rule (e.g. a single value plus a few NaNs is both
# "constant" and near-zero-variance). dict.fromkeys removes the repeats while keeping
# first-seen order, so the total below counts every removed column exactly once.
to_drop_basic = list(
    dict.fromkeys(constant_features + nzv_features + id_like_features)
)
print(f"Constant features: {len(constant_features)}")
print(f"Near-zero-variance features: {len(nzv_features)}")
print(f"ID-like features: {len(id_like_features)}")
print(f"Total features removed at this step: {len(to_drop_basic)}")

In [None]:
# errors="ignore" keeps this cell idempotent: re-running it after the columns have
# already been removed no longer raises a KeyError.
features = features.drop(columns=to_drop_basic, errors="ignore")
print(f"Shape after removing constant/nzv/id-like columns: {features.shape}")

In [None]:
# Re-attach the label column so the saved CSV holds predictors and target together.
features = features.assign(**{target_column: target})
clean_csv_path = OUTPUT_DIR / "train_data_no_con_nzv_id_clean.csv"
features.to_csv(clean_csv_path, index=False)

In [None]:
# `features` had the target column appended in the previous cell, so this shape is that
# of the saved file (one column wider than the feature-only shape printed earlier).
print(f"Shape of saved dataset (features + target column): {features.shape}")

# Handling Missing Values

In [None]:
# Reload the output of the previous stage from the project's output folder rather than
# from a machine-specific absolute path, so the notebook runs on any checkout.
data = pd.read_csv(OUTPUT_DIR / "train_data_no_con_nzv_id_clean.csv")
target_column = "Class"
target = data[target_column]
features = data.drop(columns=[target_column])

In [None]:
# Per-column missing-value counts and percentages, captured before imputation.
missing_before = features.isna().sum()
missing_pct_before = missing_before.div(len(features)).mul(100).round(3)

missing_summary_before = pd.DataFrame(
    {"missing_count": missing_before, "missing_percent": missing_pct_before}
)
missing_summary_before = missing_summary_before.sort_values(
    by="missing_percent", ascending=False
)
# Optional export, currently disabled:
#missing_summary_before.to_csv(OUTPUT_DIR / "missing_summary_before.csv")

# Show only the ten columns with the highest share of missing values.
print("\n missing value summary:")
print(missing_summary_before.head(10))

In [None]:
# Partition the columns by dtype: numeric ones versus everything else.
numeric_cols = list(features.select_dtypes(include=[np.number]).columns)
categorical_cols = list(features.select_dtypes(exclude=[np.number]).columns)

n_numeric, n_categorical = len(numeric_cols), len(categorical_cols)
print(f"Numeric features:{n_numeric},Categorical features:{n_categorical}")

In [None]:
# Numeric columns: fill gaps with the column median.
for col in numeric_cols:
    if features[col].isna().any():
        features[col] = features[col].fillna(features[col].median())

# Categorical columns: fill gaps with the most frequent value (the mode).
for col in categorical_cols:
    if features[col].isna().any():
        modes = features[col].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to impute with; indexing
            # mode()[0] here would raise. Leave the column untouched.
            continue
        features[col] = features[col].fillna(modes.iloc[0])

In [None]:
# Same missing-value summary as before, recomputed on the imputed features.
missing_after = features.isna().sum()
missing_pct_after = missing_after.div(len(features)).mul(100).round(3)

missing_summary_after = pd.DataFrame(
    {"missing_count": missing_after, "missing_percent": missing_pct_after}
)
missing_summary_after = missing_summary_after.sort_values(
    by="missing_percent", ascending=False
)
# Optional export, currently disabled:
#missing_summary_after.to_csv(OUTPUT_DIR / "missing_summary_after.csv")

In [None]:
# Show only the top rows, matching how the "before" summary is displayed, instead of
# dumping one line per column. The frame is sorted by missing_percent (descending), so
# any column that still has gaps appears first.
print(missing_summary_after.head(10))

In [None]:
# Put the before/after summaries side by side, aligned on the feature name.
compare_missing = pd.concat(
    [missing_summary_before, missing_summary_after],
    axis=1,
    keys=["Before", "After"],
)
# Flatten the two-level column header into plain names.
compare_missing.columns = [
    "before_count",
    "before_percent",
    "after_count",
    "after_percent",
]
# Difference between the two percentages, i.e. a reduction in percentage points.
compare_missing["reduction_%"] = (
    compare_missing["before_percent"].sub(compare_missing["after_percent"]).round(3)
)
# Optional export, currently disabled:
#compare_missing.to_csv(OUTPUT_DIR / "missing_comparison.csv")

In [None]:
# Heading followed by the first ten rows of the before/after comparison.
print("\nMissing value reduction summary:", compare_missing.head(10), sep="\n")

In [None]:
# Build the imputed dataset as a new frame: the imputed predictors plus the label column.
features_imputed = features.assign(**{target_column: target})
print(f"Shape of dataset after removing missing values: {features_imputed.shape}")

In [None]:
# Persist the imputed dataset for the outlier/scaling stage.
imputed_csv_path = OUTPUT_DIR / "train_data_no_missing_value.csv"
features_imputed.to_csv(imputed_csv_path, index=False)

print("Saved train_data_no_missing_value.csv")

# Outlier handling and scaling

In [None]:
# Reload the imputed dataset from the project's output folder (where the previous stage
# saved it) rather than from a machine-specific absolute path.
data = pd.read_csv(OUTPUT_DIR / "train_data_no_missing_value.csv")
target = "Class"  # from here on `target` holds the label column's NAME, not its values
x = data.drop(columns=[target])
y = data[target]
# Remove leading/trailing whitespace from the column names.
x.columns = x.columns.str.strip()

In [None]:
#print(list(x.columns)) 

In [None]:
def find_outliers(s, whisker=1.5):
    """Count the values of a numeric Series that fall outside the IQR fences.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to inspect.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added above Q3 and subtracted
        below Q1 to obtain the fences. The default reproduces the original
        fixed 1.5 * IQR rule.

    Returns
    -------
    integer
        Number of values strictly below the lower fence or strictly above
        the upper fence.
    """
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr
    # Strict comparisons: values lying exactly on a fence are not counted.
    return ((s < lower) | (s > upper)).sum()

In [None]:
# Count IQR outliers for every numeric feature.
numeric_feature_names = x.select_dtypes(include=[np.number]).columns
outlier_counts = [find_outliers(x[name]) for name in numeric_feature_names]

out = pd.DataFrame({"feature": numeric_feature_names, "outlier_count": outlier_counts})
out["outlier_percent"] = out["outlier_count"].div(len(x)).mul(100).round(2)
# Optional export, currently disabled:
#out.to_csv(OUTPUT_DIR / "outlier_summary.csv",index=False)

# Ten features with the largest share of outlying values.
top_outliers = out.sort_values(by="outlier_percent", ascending=False).head(10)
print(top_outliers)

In [None]:
import difflib

# Feature used to illustrate the effect of scaling. The title and axis label are derived
# from this single name so they cannot drift out of sync with the plotted column.
feature_name = "Fwd Header Length"

if feature_name in x.columns:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=x[feature_name], color="skyblue")
    plt.title(f"boxplot of '{feature_name}' before scaling")
    plt.xlabel(feature_name)
    plt.tight_layout()
    plt.show()
else:
    # Suggest the closest column names instead of failing with a KeyError.
    print("Feature not found. Possible matches:")
    print(difflib.get_close_matches(feature_name, x.columns, n=5))

In [None]:
# Fit a RobustScaler on the numeric columns and transform them; other columns are left
# as they are.
num_cols = x.select_dtypes(include=[np.number]).columns
scaler = RobustScaler()
scaled_values = scaler.fit_transform(x[num_cols])

# Work on a copy so the unscaled frame `x` stays available for comparison.
x_scaled = x.copy()
x_scaled[num_cols] = scaled_values
x_scaled[target] = y

scaled_csv_path = OUTPUT_DIR / "train_data_scaled.csv"
x_scaled.to_csv(scaled_csv_path, index=False)

In [None]:
# Same feature as the "before scaling" boxplot (feature_name is set in that cell).
# Guard the lookup so a missing column is reported instead of raising a KeyError,
# mirroring the check done for the "before" plot.
if feature_name in x_scaled.columns:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=x_scaled[feature_name])
    plt.title(f"boxplot of '{feature_name}' after robust scaling")
    plt.tight_layout()
    plt.show()
else:
    print(f"Feature '{feature_name}' not found in the scaled data; skipping boxplot.")

# Correlation and redundancy removal

In [None]:
# Reload the scaled dataset and separate the label from the predictors.
target_column = "Class"
data_scaled = pd.read_csv(OUTPUT_DIR / "train_data_scaled.csv")
y = data_scaled[target_column]
X = data_scaled.drop(columns=[target_column])

# Keep only the numeric predictors for the correlation analysis.
numeric_cols = X.select_dtypes(include=[np.number]).columns
X_num = X[numeric_cols]

In [None]:
# Absolute Pearson correlation between every pair of numeric features.
corr_matrix = X_num.corr(method="pearson").abs()

# Collect the pairs whose absolute correlation reaches the threshold.
high_corr_threshold = 0.98
corr_values = corr_matrix.to_numpy()
corr_names = corr_matrix.columns

# Strictly-lower-triangle positions, in row-major order: each unordered pair is visited
# exactly once and the diagonal (self-correlation) is skipped.
lower_rows, lower_cols = np.tril_indices(len(corr_names), k=-1)
high_corr_pairs = [
    (corr_names[row], corr_names[col], corr_values[row, col])
    for row, col in zip(lower_rows, lower_cols)
    if corr_values[row, col] >= high_corr_threshold
]

high_corr_df = pd.DataFrame(
    high_corr_pairs, columns=["feature_a", "feature_b", "corr_value"]
)
print("\nHighly correlated feature pairs(|r|>=0.98):")
print(high_corr_df.head(10))

In [None]:
# Drop one member of every highly correlated pair: the second element of each tuple
# (feature_b, the column that comes earlier in the correlation matrix). A set removes
# repeats, so no explicit membership check is needed.
to_drop = {earlier_col for _later_col, earlier_col, _corr in high_corr_pairs}

print(f"\nnumber of highly correlated features removed: {len(to_drop)}")
# Sort before slicing: set iteration order for strings can differ between kernel
# sessions, which made the displayed sample non-reproducible.
print("Removed features:", sorted(to_drop)[:15])

In [None]:
# Remove the redundant features and re-attach the label.
X_reduced = X_num.drop(columns=list(to_drop))
X_reduced[target_column] = y
# Write into OUTPUT_DIR: the next stage reads OUTPUT_DIR / "train_data_reduced.csv",
# so saving to the current working directory left it without (or with a stale) input.
X_reduced.to_csv(OUTPUT_DIR / "train_data_reduced.csv", index=False)

# Heatmap of the absolute correlations computed before the redundant features were dropped.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"shrink": 0.7},
)
plt.title("correlation heatmap of numeric features")
plt.tight_layout()
plt.show()

print("Correlation and redundancy removal complete.")
print("Saved 'train_data_reduced.csv' for feature selection and modeling.")

In [None]:
# X_reduced holds the retained numeric features plus the target column appended above.
print(f"Shape of dataset after removing highly correlated features (incl. target): {X_reduced.shape}")

# Dimensionality reduction, step 2: feature selection or extraction

In [None]:
# Reload the correlation-pruned dataset and split off the label.
target_column = "Class"
reduced_csv_path = OUTPUT_DIR / "train_data_reduced.csv"
data_reduced = pd.read_csv(reduced_csv_path)

y = data_reduced[target_column]
X = data_reduced.drop(columns=[target_column])

In [None]:
# Rank every feature by its mutual information (MI) with the class label.
# The fixed random_state makes the estimate repeatable.
mi_scores = mutual_info_classif(X, y, random_state=42)

mi_df = pd.DataFrame({"feature": X.columns, "mutual_info": mi_scores})
mi_df = mi_df.sort_values(by="mutual_info", ascending=False)

In [None]:
def _seeded_mutual_info(feature_matrix, labels):
    """Score features by mutual information using the same fixed seed (42) as the ranking table."""
    return mutual_info_classif(feature_matrix, labels, random_state=42)


# Cap k at the number of available columns so SelectKBest does not fail when fewer
# than 25 features are left after the earlier pruning steps.
k = min(25, X.shape[1])

# Passing the bare mutual_info_classif leaves its random_state unset, so the selected
# set could change between runs and disagree with the seeded ranking in mi_df.
selector = SelectKBest(_seeded_mutual_info, k=k)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()

In [None]:
# Heading followed by the list of retained feature names.
print(
    f"selected top {k} features based on mutual information:",
    selected_features,
    sep="\n",
)

In [None]:
# Store the names of the selected features, one per row and without an index column.
selected_features_path = OUTPUT_DIR / "selected_features.csv"
pd.Series(selected_features).to_csv(selected_features_path, index=False)

In [None]:
# Wrap the selected feature matrix in a labelled frame and attach the target column.
X_selected_df = pd.DataFrame(X_selected, columns=selected_features).assign(
    **{target_column: y}
)

In [None]:
# Persist the final modelling table (selected features + target).
final_selected_path = OUTPUT_DIR / "final_train_data_selected.csv"
X_selected_df.to_csv(final_selected_path, index=False)

In [None]:
# Bar chart of the 20 highest-ranked features in the mutual-information table.
top_mi = mi_df.head(20)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=top_mi, x="mutual_info", y="feature", palette="crest", ax=ax)
ax.set_title("top 20 features by mutual information with target")
ax.set_xlabel("mutual information score")
ax.set_ylabel("feature name")
fig.tight_layout()
plt.show()

In [None]:
# Project the features onto their first two principal components for a visual check.
# random_state pins the result when scikit-learn picks a randomized SVD solver, so the
# scatter plot is reproducible across runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(6, 5))
# Points are coloured by whether their label equals "Trojan".
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=(y == "Trojan"), cmap="coolwarm", s=5)
plt.title("pca projection of data (2 components)")
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.tight_layout()
plt.show()

In [None]:
# Name the file that was actually written (final_train_data_selected.csv); the previous
# message referred to a file name that this notebook never creates.
print("Dimensionality reduction step 2 complete.'final_train_data_selected.csv' is saved")