In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from pathlib import Path

In [None]:
# Directory layout: the notebook is expected to run from a sub-folder of the
# project, so the project root is one level above the working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Create the output folder on first run; a pre-existing folder is fine.
OUTPUT_DIR.mkdir(exist_ok=True)

In [None]:
# --- Run parameters -------------------------------------------------------
# Input CSV, resolved relative to the project data directory.
data_file = DATA_DIR.joinpath("train_data.csv")
# Name of the label column in the input file.
target_column = "Class"
# Maximum number of rows / items shown by each report below.
max_display = 20
# Absolute Pearson correlation at or above which a pair is reported.
high_corr_threshold = 0.98
# Near-zero-variance rule: a column is flagged when its distinct-value ratio
# is below nzv_ratio AND its most frequent value covers more than nzv_dominance.
nzv_ratio = 0.01
nzv_dominance = 0.95

In [None]:
# Read the whole file in one pass (low_memory=False) so each column's dtype is
# inferred from all of its values rather than chunk by chunk.
data = pd.read_csv(data_file, low_memory=False)
num_rows = data.shape[0]
num_cols = data.shape[1]
print(f"data loaded from {data_file}|shape= {data.shape}")

In [None]:
# `features` is another name for the same DataFrame object as `data` (no copy
# is made), so every check below runs over all loaded columns.
# NOTE(review): `target_column` is not dropped here, so the label column is
# profiled together with the predictors - confirm this is intended.
features=data

# feature types

In [None]:
def _columns_with_dtype(*dtype_selectors):
    """Return the names of the columns in `features` matching any given dtype selector."""
    matching = features.select_dtypes(include=list(dtype_selectors))
    return matching.columns.tolist()


# Partition columns by pandas dtype family.
numeric_features = _columns_with_dtype(np.number)
categorical_features = _columns_with_dtype("object", "category")
boolean_features = _columns_with_dtype("bool")

In [None]:
# Detect datetime-like columns among those not already classified as numeric,
# categorical or boolean. A column qualifies when a sample of its non-null
# values converts with pd.to_datetime(errors="raise").
#
# Columns already present in one of the three lists are skipped, so
# object-dtype columns that hold date strings are never probed here.
datetime_sample_size = 1000  # upper bound on values probed per column

already_typed = set(numeric_features) | set(categorical_features) | set(boolean_features)

datetime_features = []
for col in features.columns:
    if col in already_typed:
        continue
    non_null = features[col].dropna()
    # Fixed seed so the same rows are probed on every run (reproducible output).
    probe = non_null.sample(min(datetime_sample_size, len(non_null)), random_state=0)
    try:
        pd.to_datetime(probe, errors="raise")
    except (ValueError, TypeError, OverflowError):
        # Conversion failed: the column is not datetime-like.
        continue
    datetime_features.append(col)

In [None]:
# Summarise how many columns fall into each detected type family.
feature_type_counts = {
    "total_features": len(features.columns),
    "numeric": len(numeric_features),
    "categorical": len(categorical_features),
    "boolean": len(boolean_features),
    "datetime_like": len(datetime_features),
}
print("\n feature type counts:")
print(pd.Series(feature_type_counts))

# missing values

In [None]:
# Per-column count of missing values and its share of all rows (in percent,
# rounded to 3 decimals), ordered from most to least missing.
missing_count = features.isna().sum()
missing_pct = missing_count.div(num_rows).mul(100).round(3)
missing_data = pd.DataFrame(
    {"missing_count": missing_count, "missing_percent": missing_pct}
).sort_values("missing_percent", ascending=False)

In [None]:
# Show only the columns that actually contain missing values.
print("\nfeatures with missing values:")
has_missing = missing_data["missing_count"].gt(0)
print(missing_data.loc[has_missing].head(max_display))

# near-zero-variance features, constant features, ID-like features

In [None]:
def is_nzv(column: pd.Series, max_unique_ratio=None, min_dominance=None) -> bool:
    """Return True when `column` has near-zero variance.

    A column is flagged when BOTH conditions hold:
      * distinct non-null values / total rows  <  max_unique_ratio
      * count of the most frequent non-null value / total rows  >  min_dominance

    Both ratios use the full column length (missing values included) as the
    denominator.

    Parameters
    ----------
    column : pd.Series
        Column to test.
    max_unique_ratio : float, optional
        Upper bound (exclusive) on the distinct-value ratio. Defaults to the
        module-level `nzv_ratio`.
    min_dominance : float, optional
        Lower bound (exclusive) on the dominant-value ratio. Defaults to the
        module-level `nzv_dominance`.

    Returns
    -------
    bool
        False for an empty column or one with no non-null values.
    """
    # Fall back to the notebook-level settings only when not given explicitly,
    # so existing single-argument calls behave exactly as before.
    if max_unique_ratio is None:
        max_unique_ratio = nzv_ratio
    if min_dominance is None:
        min_dominance = nzv_dominance

    total = column.size
    if total == 0:
        return False

    unique_count = column.nunique(dropna=True)
    if unique_count == 0:
        return False

    unique_ratio = unique_count / total
    # value_counts() sorts by frequency (descending), so row 0 is the mode.
    counts = column.value_counts(dropna=True)
    dominant_value_ratio = counts.iloc[0] / total if len(counts) else 0

    # bool(...) so callers get a plain Python bool, as the annotation promises,
    # rather than a numpy.bool_.
    return bool(unique_ratio < max_unique_ratio and dominant_value_ratio > min_dominance)

In [None]:
# Constant columns: exactly one distinct value, with NaN counted as a value.
constant_features = [
    col for col in features.columns
    if features[col].nunique(dropna=False) == 1
]

# Near-zero-variance columns. A column whose check raises is skipped (the scan
# is best-effort), but the failure is recorded and reported instead of being
# silently discarded.
nzv_features = []
nzv_check_errors = {}
for col in features.columns:
    try:
        if is_nzv(features[col]):
            nzv_features.append(col)
    except Exception as exc:
        nzv_check_errors[col] = repr(exc)

if nzv_check_errors:
    print(f"near-zero-variance check failed for {len(nzv_check_errors)} column(s): {nzv_check_errors}")

In [None]:
# Report both lists, truncated to the configured display limit.
constant_preview = constant_features[:max_display]
nzv_preview = nzv_features[:max_display]
print(f"\nConstant features (n={len(constant_features)}): {constant_preview}")
print(f"near-zero-variance features (n={len(nzv_features)}): {nzv_preview}")

In [None]:
# ID-like columns: (almost) every row carries its own distinct value.
# NaN counts as a value (dropna=False).
# NOTE(review): continuous numeric columns can also meet this ratio, so the
# result is a list of candidates, not confirmed identifiers.
id_like_min_unique_ratio = 0.999

# The `num_rows > 0` guard avoids a ZeroDivisionError when the input file has
# a header but no data rows; in that case nothing is flagged.
id_like_features = [
    col for col in features.columns
    if num_rows > 0
    and features[col].nunique(dropna=False) / num_rows >= id_like_min_unique_ratio
]

In [None]:
# Report the ID-like candidates, truncated to the configured display limit.
id_like_preview = id_like_features[:max_display]
print(f"\npotential ID-like features (n={len(id_like_features)}): {id_like_preview}")

# duplicate rows and duplicate columns

In [None]:
# A row counts as a duplicate when an identical row appears earlier in the data.
duplicate_row_mask = data.duplicated()
duplicate_rows_count = duplicate_row_mask.sum()
print(f"\nduplicate rows found: {duplicate_rows_count}")

In [None]:
# Find columns whose values are identical row for row.
# Each column is reduced to a fingerprint (the raw bytes of its per-row hashes,
# index ignored); two columns with the same fingerprint are reported as a pair
# (first_seen_column, later_column).
column_map = {}
duplicate_columns = []
for name in features.columns:
    row_hashes = pd.util.hash_pandas_object(features[name], index=False).values
    fingerprint = row_hashes.tobytes()
    if fingerprint not in column_map:
        # First column with this content: remember it as the reference.
        column_map[fingerprint] = name
        continue
    duplicate_columns.append((column_map[fingerprint], name))

In [None]:
# Report the duplicate column pairs, truncated to the configured display limit.
duplicate_columns_preview = duplicate_columns[:max_display]
print(f"duplicate column pairs: {duplicate_columns_preview}")
# Optional CSV export of all pairs (currently disabled):
#pd.DataFrame(duplicate_columns,columns=["col_a","col_b"]).to_csv("duplicate_columns.csv",index=False)

# highly correlated feature pairs

In [None]:
# Collect every pair of numeric columns whose absolute Pearson correlation
# reaches high_corr_threshold, strongest first. Each entry is
# (column_a, column_b, correlation).
high_corr_pairs = []
if len(numeric_features) >= 2:
    # Values are cast to float32 before the correlation matrix is computed.
    numeric_data = features[numeric_features].astype("float32")
    corr_matrix = numeric_data.corr(method="pearson")

    labels = corr_matrix.columns
    n_labels = len(labels)
    # Walk the strict upper triangle so each unordered pair is visited once
    # and the diagonal (self-correlation) is skipped.
    upper_triangle = (
        (labels[row], labels[column], corr_matrix.iat[row, column])
        for row in range(n_labels)
        for column in range(row + 1, n_labels)
    )
    high_corr_pairs = [
        (left, right, float(r_value))
        for left, right, r_value in upper_triangle
        if pd.notna(r_value) and abs(r_value) >= high_corr_threshold
    ]
high_corr_pairs = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)

In [None]:
# Report the strongest correlated pairs, truncated to the configured display limit.
top_corr_pairs = high_corr_pairs[:max_display]
print(f"\nhigh-correlation feature pairs |r|>={high_corr_threshold} (showing up to {max_display}):")
print(top_corr_pairs)