In [6]:
from pathlib import Path
import pandas as pd

project_root = Path(r"C:\Users\HP\Desktop\projects 2025\Binary Classification with a Bank Dataset")
data_dir = project_root / "data"

train = pd.read_csv(data_dir / "train_processed.csv")
test = pd.read_csv(data_dir / "test_processed.csv")
sample = pd.read_csv(data_dir / "sample_submission.csv")

print("Using:", data_dir)
print({"train": train.shape, "test": test.shape})

Using: C:\Users\HP\Desktop\projects 2025\Binary Classification with a Bank Dataset\data
{'train': (750000, 18), 'test': (250000, 17)}


In [7]:
print(train.info())
missing = train.isnull().sum().sort_values(ascending=False)
print("Top missing:\n", missing.head(15))
if 'y' in train.columns:
    print("Target balance:", train['y'].value_counts(normalize=True).round(4).to_dict())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         750000 non-null  float64
 1   age        750000 non-null  float64
 2   job        750000 non-null  float64
 3   marital    750000 non-null  float64
 4   education  750000 non-null  float64
 5   default    750000 non-null  float64
 6   balance    750000 non-null  float64
 7   housing    750000 non-null  float64
 8   loan       750000 non-null  float64
 9   contact    750000 non-null  float64
 10  day        750000 non-null  float64
 11  month      750000 non-null  float64
 12  duration   750000 non-null  float64
 13  campaign   750000 non-null  float64
 14  pdays      750000 non-null  float64
 15  previous   750000 non-null  float64
 16  poutcome   750000 non-null  float64
 17  y          750000 non-null  int64  
dtypes: float64(17), int64(1)
memory usage: 103.0 MB
None
Top missing

In [8]:
import numpy as np
num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = train.select_dtypes(exclude=[np.number]).columns.tolist()
print("Numeric:", len(num_cols), "Categorical:", len(cat_cols))
print("Only in train:", sorted(set(train.columns)-set(test.columns)))
print("Only in test:", sorted(set(test.columns)-set(train.columns)))

Numeric: 18 Categorical: 0
Only in train: ['y']
Only in test: []


In [9]:
skew = train[num_cols].drop(columns=['y'], errors='ignore').skew(numeric_only=True).sort_values(ascending=False)
print("Most skewed (top 10):\n", skew.head(10))

corr = train[num_cols].corr(numeric_only=True).abs()
pairs = corr.where(np.triu(np.ones(corr.shape), 1)).stack().sort_values(ascending=False)
print("Highly correlated pairs (>0.9):\n", pairs[pairs > 0.9].head(15))

Most skewed (top 10):
 default      7.442308
loan         2.075377
duration     1.223740
campaign     1.188024
balance      0.945033
contact      0.722495
age          0.498202
job          0.273263
education    0.131906
day          0.054014
dtype: float64


ValueError: Boolean array expected for the condition, not float64

In [10]:
corr = train[num_cols].corr(numeric_only=True).abs()
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
pairs = corr.where(mask).stack().sort_values(ascending=False)
print("Highly correlated pairs (>0.9):")
print(pairs[pairs > 0.9].head(15))

Highly correlated pairs (>0.9):
Series([], dtype: float64)


In [11]:
corr = train[num_cols].corr(numeric_only=True).abs()
ix = np.triu_indices_from(corr, k=1)
pairs = pd.Series(corr.values[ix], index=[(corr.index[i], corr.columns[j]) for i, j in zip(*ix)])
print(pairs[pairs > 0.9].sort_values(ascending=False).head(15))

Series([], dtype: float64)


In [12]:
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

# Load the data you trained on (same source you passed to cv_train)
train = pd.read_csv("data/train_processed.csv")  # or read from combined_fe then filter split=='train'
X = train.drop(columns=["y"])
y = train["y"].astype(int)

# 1) Check if any feature equals y or is perfectly correlated
perfect_cols = []
for c in X.columns:
    if X[c].equals(y) or X[c].equals(1 - y):
        perfect_cols.append(c)
print("Features identical/inverted to y:", perfect_cols)

# 2) Check near-perfect single-feature AUCs
suspicious = []
for c in X.columns:
    try:
        auc = roc_auc_score(y, pd.Series(X[c]).fillna(pd.Series(X[c]).median()))
        if auc >= 0.999 or auc <= 0.001:
            suspicious.append((c, float(auc)))
    except Exception:
        pass
print("Suspicious (AUC ~ 1):", suspicious)

# 3) If using combined_fe, ensure 'split' isn't in features and that 'y' is not leaked
print("Columns that look like split/y flags present in X:",
      [c for c in X.columns if c.lower() in {"split","y","target","label"}])

FileNotFoundError: [Errno 2] No such file or directory: 'data/train_processed.csv'

In [13]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# Point to the dataset you passed via --use-combined
path = r"data\combined_fe.parquet"  # or r"data\combined.parquet"
df = pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)

train = df[df["split"]=="train"].drop(columns=["split"]).copy()
X = train.drop(columns=["y"])
y = train["y"].astype(int)

# 1) Exact matches to y (or inverted)
perfect = [c for c in X.columns if X[c].equals(y) or X[c].equals(1 - y)]
print("Features identical/inverted to y:", perfect)

# 2) Near-perfect single-feature AUCs
suspicious = []
for c in X.columns:
    try:
        v = X[c]
        auc = roc_auc_score(y, v.fillna(v.median()))
        if auc >= 0.999 or auc <= 0.001:
            suspicious.append((c, float(auc)))
    except Exception:
        pass
print("Suspicious (AUC ~ 1):", suspicious)

# 3) Columns that look like flags
print("Flag-like columns present:",
      [c for c in X.columns if c.lower() in {"split","y","target","label"}])


FileNotFoundError: [Errno 2] No such file or directory: 'data\\combined_fe.parquet'

In [14]:
cd "C:\Users\HP\Desktop\projects 2025\Binary Classification with a Bank Dataset"
git status

# Stage modified + new files
git add notebooks/Untitled.ipynb scripts/cv_train.py scripts/data_preprocessing.py scripts/data_integration.py scripts/feature_engineering.py

# Commit with a concise message
git commit -m "feat: add data integration & feature engineering; update CV training and preprocessing; tweak EDA notebook"

# Push to origin/main
git push

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1561518968.py, line 1)