In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
np.random.seed(42)

##Predict whether a telecom customer will churn (leave) next month

In [9]:
##read and describe the data 

df = pd.read_csv("/Users/shiva/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.info(memory_usage='deep')
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


(7043, 21)

In [5]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
##Predict whether a telecom customer will churn (leave) next month
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
##dataframe memory optimization

float_cols = df.select_dtypes(include=['float64']).columns
int_cols = df.select_dtypes(include=['int64']).columns

df[float_cols] = df[float_cols].astype('float32')
df[int_cols] = df[int_cols].astype('int32')

print("Memory optimization of column types:")
df.info(memory_usage='deep')

Memory optimization of column types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int32  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int32  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  Paper

In [13]:
##Parse transaction timestamp and derive features
##Dop unecessary ID/PII columns - Purpose: Remove columns that leak information or have no predictive value. 
##Why: IDs and PII add noise or risk leakage but rarely generalize.


df.drop(columns=['customerID','PaperlessBilling'],inplace=True)
df.columns
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int32  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int32  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaymentMethod     7043 non-null   object 
 16  MonthlyCharges    7043 non-null   float32


In [18]:
# churn_ratio = df['Churn'].mean()
# print(f"Churn fraction: {churn_ratio:.6f} ({churn_ratio*100:.3f}% )")

##Target imbalance
df['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [19]:
df.dtypes.value_counts()

object     16
int32       2
float32     1
Name: count, dtype: int64

In [20]:
##Missing value summary
missing_pct = df.isna().mean().sort_values(ascending=False)
print(missing_pct[missing_pct>0])

Series([], dtype: float64)


In [21]:
##Categorical Cardinality
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols].nunique().sort_values(ascending=False)

TotalCharges        6531
PaymentMethod          4
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
gender                 2
Partner                2
Dependents             2
PhoneService           2
Churn                  2
dtype: int64

In [22]:
##Basic Statistics and outlier
##summarize distributions and sport anamolieis

df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
gender,7043.0,2.0,Male,3555.0,,,,,,,
SeniorCitizen,7043.0,,,,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
Partner,7043.0,2.0,No,3641.0,,,,,,,
Dependents,7043.0,2.0,No,4933.0,,,,,,,
tenure,7043.0,,,,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
PhoneService,7043.0,2.0,Yes,6361.0,,,,,,,
MultipleLines,7043.0,3.0,No,3390.0,,,,,,,
InternetService,7043.0,3.0,Fiber optic,3096.0,,,,,,,
OnlineSecurity,7043.0,3.0,No,3498.0,,,,,,,
OnlineBackup,7043.0,3.0,No,3088.0,,,,,,,


In [23]:
    # Class counts & ratios
    counts = df['Churn'].value_counts()
    ratios = df['Churn'].value_counts(normalize=True)
    print(pd.concat([counts, ratios], axis=1, keys=['count','ratio']))

       count    ratio
Churn                
No      5174  0.73463
Yes     1869  0.26537


In [26]:
##Leakage checks

# ✅ Only use numeric columns for correlation
# numeric_cols = df.select_dtypes(include=['int32']).columns
# num_corr = df[numeric_cols].corr()['Churn'].abs().sort_values(ascending=False)

# print("Top numeric correlations:")
# print(num_corr.head(10))

cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    try:
        if df.groupby(col)['Churn'].nunique().eq(1).all():
            print(f"⚠️ Potential leakage in '{col}' — perfect predictor")
    except Exception as e:
        print(f"Could not evaluate column {col}: {e}")

⚠️ Potential leakage in 'Churn' — perfect predictor


In [27]:
dup_count = df.duplicated().sum()
print(f"Duplicate rows: {dup_count}")
if dup_count > 0:
    df.drop_duplicates(inplace=True)
    print("Duplicates dropped")

Duplicate rows: 34
Duplicates dropped


In [31]:
import pyarrow
df.to_parquet(r'/Users/shiva/Downloads/telco_customer_cleaned.parquet', index=False)
print("Cleaned data saved to 'telco_customer_cleaned.parquet'")

Cleaned data saved to 'telco_customer_cleaned.parquet'


In [32]:
##Feature engineering steps
df = pd.read_parquet('/Users/shiva/Downloads/telco_customer_cleaned.parquet')

In [36]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7009 entries, 0 to 7008
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7009 non-null   object 
 1   SeniorCitizen     7009 non-null   int32  
 2   Partner           7009 non-null   object 
 3   Dependents        7009 non-null   object 
 4   tenure            7009 non-null   int32  
 5   PhoneService      7009 non-null   object 
 6   MultipleLines     7009 non-null   object 
 7   InternetService   7009 non-null   object 
 8   OnlineSecurity    7009 non-null   object 
 9   OnlineBackup      7009 non-null   object 
 10  DeviceProtection  7009 non-null   object 
 11  TechSupport       7009 non-null   object 
 12  StreamingTV       7009 non-null   object 
 13  StreamingMovies   7009 non-null   object 
 14  Contract          7009 non-null   object 
 15  PaymentMethod     7009 non-null   object 
 16  MonthlyCharges    7009 non-null   float32


In [37]:
# Impute missing values if present (example: none expected)
# Numeric: median, Categorical: mode or 'Unknown'
for col in df.columns:
    if df[col].isna().sum() > 0:
        if df[col].dtype in ['float32','float64','int32','int64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna('Unknown', inplace=True)
print("Missing values after imputation:", df.isna().sum().sum())

Missing values after imputation: 0


In [57]:
##Encoding categorical variables
from sklearn.preprocessing import LabelEncoder

df.columns
# if 'gender' in df.columns:
#     df['gender_enc'] = LabelEncoder().fit_transform(df['gender'])
#     df.drop(columns=['gender'], inplace=True)
# if 'Partner' in df.columns:
#     df['partner_enc'] = LabelEncoder().fit_transform(df['Partner'])
#     df.drop(columns=['Partner'], inplace=True)
# if 'Dependents' in df.columns:
#     df['dep_enc'] = LabelEncoder().fit_transform(df['Dependents'])
#     df.drop(columns=['Dependents'], inplace=True)
# if 'PhoneService' in df.columns:
#     df['phoneservice_enc'] = LabelEncoder().fit_transform(df['PhoneService'])
#     df.drop(columns=['PhoneService'], inplace=True)
# if 'OnlineSecurity' in df.columns:
#     df['onlineSec_enc'] = LabelEncoder().fit_transform(df['OnlineSecurity'])
#     df.drop(columns=['OnlineSecurity'], inplace=True)
# if 'OnlineBackup' in df.columns:
#     df['onlinebkp_enc'] = LabelEncoder().fit_transform(df['OnlineBackup'])
#     df.drop(columns=['OnlineBackup'], inplace=True)
# if 'DeviceProtection' in df.columns:
#     df['deviceProtection_enc'] = LabelEncoder().fit_transform(df['DeviceProtection'])
#     df.drop(columns=['DeviceProtection'], inplace=True)
# if 'TechSupport' in df.columns:
#     df['techsupport_enc'] = LabelEncoder().fit_transform(df['TechSupport'])
#     df.drop(columns=['TechSupport'], inplace=True)
# if 'Churn' in df.columns:
#     df['churn_enc'] = LabelEncoder().fit_transform(df['Churn'])
#     df['churn']=df['churn_enc']
#     df.drop(columns=['Churn_enc'], inplace=True)
    
# if 'StreamingTV' in df.columns:
#     df['streaming_enc'] = LabelEncoder().fit_transform(df['StreamingTV'])
#     df.drop(columns=['StreamingTV'], inplace=True)
# if 'StreamingMovies' in df.columns:
#     df['streamMovies_enc'] = LabelEncoder().fit_transform(df['StreamingMovies'])
#     df.drop(columns=['StreamingMovies'], inplace=True)
# if 'StreamingMovies' in df.columns:
#     df['streamMovies_enc'] = LabelEncoder().fit_transform(df['StreamingMovies'])
#     df.drop(columns=['StreamingMovies'], inplace=True)

# columns you want to label‑encode → new column name
ENCODE_MAP = {
    "gender":         "gender_enc",
    "Partner":        "partner_enc",
    "Dependents":     "dep_enc",
    "PhoneService":   "phoneservice_enc",
    "OnlineSecurity": "onlineSec_enc",
    "OnlineBackup":   "onlinebkp_enc",
    "DeviceProtection":"deviceProtection_enc",
    "TechSupport":    "techsupport_enc",
    "StreamingTV":    "streamingTV_enc",
    "StreamingMovies":"streamMovies_enc",
    "Churn":          "churn"          # map label → final target
}

encoders = {}                      # (optional) keep fitted encoders

for col, new_col in ENCODE_MAP.items():
    if col in df.columns:
        le = LabelEncoder()
        df[new_col] = le.fit_transform(df[col])
        encoders[col] = le          # store for inverse‑transform / inference
        df.drop(columns=[col], inplace=True)
df.head(5)

Unnamed: 0,SeniorCitizen,tenure,MultipleLines,MonthlyCharges,TotalCharges,gender_enc,partner_enc,dep_enc,phoneservice_enc,onlineSec_enc,...,streaming_enc,streamMovies_enc,churn_enc,cat_Fiber optic,cat_No,cat_One year,cat_Two year,cat_Credit card (automatic),cat_Electronic check,cat_Mailed check
0,0,1,No phone service,29.85,29.85,0,1,0,0,0,...,0,0,0,False,False,False,False,False,True,False
1,0,34,No,56.950001,1889.5,1,0,0,1,2,...,0,0,0,False,False,True,False,False,False,True
2,0,2,No,53.849998,108.15,1,0,0,1,2,...,0,0,1,False,False,False,False,False,False,True
3,0,45,No phone service,42.299999,1840.75,1,0,0,0,2,...,0,0,0,False,False,True,False,False,False,False
4,0,2,No,70.699997,151.65,0,0,0,1,0,...,0,0,1,True,False,False,False,False,True,False


In [58]:
# # One-hot for low-cardinality 'category'
# df = pd.concat([df, pd.get_dummies(df['InternetService'], prefix='cat', drop_first=True)], axis=1)
# df = pd.concat([df, pd.get_dummies(df['Contract'], prefix='cat', drop_first=True)], axis=1)
# df = pd.concat([df, pd.get_dummies(df['PaymentMethod'], prefix='cat', drop_first=True)], axis=1)
#df.drop(columns=['PaymentMethod','InternetService','Contract','MultipleLines'], inplace=True)
df = pd.concat([df, pd.get_dummies(df['MultipleLines'], prefix='cat', drop_first=True)], axis=1)
df.drop(columns=['MultipleLines'], inplace=True)
df.head(5)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_enc,partner_enc,dep_enc,phoneservice_enc,onlineSec_enc,onlinebkp_enc,...,churn_enc,cat_Fiber optic,cat_No,cat_One year,cat_Two year,cat_Credit card (automatic),cat_Electronic check,cat_Mailed check,cat_No phone service,cat_Yes
0,0,1,29.85,29.85,0,1,0,0,0,2,...,0,False,False,False,False,False,True,False,True,False
1,0,34,56.950001,1889.5,1,0,0,1,2,0,...,0,False,False,True,False,False,False,True,False,False
2,0,2,53.849998,108.15,1,0,0,1,2,2,...,1,False,False,False,False,False,False,True,False,False
3,0,45,42.299999,1840.75,1,0,0,0,2,0,...,0,False,False,True,False,False,False,False,True,False
4,0,2,70.699997,151.65,0,0,0,1,0,0,...,1,True,False,False,False,False,True,False,False,False


In [62]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7009 entries, 0 to 7008
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   SeniorCitizen                7009 non-null   int32  
 1   tenure                       7009 non-null   int32  
 2   MonthlyCharges               7009 non-null   float32
 3   TotalCharges                 7009 non-null   object 
 4   gender_enc                   7009 non-null   int64  
 5   partner_enc                  7009 non-null   int64  
 6   dep_enc                      7009 non-null   int64  
 7   phoneservice_enc             7009 non-null   int64  
 8   onlineSec_enc                7009 non-null   int64  
 9   onlinebkp_enc                7009 non-null   int64  
 10  deviceProtection_enc         7009 non-null   int64  
 11  techsupport_enc              7009 non-null   int64  
 12  streaming_enc                7009 non-null   int64  
 13  streamMovies_enc  

In [63]:
##Log Transform skewed numeric features

import numpy as np

# Log transform amount
df['mnthCharges_log'] = np.log1p(df['MonthlyCharges'])
df.drop(columns=['MonthlyCharges'], inplace=True)

# Check skew
print("Skewness mnthCharges_log:", df['mnthCharges_log'].skew())

Skewness mnthCharges_log: -0.7286593


In [66]:
# Log transform amount
# Suppose df["TotalCharges"] is object dtype
df["TotalCharges"] = (
    pd.to_numeric(df["TotalCharges"], errors="coerce")   # strings → numeric, bad values → NaN
      .astype(np.float32)                                # down‑cast to float32
)
df['totalchrgs_log'] = np.log1p(df['TotalCharges'])
df.drop(columns=['TotalCharges'], inplace=True)

# Check skew
print("Skewness totalchrgs_log:", df['totalchrgs_log'].skew())

Skewness totalchrgs_log: -0.7394558


In [67]:
##Feature scalling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scale_cols = ['totalchrgs_log','mnthCharges_log']
df[scale_cols] = scaler.fit_transform(df[scale_cols])

df[scale_cols].head()

Unnamed: 0,totalchrgs_log,mnthCharges_log
0,-2.30984,-1.067616
1,0.381721,0.022089
2,-1.483439,-0.07294
3,0.364634,-0.481636
4,-1.264068,0.390098


In [69]:
##Final Feature Matrix
df = df.rename(columns={"churn_enc": "churn"})
# Drop any remaining raw columns if needed
# Prepare X and y
import numpy as np
# Keep only numeric columns
X = df.drop(columns=['churn'])

# X.select_dtypes(exclude=['number']).columns
# Fill missing or problematic values
X = X.fillna(0)
X = X.replace([np.inf, -np.inf], 0)

# Confirm shape and types
print(X.dtypes)
print(X.shape)



# X = df.drop(columns=['trans_ts','is_fraud','lat','long','merch_lat','merch_long','dob'])
# y = df['is_fraud']
print("Final feature matrix shape:", X.shape)
print("Features:", X.columns.tolist())

SeniorCitizen                    int32
tenure                           int32
gender_enc                       int64
partner_enc                      int64
dep_enc                          int64
phoneservice_enc                 int64
onlineSec_enc                    int64
onlinebkp_enc                    int64
deviceProtection_enc             int64
techsupport_enc                  int64
streaming_enc                    int64
streamMovies_enc                 int64
cat_Fiber optic                   bool
cat_No                            bool
cat_One year                      bool
cat_Two year                      bool
cat_Credit card (automatic)       bool
cat_Electronic check              bool
cat_Mailed check                  bool
cat_No phone service              bool
cat_Yes                           bool
mnthCharges_log                float32
totalchrgs_log                 float32
dtype: object
(7009, 23)
Final feature matrix shape: (7009, 23)
Features: ['SeniorCitizen', 'tenure', '

In [70]:
df.to_parquet(r'/Users/shiva/Downloads/telco_customer_cleaned_featured.parquet', index=False)
print("Featured data saved to 'telco_customer_cleaned_featured.parquet'")


Featured data saved to 'telco_customer_cleaned_featured.parquet'


In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_parquet(r'/Users/shiva/Downloads/telco_customer_cleaned_featured.parquet')
X = df.drop(columns=['churn'])
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
print(X_train.shape, X_test.shape, y_train.mean())

(5607, 23) (1402, 23) 0.26431246655965757


In [90]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline   # imbalanced‑learn wrapper

# ───────────────────────────────────────────────────────────
# 1.  Feature‑type splits
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include='object').columns

numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler())
])
categorical_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe',    OneHotEncoder(handle_unknown='ignore'))
])

preproc = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# ───────────────────────────────────────────────────────────
# 2.  Full pipeline:  Impute/Encode  →  SMOTE  →  RandomForest
rf_pipe = ImbPipeline([
    ('pre',   preproc),
    ('smote', SMOTE(random_state=42)),
    ('clf',   RandomForestClassifier(
                  n_estimators=400,
                  max_depth=None,
                  min_samples_leaf=2,
                  class_weight='balanced',   # handles remaining imbalance
                  random_state=42,
                  n_jobs=-1
    ))
])
from sklearn.metrics import classification_report, roc_auc_score,average_precision_score
# Example fit / evaluate
rf_pipe.fit(X_train, y_train)
y_pred   = rf_pipe.predict(X_test)
y_proba  = rf_pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=3))
print("ROC‑AUC:", roc_auc_score(y_test, y_proba).round(3))

from sklearn.metrics import classification_report, roc_auc_score,average_precision_score

##validate
# rf_pipe.fit(X_train, y_train)        # should run without NaN error
print("AP:", average_precision_score(y_test, rf_pipe.predict_proba(X_test)[:,1]))

# ap = average_precision_score(y_test, y_proba)
# print("Average Precision (PR‑AUC):", round(ap, 3))

              precision    recall  f1-score   support

           0      0.846     0.731     0.785      1031
           1      0.458     0.631     0.531       371

    accuracy                          0.705      1402
   macro avg      0.652     0.681     0.658      1402
weighted avg      0.743     0.705     0.717      1402

ROC‑AUC: 0.741
AP: 0.4694126925348697
Average Precision (PR‑AUC): 0.469


In [86]:
# ##Construct pipeline with smote oversampling 

# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier,HistGradientBoostingClassifier

# pipe = Pipeline([
#     ('smote', SMOTE(random_state=42)),
#     ('clf', HistGradientBoostingClassifier(   # name it ‘clf’ or ‘hgb’
#         learning_rate=0.1,
#         max_depth=None,
#         # remove n_jobs
#         # remove class_weight OR upgrade to sklearn>=1.1
#         random_state=42
#     ))
# ])

# from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
# from scipy.stats import randint, uniform

# param_dist = {
#     # number of boosting iterations
#     "clf__max_iter": randint(300, 1000),          # instead of n_estimators
#     "clf__max_depth": randint(5, 20),
#     # remove min_samples_split  ← not supported
#     "clf__min_samples_leaf": randint(1, 5),
#     "clf__max_features": uniform(0.2, 0.6),       # float ∈ (0,1] OK
#     # optional extra knobs that HGB does have:
#     # "clf__learning_rate": uniform(0.03, 0.15),
#     # "clf__l2_regularization": uniform(0.0, 1.0)
# }
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# search = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=30,
#     scoring="average_precision",
#     cv=cv,
#     n_jobs=-1,
#     random_state=42,
#     verbose=2,
#     error_score="raise"
# )

# ##Fit the best model

# search.fit(X_train, y_train)
# print("Best PR-AUC:", search.best_score_)
# print("Best params:", search.best_params_)

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import HistGradientBoostingClassifier

# 1. Preprocessing transformers (numeric + categorical)
numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),      # <-- fill NaN here
    ('scale', StandardScaler())
])
categorical_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

pre = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 2. Full imbalanced pipeline
pipe = ImbPipeline([
    ('pre', pre),                       # imputes inside ColumnTransformer
    ('smote', SMOTE(random_state=42)),  # now sees no NaN
    ('clf', HistGradientBoostingClassifier(
        max_iter=500,
        max_depth=None,
        min_samples_leaf=2,
        random_state=42
    ))
])

from sklearn.metrics import classification_report, roc_auc_score,average_precision_score

##validate
pipe.fit(X_train, y_train)        # should run without NaN error
print("AP:", average_precision_score(y_test, pipe.predict_proba(X_test)[:,1]))


AP: 0.4450124270874489


In [80]:
# ##HyperparameterTuning using RandomSearch
# from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
# from scipy.stats import randint, uniform

# param_dist = {
#     'clf__n_estimators': randint(300, 1000),
#     'clf__max_depth': randint(5, 20),
#     'clf__min_samples_split': randint(2, 10),
#     'clf__min_samples_leaf': randint(1, 5),
#     'clf__max_features': uniform(0.2, 0.6)
# }

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# search = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=30,
#     scoring='average_precision',
#     cv=cv,
#     n_jobs=-1,
#     random_state=42,
#     verbose=2,
#     error_score='raise'  # for debugging
# )

In [83]:
# ##Fit the best model

# search.fit(X_train, y_train)
# print("Best PR-AUC:", search.best_score_)
# print("Best params:", search.best_params_)

In [91]:
##Model Evaluation

from sklearn.metrics import average_precision_score, roc_auc_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

print("Test PR-AUC:", average_precision_score(y_test, y_proba))
print("Test ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test)
plt.show()

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'