In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
np.random.seed(42)

In [3]:
df = pd.read_csv("/Users/shiva/Downloads/churn_user_features.csv")
df.info(memory_usage='deep')
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              10000 non-null  int64  
 1   user_id                 10000 non-null  object 
 2   age                     10000 non-null  int64  
 3   region                  10000 non-null  object 
 4   device_type             10000 non-null  object 
 5   current_tenure          10000 non-null  int64  
 6   time_to_renewal         10000 non-null  int64  
 7   promo_flag              10000 non-null  bool   
 8   total_hours_streamed    10000 non-null  float64
 9   avg_session_length      10000 non-null  float64
 10  avg_startup_latency     10000 non-null  float64
 11  p95_startup_latency     10000 non-null  float64
 12  total_rebuffers         10000 non-null  int64  
 13  rebuffers_per_session   10000 non-null  float64
 14  avg_throughput          10000 non-null 

(10000, 22)

In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,age,current_tenure,time_to_renewal,total_hours_streamed,avg_session_length,avg_startup_latency,p95_startup_latency,total_rebuffers,rebuffers_per_session,avg_throughput,slow_throughput_rate,avg_jitter,month_hours_streamed,month_session_count,usage_trend_slope,ticket_count,days_since_last_ticket
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,3945.0
mean,4999.5,44.0217,194.882,-90.6077,10.004899,1.745234,551.054162,650.95405,2.0048,1.488886,5.493334,0.504703,106.169214,20.007668,24.9267,-0.055471,0.5052,30.213942
std,2886.89568,15.203998,97.217962,105.751336,7.077026,0.727252,258.016198,264.366532,1.421682,0.864787,2.590829,0.28983,54.684132,14.270783,14.219668,2.897946,0.715698,17.17804
min,0.0,18.0,30.0,-333.0,0.068293,0.500365,100.251654,108.950956,0.0,7e-06,1.001808,1.8e-05,10.002074,0.103298,1.0,-4.999513,0.0,1.0
25%,2499.75,31.0,111.0,-174.0,4.831897,1.115773,329.782699,428.515932,1.0,0.742734,3.259962,0.251836,59.372482,9.599375,13.0,-2.610195,0.0,15.0
50%,4999.5,44.0,194.0,-90.0,8.378878,1.734848,550.146066,651.479629,2.0,1.478008,5.476932,0.505851,106.968069,16.778718,25.0,-0.046222,0.0,30.0
75%,7499.25,57.0,278.0,-7.0,13.391402,2.38201,773.246005,871.151982,3.0,2.223322,7.768558,0.755832,153.666978,26.948818,37.0,2.473903,1.0,45.0
max,9999.0,70.0,364.0,144.0,58.590612,2.999732,999.733113,1196.823829,10.0,2.999807,9.998971,0.999984,199.992585,119.161297,49.0,4.99975,5.0,59.0


In [5]:
##Predict whether a customer will churn (leave) next month
df.head(5)

Unnamed: 0.1,Unnamed: 0,user_id,age,region,device_type,current_tenure,time_to_renewal,promo_flag,total_hours_streamed,avg_session_length,...,total_rebuffers,rebuffers_per_session,avg_throughput,slow_throughput_rate,avg_jitter,month_hours_streamed,month_session_count,usage_trend_slope,ticket_count,days_since_last_ticket
0,0,U00001,56,US,mobile,356,-231,False,8.644639,2.70579,...,5,1.426019,8.120058,0.948775,194.134075,6.875991,45,1.066576,1,4.0
1,1,U00002,69,EU,smart_tv,296,-140,False,3.677074,2.365369,...,3,2.218157,1.790549,0.314351,66.308197,23.163118,3,4.633615,0,
2,2,U00003,46,US,mobile,299,-209,False,8.03953,0.547339,...,3,2.708621,1.700032,0.228268,129.994075,46.16592,6,1.414259,0,
3,3,U00004,32,LATAM,mobile,157,-54,False,4.930183,2.524625,...,2,1.920391,4.740932,0.902906,162.650231,16.303295,15,-2.275332,0,
4,4,U00005,60,MEA,tablet,293,-232,False,7.194304,0.567824,...,1,2.259991,8.67375,0.500406,24.976007,30.194688,22,-2.600993,0,


In [6]:
##dataframe memory optimization

float_cols = df.select_dtypes(include=['float64']).columns
int_cols = df.select_dtypes(include=['int64']).columns

df[float_cols] = df[float_cols].astype('float32')
df[int_cols] = df[int_cols].astype('int32')

print("Memory optimization of column types:")
df.info(memory_usage='deep')

Memory optimization of column types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              10000 non-null  int32  
 1   user_id                 10000 non-null  object 
 2   age                     10000 non-null  int32  
 3   region                  10000 non-null  object 
 4   device_type             10000 non-null  object 
 5   current_tenure          10000 non-null  int32  
 6   time_to_renewal         10000 non-null  int32  
 7   promo_flag              10000 non-null  bool   
 8   total_hours_streamed    10000 non-null  float32
 9   avg_session_length      10000 non-null  float32
 10  avg_startup_latency     10000 non-null  float32
 11  p95_startup_latency     10000 non-null  float32
 12  total_rebuffers         10000 non-null  int32  
 13  rebuffers_per_session   10000 non-null  float32
 14  av

In [None]:
##Parse transaction timestamp and derive features
##Dop unecessary ID/PII columns - Purpose: Remove columns that leak information or have no predictive value. 
##Why: IDs and PII add noise or risk leakage but rarely generalize.


# df.drop(columns=['customerID','PaperlessBilling'],inplace=True)
# df.columns
# df.info(memory_usage='deep')

In [8]:
# Drop card number, transaction ID, personal identifiers
df.drop(columns=['Unnamed: 0'], inplace=True)

# Preview remaining columns
df.columns

Index(['user_id', 'age', 'region', 'device_type', 'current_tenure',
       'time_to_renewal', 'promo_flag', 'total_hours_streamed',
       'avg_session_length', 'avg_startup_latency', 'p95_startup_latency',
       'total_rebuffers', 'rebuffers_per_session', 'avg_throughput',
       'slow_throughput_rate', 'avg_jitter', 'month_hours_streamed',
       'month_session_count', 'usage_trend_slope', 'ticket_count',
       'days_since_last_ticket'],
      dtype='object')

In [9]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 10000 non-null  object 
 1   age                     10000 non-null  int32  
 2   region                  10000 non-null  object 
 3   device_type             10000 non-null  object 
 4   current_tenure          10000 non-null  int32  
 5   time_to_renewal         10000 non-null  int32  
 6   promo_flag              10000 non-null  bool   
 7   total_hours_streamed    10000 non-null  float32
 8   avg_session_length      10000 non-null  float32
 9   avg_startup_latency     10000 non-null  float32
 10  p95_startup_latency     10000 non-null  float32
 11  total_rebuffers         10000 non-null  int32  
 12  rebuffers_per_session   10000 non-null  float32
 13  avg_throughput          10000 non-null  float32
 14  slow_throughput_rate    10000 non-null 

In [10]:
df.head(5)

Unnamed: 0,user_id,age,region,device_type,current_tenure,time_to_renewal,promo_flag,total_hours_streamed,avg_session_length,avg_startup_latency,...,total_rebuffers,rebuffers_per_session,avg_throughput,slow_throughput_rate,avg_jitter,month_hours_streamed,month_session_count,usage_trend_slope,ticket_count,days_since_last_ticket
0,U00001,56,US,mobile,356,-231,False,8.644639,2.70579,752.148376,...,5,1.426019,8.120058,0.948775,194.134079,6.87599,45,1.066576,1,4.0
1,U00002,69,EU,smart_tv,296,-140,False,3.677074,2.365369,638.462219,...,3,2.218158,1.790549,0.314351,66.308197,23.163118,3,4.633615,0,
2,U00003,46,US,mobile,299,-209,False,8.03953,0.547339,367.117706,...,3,2.708621,1.700032,0.228268,129.99408,46.16592,6,1.414259,0,
3,U00004,32,LATAM,mobile,157,-54,False,4.930182,2.524625,290.034729,...,2,1.920391,4.740932,0.902906,162.650238,16.303295,15,-2.275332,0,
4,U00005,60,MEA,tablet,293,-232,False,7.194304,0.567824,361.935791,...,1,2.25999,8.67375,0.500406,24.976007,30.194689,22,-2.600992,0,


In [11]:
##Calculate the formula for churn

##define rules - 
##1. when user  

df['churn_label'] = ((df['time_to_renewal'] >= 0) &
                     (df['time_to_renewal'] <= 30)).astype(int)

In [12]:
df.head(5)

Unnamed: 0,user_id,age,region,device_type,current_tenure,time_to_renewal,promo_flag,total_hours_streamed,avg_session_length,avg_startup_latency,...,rebuffers_per_session,avg_throughput,slow_throughput_rate,avg_jitter,month_hours_streamed,month_session_count,usage_trend_slope,ticket_count,days_since_last_ticket,churn_label
0,U00001,56,US,mobile,356,-231,False,8.644639,2.70579,752.148376,...,1.426019,8.120058,0.948775,194.134079,6.87599,45,1.066576,1,4.0,0
1,U00002,69,EU,smart_tv,296,-140,False,3.677074,2.365369,638.462219,...,2.218158,1.790549,0.314351,66.308197,23.163118,3,4.633615,0,,0
2,U00003,46,US,mobile,299,-209,False,8.03953,0.547339,367.117706,...,2.708621,1.700032,0.228268,129.99408,46.16592,6,1.414259,0,,0
3,U00004,32,LATAM,mobile,157,-54,False,4.930182,2.524625,290.034729,...,1.920391,4.740932,0.902906,162.650238,16.303295,15,-2.275332,0,,0
4,U00005,60,MEA,tablet,293,-232,False,7.194304,0.567824,361.935791,...,2.25999,8.67375,0.500406,24.976007,30.194689,22,-2.600992,0,,0


In [14]:
# Fraction of fraud cases
fraud_ratio = df['churn_label'].mean()
print(f"churn fraction: {fraud_ratio:.6f} ({fraud_ratio*100:.3f}% )")
df['churn_label'].value_counts()

churn fraction: 0.085600 (8.560% )


churn_label
0    9144
1     856
Name: count, dtype: int64

In [15]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 10000 non-null  object 
 1   age                     10000 non-null  int32  
 2   region                  10000 non-null  object 
 3   device_type             10000 non-null  object 
 4   current_tenure          10000 non-null  int32  
 5   time_to_renewal         10000 non-null  int32  
 6   promo_flag              10000 non-null  bool   
 7   total_hours_streamed    10000 non-null  float32
 8   avg_session_length      10000 non-null  float32
 9   avg_startup_latency     10000 non-null  float32
 10  p95_startup_latency     10000 non-null  float32
 11  total_rebuffers         10000 non-null  int32  
 12  rebuffers_per_session   10000 non-null  float32
 13  avg_throughput          10000 non-null  float32
 14  slow_throughput_rate    10000 non-null 

In [16]:
missing_pct = df.isna().mean().sort_values(ascending=False)
print(missing_pct[missing_pct>0])

days_since_last_ticket    0.6055
dtype: float64


In [17]:
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols].nunique().sort_values(ascending=False)

user_id        10000
region             5
device_type        4
dtype: int64

In [18]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
user_id,10000.0,10000.0,U00001,1.0,,,,,,,
age,10000.0,,,,44.0217,15.203998,18.0,31.0,44.0,57.0,70.0
region,10000.0,5.0,EU,2056.0,,,,,,,
device_type,10000.0,4.0,tablet,2548.0,,,,,,,
current_tenure,10000.0,,,,194.882,97.217962,30.0,111.0,194.0,278.0,364.0
time_to_renewal,10000.0,,,,-90.6077,105.751336,-333.0,-174.0,-90.0,-7.0,144.0
promo_flag,10000.0,2.0,False,7038.0,,,,,,,
total_hours_streamed,10000.0,,,,10.004899,7.077026,0.068293,4.831897,8.378878,13.391402,58.590611
avg_session_length,10000.0,,,,1.745234,0.727252,0.500365,1.115773,1.734848,2.38201,2.999732
avg_startup_latency,10000.0,,,,551.054138,258.016205,100.251656,329.782692,550.146088,773.246017,999.733093


In [19]:
# Class counts & ratios
counts = df['churn_label'].value_counts()
ratios = df['churn_label'].value_counts(normalize=True)
print(pd.concat([counts, ratios], axis=1, keys=['count','ratio']))

             count   ratio
churn_label               
0             9144  0.9144
1              856  0.0856


In [20]:
# Only select numeric columns for correlation
num_cols = df.select_dtypes(include=['number']).columns

# Compute correlations with 'is_fraud'
num_corr = df[num_cols].corr()['churn_label'].abs().sort_values(ascending=False)

print("Top numeric correlations with churn_label:")
print(num_corr.head(10))


Top numeric correlations with churn_label:
churn_label              1.000000
current_tenure           0.308914
time_to_renewal          0.303332
total_rebuffers          0.018075
avg_jitter               0.016897
age                      0.012470
month_hours_streamed     0.012293
total_hours_streamed     0.011993
rebuffers_per_session    0.007801
ticket_count             0.007266
Name: churn_label, dtype: float64


In [21]:
##Leakage checks

# ✅ Only use numeric columns for correlation
# numeric_cols = df.select_dtypes(include=['int32']).columns
# num_corr = df[numeric_cols].corr()['Churn'].abs().sort_values(ascending=False)

# print("Top numeric correlations:")
# print(num_corr.head(10))

cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    try:
        if df.groupby(col)['churn_label'].nunique().eq(1).all():
            print(f"⚠️ Potential leakage in '{col}' — perfect predictor")
    except Exception as e:
        print(f"Could not evaluate column {col}: {e}")

⚠️ Potential leakage in 'user_id' — perfect predictor


In [22]:
dup_count = df.duplicated().sum()
print(f"Duplicate rows: {dup_count}")
if dup_count > 0:
    df.drop_duplicates(inplace=True)
    print("Duplicates dropped")

Duplicate rows: 0


In [23]:
import pyarrow
df.to_parquet(r'/Users/shiva/Downloads/churn_prediction_cleaned.parquet', index=False)
print("Cleaned data saved to 'churn_prediction_cleaned.parquet'")

Cleaned data saved to 'churn_prediction_cleaned.parquet'


In [35]:
##Feature engineering steps
df = pd.read_parquet('/Users/shiva/Downloads/churn_prediction_cleaned.parquet')

In [36]:
df.info(memory_usage='deep')

cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols].nunique().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 10000 non-null  object 
 1   age                     10000 non-null  int32  
 2   region                  10000 non-null  object 
 3   device_type             10000 non-null  object 
 4   current_tenure          10000 non-null  int32  
 5   time_to_renewal         10000 non-null  int32  
 6   promo_flag              10000 non-null  bool   
 7   total_hours_streamed    10000 non-null  float32
 8   avg_session_length      10000 non-null  float32
 9   avg_startup_latency     10000 non-null  float32
 10  p95_startup_latency     10000 non-null  float32
 11  total_rebuffers         10000 non-null  int32  
 12  rebuffers_per_session   10000 non-null  float32
 13  avg_throughput          10000 non-null  float32
 14  slow_throughput_rate    10000 non-null 

user_id        10000
region             5
device_type        4
dtype: int64

In [37]:
##Encoding categorical variables
from sklearn.preprocessing import LabelEncoder

df.columns

# columns you want to label‑encode → new column name
ENCODE_MAP = {
    "user_id":         "userid"
    # "region":          "region_all",
    # "device_type":     "devicetype"# map label → final target
}

encoders = {}                      # (optional) keep fitted encoders

for col, new_col in ENCODE_MAP.items():
    if col in df.columns:
        le = LabelEncoder()
        df[new_col] = le.fit_transform(df[col])
        encoders[col] = le          # store for inverse‑transform / inference
        df.drop(columns=[col], inplace=True)
df.head(5)

Unnamed: 0,age,region,device_type,current_tenure,time_to_renewal,promo_flag,total_hours_streamed,avg_session_length,avg_startup_latency,p95_startup_latency,...,avg_throughput,slow_throughput_rate,avg_jitter,month_hours_streamed,month_session_count,usage_trend_slope,ticket_count,days_since_last_ticket,churn_label,userid
0,56,US,mobile,356,-231,False,8.644639,2.70579,752.148376,864.831909,...,8.120058,0.948775,194.134079,6.87599,45,1.066576,1,4.0,0,0
1,69,EU,smart_tv,296,-140,False,3.677074,2.365369,638.462219,754.223267,...,1.790549,0.314351,66.308197,23.163118,3,4.633615,0,,0,1
2,46,US,mobile,299,-209,False,8.03953,0.547339,367.117706,463.690247,...,1.700032,0.228268,129.99408,46.16592,6,1.414259,0,,0,2
3,32,LATAM,mobile,157,-54,False,4.930182,2.524625,290.034729,317.688934,...,4.740932,0.902906,162.650238,16.303295,15,-2.275332,0,,0,3
4,60,MEA,tablet,293,-232,False,7.194304,0.567824,361.935791,416.324799,...,8.67375,0.500406,24.976007,30.194689,22,-2.600992,0,,0,4


In [42]:
df = pd.concat([df, pd.get_dummies(df['device_type'], prefix='cat', drop_first=True)], axis=1)
df = pd.concat([df, pd.get_dummies(df['region'], prefix='cat', drop_first=True)], axis=1)
df.drop(columns=['region','device_type'], inplace=True)
df.head(5)


# cat_cols = df.select_dtypes(include=['object']).columns
# df[cat_cols].nunique().sort_values(ascending=False)



Unnamed: 0,age,current_tenure,time_to_renewal,promo_flag,total_hours_streamed,avg_session_length,avg_startup_latency,p95_startup_latency,total_rebuffers,rebuffers_per_session,...,days_since_last_ticket,churn_label,userid,cat_mobile,cat_smart_tv,cat_tablet,cat_EU,cat_LATAM,cat_MEA,cat_US
0,56,356,-231,False,8.644639,2.70579,752.148376,864.831909,5,1.426019,...,4.0,0,0,True,False,False,False,False,False,True
1,69,296,-140,False,3.677074,2.365369,638.462219,754.223267,3,2.218158,...,,0,1,False,True,False,True,False,False,False
2,46,299,-209,False,8.03953,0.547339,367.117706,463.690247,3,2.708621,...,,0,2,True,False,False,False,False,False,True
3,32,157,-54,False,4.930182,2.524625,290.034729,317.688934,2,1.920391,...,,0,3,True,False,False,False,True,False,False
4,60,293,-232,False,7.194304,0.567824,361.935791,416.324799,1,2.25999,...,,0,4,False,False,True,False,False,True,False


In [43]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     10000 non-null  int32  
 1   current_tenure          10000 non-null  int32  
 2   time_to_renewal         10000 non-null  int32  
 3   promo_flag              10000 non-null  bool   
 4   total_hours_streamed    10000 non-null  float32
 5   avg_session_length      10000 non-null  float32
 6   avg_startup_latency     10000 non-null  float32
 7   p95_startup_latency     10000 non-null  float32
 8   total_rebuffers         10000 non-null  int32  
 9   rebuffers_per_session   10000 non-null  float32
 10  avg_throughput          10000 non-null  float32
 11  slow_throughput_rate    10000 non-null  float32
 12  avg_jitter              10000 non-null  float32
 13  month_hours_streamed    10000 non-null  float32
 14  month_session_count     10000 non-null 

In [44]:
##Log Transform skewed numeric features

import numpy as np

# 1. Identify your numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# 2. Compute skewness
skewness = df[numeric_cols].skew()
# # Log transform amount
# df['mnthCharges_log'] = np.log1p(df['MonthlyCharges'])
# df.drop(columns=['MonthlyCharges'], inplace=True)

# # Check skew
# print("Skewness mnthCharges_log:", df['mnthCharges_log'].skew())

In [46]:
high_skew = skewness[skewness>1]
for col in high_skew.index:
    df[f"{col}_log"] = np.log1p(df[col])


In [48]:
# df[[f"{col}_log" for col in high_skew.index]].skew()
df.head(5)

Unnamed: 0,age,current_tenure,time_to_renewal,promo_flag,total_hours_streamed,avg_session_length,avg_startup_latency,p95_startup_latency,total_rebuffers,rebuffers_per_session,...,cat_smart_tv,cat_tablet,cat_EU,cat_LATAM,cat_MEA,cat_US,total_hours_streamed_log,month_hours_streamed_log,ticket_count_log,churn_label_log
0,56,356,-231,False,8.644639,2.70579,752.148376,864.831909,5,1.426019,...,False,False,False,False,False,True,2.266402,2.063819,0.693147,0.0
1,69,296,-140,False,3.677074,2.365369,638.462219,754.223267,3,2.218158,...,True,False,True,False,False,False,1.542673,3.184827,0.0,0.0
2,46,299,-209,False,8.03953,0.547339,367.117706,463.690247,3,2.708621,...,False,False,False,False,False,True,2.201607,3.853672,0.0,0.0
3,32,157,-54,False,4.930182,2.524625,290.034729,317.688934,2,1.920391,...,False,False,False,True,False,False,1.780055,2.850897,0.0,0.0
4,60,293,-232,False,7.194304,0.567824,361.935791,416.324799,1,2.25999,...,False,True,False,False,True,False,2.103439,3.440248,0.0,0.0


In [49]:
##Final Feature Matrix
df = df.rename(columns={"churn": "churn_label"})
# Drop any remaining raw columns if needed
# Prepare X and y
import numpy as np
# Keep only numeric columns
X = df.drop(columns=['churn_label_log'])

# X.select_dtypes(exclude=['number']).columns
# Fill missing or problematic values
X = X.fillna(0)
X = X.replace([np.inf, -np.inf], 0)

# Confirm shape and types
print(X.dtypes)
print(X.shape)



# X = df.drop(columns=['trans_ts','is_fraud','lat','long','merch_lat','merch_long','dob'])
# y = df['is_fraud']
print("Final feature matrix shape:", X.shape)
print("Features:", X.columns.tolist())

age                           int32
current_tenure                int32
time_to_renewal               int32
promo_flag                     bool
total_hours_streamed        float32
avg_session_length          float32
avg_startup_latency         float32
p95_startup_latency         float32
total_rebuffers               int32
rebuffers_per_session       float32
avg_throughput              float32
slow_throughput_rate        float32
avg_jitter                  float32
month_hours_streamed        float32
month_session_count           int32
usage_trend_slope           float32
ticket_count                  int32
days_since_last_ticket      float32
churn_label                   int64
userid                        int64
cat_mobile                     bool
cat_smart_tv                   bool
cat_tablet                     bool
cat_EU                         bool
cat_LATAM                      bool
cat_MEA                        bool
cat_US                         bool
total_hours_streamed_log    

In [50]:
df.to_parquet(r'/Users/shiva/Downloads/churn_prediction_cleaned_featured.parquet', index=False)
print("Featured data saved to 'churn_prediction_cleaned_featured.parquet'")

Featured data saved to 'churn_prediction_cleaned_featured.parquet'


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_parquet(r'/Users/shiva/Downloads/churn_prediction_cleaned_featured.parquet')
X = df.drop(columns=['churn_label'])
y = df['churn_label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
print(X_train.shape, X_test.shape, y_train.mean())

(8000, 30) (2000, 30) 0.085625


In [55]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline   # imbalanced‑learn wrapper

# ───────────────────────────────────────────────────────────
# 1.  Feature‑type splits
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include='object').columns

numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler())
])
categorical_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe',    OneHotEncoder(handle_unknown='ignore'))
])

preproc = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])


# ───────────────────────────────────────────────────────────
# 2.  Full pipeline:  Impute/Encode  →  SMOTE  →  RandomForest
rf_pipe = ImbPipeline([
    ('pre',   preproc),
    ('smote', SMOTE(random_state=42)),
    ('clf',   RandomForestClassifier(
                  n_estimators=400,
                  max_depth=None,
                  min_samples_leaf=2,
                  class_weight='balanced',   # handles remaining imbalance
                  random_state=42,
                  n_jobs=-1
    ))
])
from sklearn.metrics import classification_report, roc_auc_score,average_precision_score
# Example fit / evaluate
rf_pipe.fit(X_train, y_train)
y_pred   = rf_pipe.predict(X_test)
y_proba  = rf_pipe.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=3))
print("ROC‑AUC:", roc_auc_score(y_test, y_proba).round(3))

from sklearn.metrics import classification_report, roc_auc_score,average_precision_score

##validate
# rf_pipe.fit(X_train, y_train)        # should run without NaN error
print("AP:", average_precision_score(y_test, rf_pipe.predict_proba(X_test)[:,1]))

ap = average_precision_score(y_test, y_proba)
print("Average Precision (PR‑AUC):", round(ap, 3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000      1829
           1      1.000     1.000     1.000       171

    accuracy                          1.000      2000
   macro avg      1.000     1.000     1.000      2000
weighted avg      1.000     1.000     1.000      2000

ROC‑AUC: 1.0
AP: 1.0
Average Precision (PR‑AUC): 1.0
