In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

In [11]:
# Load the train/test splits and the data dictionary in a single pass.
train_df, test_df, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/train.csv",
        "../Data/test.csv",
        "../Data/data_descriptions.csv",
    )
)

In [12]:
# Display the column descriptions loaded from data_descriptions.csv.
meta

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,AccountAge,Feature,integer,The age of the user's account in months.
1,MonthlyCharges,Feature,float,The amount charged to the user on a monthly ba...
2,TotalCharges,Feature,float,The total charges incurred by the user over th...
3,SubscriptionType,Feature,object,The type of subscription chosen by the user (B...
4,PaymentMethod,Feature,string,The method of payment used by the user.
5,PaperlessBilling,Feature,string,Indicates whether the user has opted for paper...
6,ContentType,Feature,string,The type of content preferred by the user (Mov...
7,MultiDeviceAccess,Feature,string,Indicates whether the user has access to the s...
8,DeviceRegistered,Feature,string,"The type of device registered by the user (TV,..."
9,ViewingHoursPerWeek,Feature,float,The number of hours the user spends watching c...


In [13]:
# Separate the target column from the training features.
y_train = train_df.loc[:, 'Churn']
train_df = train_df.drop('Churn', axis=1)

In [14]:
# Keep the customer identifiers aside so they are not encoded or scaled;
# they are re-attached to the frames before export.
train_ids, test_ids = train_df['CustomerID'], test_df['CustomerID']
train_df, test_df = (
    frame.drop(columns=['CustomerID']) for frame in (train_df, test_df)
)

In [15]:
# Ordinal encoding for SubscriptionType (Basic < Standard < Premium).
sub_mapping = {'Basic': 0, 'Standard': 1, 'Premium': 2}

# Series.map silently converts any value that is not a key of the mapping to
# NaN. That includes the already-encoded integers produced by a previous run
# of this cell, so re-running it would wipe the column without any error.
# Validate both frames first so neither is modified when a problem is found.
for _name, _frame in (('train', train_df), ('test', test_df)):
    _unknown = set(_frame['SubscriptionType'].dropna().unique()) - set(sub_mapping)
    if _unknown:
        raise ValueError(
            f"Unexpected SubscriptionType values in {_name} data: "
            f"{sorted(_unknown, key=str)}; expected one of {list(sub_mapping)}"
        )

train_df['SubscriptionType'] = train_df['SubscriptionType'].map(sub_mapping)
test_df['SubscriptionType'] = test_df['SubscriptionType'].map(sub_mapping)

In [16]:
# Feature engineering (behavioral features)

def features(df):
    """Return a copy of ``df`` with engineered behavioral features added.

    The input frame is left untouched. It must contain the columns
    ``MonthlyCharges``, ``ViewingHoursPerWeek``, ``SupportTicketsPerMonth``,
    ``AverageViewingDuration``, ``ContentDownloadsPerMonth``,
    ``TotalCharges`` and ``AccountAge``.

    Added columns:
        CostPerHour, ComplaintIntensity, EngagementScore, PricePerDownload,
        Log_TotalCharges, Log_AverageViewingDuration, Age_x_Charges,
        HighRisk_Flag (0/1) and TenureGroup (0-3).

    Raises:
        KeyError: if a required column is missing.
        ValueError: if ``AccountAge`` is missing or not greater than -1, since
            such rows fall outside every tenure bin and cannot be cast to int.
    """
    df = df.copy()

    # Efficiency: cost per hour of viewing (small offset avoids division by zero).
    df['CostPerHour'] = df['MonthlyCharges'] / (df['ViewingHoursPerWeek'] + 0.01)

    # Complaint intensity: support tickets per unit of monthly charge.
    # A zero charge would yield inf/NaN, so only the zero denominators are
    # replaced by the same small offset used above; every other row keeps
    # exactly the plain ratio.
    monthly_charges = df['MonthlyCharges']
    safe_charges = monthly_charges.where(monthly_charges != 0, 0.01)
    df['ComplaintIntensity'] = df['SupportTicketsPerMonth'] / safe_charges

    # Engagement score.
    df['EngagementScore'] = df['AverageViewingDuration'] * df['ViewingHoursPerWeek']

    # Offline download efficiency (+1 keeps users with no downloads finite).
    df['PricePerDownload'] = df['MonthlyCharges'] / (df['ContentDownloadsPerMonth'] + 1)

    # Non-linear transformations: log(1 + x).
    df['Log_TotalCharges'] = np.log1p(df['TotalCharges'])
    df['Log_AverageViewingDuration'] = np.log1p(df['AverageViewingDuration'])

    # Interaction term used as a lifetime-value proxy.
    df['Age_x_Charges'] = df['AccountAge'] * df['MonthlyCharges']

    # Risk segmentation (binary): many complaints combined with low engagement.
    # The thresholds are approximate quantiles taken from the original dataset.
    df['HighRisk_Flag'] = ((df['SupportTicketsPerMonth'] > 7) &
                           (df['ViewingHoursPerWeek'] < 10)).astype(int)

    # Tenure binning: (-1, 12], (12, 24], (24, 48], (48, inf).
    # The last edge is open-ended so that an unusually old account lands in
    # the top group instead of becoming NaN and failing the integer cast.
    df['TenureGroup'] = pd.cut(
        df['AccountAge'],
        bins=[-1, 12, 24, 48, np.inf],
        labels=[0, 1, 2, 3],
    ).astype(int)
    return df

# Run the same feature engineering on both splits.
train_df, test_df = features(train_df), features(test_df)

Column preparation: encode the categorical columns and scale the features

In [17]:
# Every remaining object-dtype column is treated as categorical and will be
# target-encoded with an M-estimate encoder (m=5.0).
cat_cols = list(train_df.select_dtypes(include='object').columns)
encoder = ce.MEstimateEncoder(m=5.0, cols=cat_cols)

In [18]:
# Fit the encoder on the training frame together with the target, then apply
# the already-fitted encoder to the test frame (the test frame is only
# transformed, never fitted on).
train_df = encoder.fit_transform(train_df, y_train)
test_df = encoder.transform(test_df)

In [19]:
# Standardize every column currently present in the training frame.
scaler = StandardScaler()
cols_to_scale = train_df.columns

In [None]:
# Fit the scaler on the training columns only and reuse the fitted scaler on
# the test frame, so the test data does not contribute to the fit.
train_df[cols_to_scale] = scaler.fit_transform(train_df[cols_to_scale])
test_df[cols_to_scale] = scaler.transform(test_df[cols_to_scale])

In [21]:
# Re-attach the identifiers (and, for train, the target) that were set aside
# earlier. Raw values are used so the assignment is positional rather than
# aligned on the index.
train_df = train_df.assign(CustomerID=train_ids.values, Churn=y_train.values)
test_df = test_df.assign(CustomerID=test_ids.values)

In [22]:
# Sanity check: the train frame carries one extra column (Churn) compared
# with the test frame.
train_df.shape, test_df.shape

((243787, 30), (104480, 29))

In [23]:
# Persist the processed splits; the row index is not written.
train_df.to_csv(path_or_buf='../Data/train_cleaned.csv', index=False)
test_df.to_csv(path_or_buf='../Data/test_cleaned.csv', index=False)