In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Read the train split, the test split and the column-description table.
train_df, test_df, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/train.csv",
        "../Data/test.csv",
        "../Data/data_descriptions.csv",
    )
)

In [3]:
# Display the column-description table read from data_descriptions.csv.
meta

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,AccountAge,Feature,integer,The age of the user's account in months.
1,MonthlyCharges,Feature,float,The amount charged to the user on a monthly ba...
2,TotalCharges,Feature,float,The total charges incurred by the user over th...
3,SubscriptionType,Feature,object,The type of subscription chosen by the user (B...
4,PaymentMethod,Feature,string,The method of payment used by the user.
5,PaperlessBilling,Feature,string,Indicates whether the user has opted for paper...
6,ContentType,Feature,string,The type of content preferred by the user (Mov...
7,MultiDeviceAccess,Feature,string,Indicates whether the user has access to the s...
8,DeviceRegistered,Feature,string,"The type of device registered by the user (TV,..."
9,ViewingHoursPerWeek,Feature,float,The number of hours the user spends watching c...


In [4]:
# Keep the 'Churn' target aside and leave only predictor columns in train_df.
y_train = train_df.loc[:, 'Churn']
train_df = train_df.drop(columns='Churn')

In [5]:
# Keep the customer identifiers aside, then remove them from both frames so
# they are not treated as a feature by the steps that follow.
train_ids, test_ids = train_df['CustomerID'], test_df['CustomerID']
train_df, test_df = (
    split.drop(columns='CustomerID') for split in (train_df, test_df)
)

In [6]:
# Ordinal encoding for SubscriptionType (Basic=0, Standard=1, Premium=2).
sub_mapping = {'Basic': 0, 'Standard': 1, 'Premium': 2}


def _encode_subscription(frame, split_name):
    """Return frame['SubscriptionType'] mapped through ``sub_mapping``.

    Values that were already missing stay missing. Any other value that is not
    a key of ``sub_mapping`` raises ``ValueError``: ``Series.map`` would
    otherwise turn it into NaN without any warning, which is also what happens
    when this cell is run a second time on an already-encoded column.
    """
    raw = frame['SubscriptionType']
    encoded = raw.map(sub_mapping)
    unmapped = raw[encoded.isna() & raw.notna()]
    if not unmapped.empty:
        examples = sorted(unmapped.unique(), key=str)[:5]
        raise ValueError(
            f"{split_name}: SubscriptionType contains values that are not in "
            f"sub_mapping (column already encoded, or unexpected label?): {examples}"
        )
    return encoded


train_df['SubscriptionType'] = _encode_subscription(train_df, 'train')
test_df['SubscriptionType'] = _encode_subscription(test_df, 'test')

In [7]:
# Feature engineering (behavioral features)

def features(df, eps=0.01):
    """Return a copy of ``df`` with three derived behavioral columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``MonthlyCharges``, ``ViewingHoursPerWeek``,
        ``SupportTicketsPerMonth`` and ``AverageViewingDuration``.
    eps : float, default 0.01
        Constant added to every denominator so that a zero value does not
        produce ``inf``/``NaN`` in the derived column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with the extra columns
        ``CostPerHour``, ``ComplaintIntensity`` and ``EngagementScore``.
    """
    df = df.copy()
    # Efficiency: monthly charge per weekly viewing hour.
    df['CostPerHour'] = df['MonthlyCharges'] / (df['ViewingHoursPerWeek'] + eps)
    # Complaint intensity: support tickets per unit of monthly charge.
    # Guarded the same way as CostPerHour; a zero charge would otherwise yield
    # inf (or NaN for 0/0) in this column.
    df['ComplaintIntensity'] = df['SupportTicketsPerMonth'] / (df['MonthlyCharges'] + eps)
    # Engagement score: average viewing duration times weekly viewing hours.
    df['EngagementScore'] = df['AverageViewingDuration'] * df['ViewingHoursPerWeek']

    return df

# Add the derived behavioral columns to both splits.
train_df, test_df = (features(split) for split in (train_df, test_df))

Column preparation

In [8]:
# Column lists by dtype, taken from the train frame:
#   num_cols -> int64 / float64 columns
#   cat_cols -> object columns
num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = train_df.select_dtypes(include='object').columns

In [9]:
# Standardize the numeric columns. The scaler is fitted on the train split
# only and then applied to both splits, so no test statistics leak in.
scaler = StandardScaler().fit(train_df[num_cols])
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

In [10]:
# One-hot encoding of the categorical columns; the encoder is fitted on the
# train split only. With handle_unknown='ignore', a category that was not seen
# during fit is encoded as all zeros for that feature instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_df[cat_cols])

train_encoded = pd.DataFrame(
    encoder.transform(train_df[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    # Reuse train_df's own index: pd.concat(axis=1) aligns on index labels, so
    # a fresh RangeIndex would misalign rows (and add NaN rows) as soon as
    # train_df does not carry the default 0..n-1 index.
    index=train_df.index,
)

# Swap the raw categorical columns for their one-hot counterparts.
train_df = pd.concat([train_df.drop(columns=cat_cols), train_encoded], axis=1)

In [11]:
# Encode the test split with the encoder that was fitted on the train split.
test_encoded = pd.DataFrame(
    encoder.transform(test_df[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    # Reuse test_df's own index: pd.concat(axis=1) aligns on index labels, so
    # a fresh RangeIndex would misalign rows (and add NaN rows) as soon as
    # test_df does not carry the default 0..n-1 index.
    index=test_df.index,
)
# Swap the raw categorical columns for their one-hot counterparts.
test_df = pd.concat([test_df.drop(columns=cat_cols), test_encoded], axis=1)

In [12]:
# Put back the columns that were set aside before preprocessing:
# CustomerID on both splits, plus the Churn target on the train split.
train_df = train_df.assign(CustomerID=train_ids.values, Churn=y_train.values)
test_df = test_df.assign(CustomerID=test_ids.values)

In [13]:
# (rows, columns) of the processed train and test frames.
train_df.shape, test_df.shape

((243787, 41), (104480, 40))

In [14]:
# Persist both processed splits as CSV, without writing the index column.
for processed, destination in (
    (train_df, '../Data/train_cleaned.csv'),
    (test_df, '../Data/test_cleaned.csv'),
):
    processed.to_csv(destination, index=False)