In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import statistics as st
import scipy

In [3]:
data_descriptions = pd.read_csv('data_descriptions.csv')
pd.set_option('display.max_colwidth', None)
data_descriptions

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,AccountAge,Feature,integer,The age of the user's account in months.
1,MonthlyCharges,Feature,float,The amount charged to the user on a monthly basis.
2,TotalCharges,Feature,float,The total charges incurred by the user over the account's lifetime.
3,SubscriptionType,Feature,object,"The type of subscription chosen by the user (Basic, Standard, or Premium)."
4,PaymentMethod,Feature,string,The method of payment used by the user.
5,PaperlessBilling,Feature,string,Indicates whether the user has opted for paperless billing (Yes or No).
6,ContentType,Feature,string,"The type of content preferred by the user (Movies, TV Shows, or Both)."
7,MultiDeviceAccess,Feature,string,Indicates whether the user has access to the service on multiple devices (Yes or No).
8,DeviceRegistered,Feature,string,"The type of device registered by the user (TV, Mobile, Tablet, or Computer)."
9,ViewingHoursPerWeek,Feature,float,The number of hours the user spends watching content per week.


In [4]:
train_df = pd.read_csv("train.csv")
print('train_df Shape:', train_df.shape)
train_df

train_df Shape: (243787, 21)


Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,...,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID,Churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,...,10,Sci-Fi,2.176498,4,Male,3,No,No,CB6SXPNVZA,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,...,18,Action,3.478632,8,Male,23,No,Yes,S7R2G87O09,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.395160,...,23,Fantasy,4.238824,6,Male,1,Yes,Yes,EASDC20BDT,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,...,30,Drama,4.276013,2,Male,24,Yes,Yes,NPF69NT69N,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,...,20,Comedy,3.616170,4,Female,0,No,No,4LGYPK7VOL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243782,77,9.639902,742.272460,Basic,Mailed check,No,Movies,No,Computer,13.502729,...,47,Sci-Fi,3.697451,1,Male,8,Yes,No,FBZ38J108Z,0
243783,117,13.049257,1526.763053,Premium,Credit card,No,TV Shows,Yes,TV,24.963291,...,35,Comedy,1.449742,4,Male,20,No,No,W4AO1Y6NAI,0
243784,113,14.514569,1640.146267,Premium,Credit card,Yes,TV Shows,No,TV,10.628728,...,44,Action,4.012217,6,Male,13,Yes,Yes,0H3SWWI7IU,0
243785,7,18.140555,126.983887,Premium,Bank transfer,Yes,TV Shows,No,TV,30.466782,...,36,Fantasy,2.135789,7,Female,5,No,Yes,63SJ44RT4A,0


In [5]:
test_df = pd.read_csv("test.csv")
print('test_df Shape:', test_df.shape)
test_df

test_df Shape: (104480, 20)


Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID
0,38,17.869374,679.036195,Premium,Mailed check,No,TV Shows,No,TV,29.126308,122.274031,42,Comedy,3.522724,2,Male,23,No,No,O1W6BHP6RM
1,77,9.912854,763.289768,Basic,Electronic check,Yes,TV Shows,No,TV,36.873729,57.093319,43,Action,2.021545,2,Female,22,Yes,No,LFR4X92X8H
2,5,15.019011,75.095057,Standard,Bank transfer,No,TV Shows,Yes,Computer,7.601729,140.414001,14,Sci-Fi,4.806126,2,Female,22,No,Yes,QM5GBIYODA
3,88,15.357406,1351.451692,Standard,Electronic check,No,Both,Yes,Tablet,35.586430,177.002419,14,Comedy,4.943900,0,Female,23,Yes,Yes,D9RXTK2K9F
4,91,12.406033,1128.949004,Standard,Credit card,Yes,TV Shows,Yes,Tablet,23.503651,70.308376,6,Drama,2.846880,6,Female,0,No,No,ENTCCHR1LR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104475,80,17.348236,1387.858873,Standard,Credit card,No,TV Shows,Yes,Mobile,19.189141,135.445204,35,Comedy,1.411831,7,Female,14,No,Yes,UTKREC613O
104476,20,8.275459,165.509180,Premium,Bank transfer,Yes,Movies,Yes,Mobile,30.986604,114.868640,17,Drama,2.783849,2,Male,8,Yes,No,MDB4E477PS
104477,106,18.134343,1922.240365,Basic,Mailed check,No,Movies,Yes,Computer,7.236303,109.583153,31,Comedy,2.991527,1,Male,12,No,Yes,IPDIA02ZE1
104478,46,19.774010,909.604454,Basic,Bank transfer,No,TV Shows,Yes,TV,25.809285,115.153570,1,Drama,4.998019,0,Female,12,Yes,No,ITLFTPRJGV


In [6]:
null = train_df.isnull().sum().sum()
print('Number of missing values in the training set:', null)
train_df.info()

Number of missing values in the training set: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243787 entries, 0 to 243786
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   AccountAge                243787 non-null  int64  
 1   MonthlyCharges            243787 non-null  float64
 2   TotalCharges              243787 non-null  float64
 3   SubscriptionType          243787 non-null  object 
 4   PaymentMethod             243787 non-null  object 
 5   PaperlessBilling          243787 non-null  object 
 6   ContentType               243787 non-null  object 
 7   MultiDeviceAccess         243787 non-null  object 
 8   DeviceRegistered          243787 non-null  object 
 9   ViewingHoursPerWeek       243787 non-null  float64
 10  AverageViewingDuration    243787 non-null  float64
 11  ContentDownloadsPerMonth  243787 non-null  int64  
 12  GenrePreference           243787 non-null  object 
 

In [7]:
null = test_df.isnull().sum().sum()
print('Number of missing values in the training set:', null)
test_df.info()

Number of missing values in the training set: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104480 entries, 0 to 104479
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   AccountAge                104480 non-null  int64  
 1   MonthlyCharges            104480 non-null  float64
 2   TotalCharges              104480 non-null  float64
 3   SubscriptionType          104480 non-null  object 
 4   PaymentMethod             104480 non-null  object 
 5   PaperlessBilling          104480 non-null  object 
 6   ContentType               104480 non-null  object 
 7   MultiDeviceAccess         104480 non-null  object 
 8   DeviceRegistered          104480 non-null  object 
 9   ViewingHoursPerWeek       104480 non-null  float64
 10  AverageViewingDuration    104480 non-null  float64
 11  ContentDownloadsPerMonth  104480 non-null  int64  
 12  GenrePreference           104480 non-null  object 
 

In [8]:
duplicates = train_df.duplicated().sum().sum()
print('Number of duplicates in the training set:', duplicates)

Number of duplicates in the training set: 0


In [9]:
duplicates = test_df.duplicated().sum().sum()
print('Number of duplicates in the test set:', duplicates)

Number of duplicates in the test set: 0


In [10]:
# Splits the data into numerical and categorical variables
train_categorical_vars = train_df.select_dtypes(include=['object']).drop(columns=['CustomerID'])
train_numerical_vars = train_df.select_dtypes(include=['int64', 'float64']).drop(columns=['Churn'])
test_categorical_vars = test_df.select_dtypes(include=['object']).drop(columns=['CustomerID'])
test_numerical_vars = test_df.select_dtypes(include=['int64', 'float64'])

In [11]:
# Correlation between numerical variables and the target variable
for j in train_numerical_vars.columns:
    corrcoef = np.corrcoef(train_numerical_vars[j], train_df['Churn'])[0,1]
    print(f"Correlation between {j} and Churn: {corrcoef}")

Correlation between AccountAge and Churn: -0.19773563199742905
Correlation between MonthlyCharges and Churn: 0.10047280662870756
Correlation between TotalCharges and Churn: -0.12052871074134103
Correlation between ViewingHoursPerWeek and Churn: -0.1286448060396568
Correlation between AverageViewingDuration and Churn: -0.14689656866692882
Correlation between ContentDownloadsPerMonth and Churn: -0.12975187419257536
Correlation between UserRating and Churn: 0.022123581084271208
Correlation between SupportTicketsPerMonth and Churn: 0.08406433075287331
Correlation between WatchlistSize and Churn: 0.021738674917031502


In [12]:
# Stats for numerical variables in the training set
mean = train_numerical_vars.mean()
std = train_numerical_vars.std()
median = train_numerical_vars.median()
mode = train_numerical_vars.mode().iloc[0]
skew = train_numerical_vars.skew()
kurt = train_numerical_vars.kurt()
max = train_numerical_vars.max()
min = train_numerical_vars.min()
q3 = train_numerical_vars.quantile(0.75) 
q1 = train_numerical_vars.quantile(0.25)
iqr = q3 - q1
num_stats = pd.DataFrame(
    {'mean': mean, 'std': std, 'median': median, 'mode': mode, 'skew': skew, 'kurt': kurt, 'max': max, 'min': min, 'iqr': iqr}
)
num_stats

Unnamed: 0,mean,std,median,mode,skew,kurt,max,min,iqr
AccountAge,60.083758,34.285143,60.0,93.0,-0.002506,-1.199282,119.0,1.0,60.0
MonthlyCharges,12.490695,4.327615,12.495555,4.990062,-0.003584,-1.201509,19.989957,4.990062,7.499617
TotalCharges,750.741017,523.073273,649.878487,4.991154,0.694068,-0.262047,2378.723844,4.991154,760.170335
ViewingHoursPerWeek,20.502179,11.243753,20.523116,1.000065,-0.00134,-1.199817,39.999723,1.000065,19.455443
AverageViewingDuration,92.264061,50.505243,92.249992,5.000547,0.002758,-1.2009,179.999275,5.000547,87.525653
ContentDownloadsPerMonth,24.503513,14.421174,24.0,17.0,-0.000427,-1.201353,49.0,0.0,25.0
UserRating,3.002713,1.155259,3.002261,1.000007,-0.000958,-1.201812,4.999989,1.000007,2.001304
SupportTicketsPerMonth,4.504186,2.872548,4.0,7.0,-0.000896,-1.225538,9.0,0.0,5.0
WatchlistSize,12.018508,7.193034,12.0,16.0,-0.0045,-1.199512,24.0,0.0,12.0


In [13]:
# Stats for numerical variables in the testing set
mean = test_numerical_vars.mean()
std = test_numerical_vars.std()
median = test_numerical_vars.median()
mode = test_numerical_vars.mode().iloc[0]
skew = test_numerical_vars.skew()
kurt = test_numerical_vars.kurt()
max = test_numerical_vars.max()
min = test_numerical_vars.min()
q3 = test_numerical_vars.quantile(0.75)
q1 = test_numerical_vars.quantile(0.25)
iqr = q3 - q1

num_stats = pd.DataFrame(
    {'mean': mean, 'std': std, 'median': median, 'mode': mode, 'skew': skew, 'kurt': kurt, 'max': max, 'min': min, 'iqr': iqr}
)

num_stats

Unnamed: 0,mean,std,median,mode,skew,kurt,max,min,iqr
AccountAge,60.064692,34.285025,60.0,55.0,-0.004098,-1.197787,119.0,1.0,60.0
MonthlyCharges,12.474347,4.331734,12.453073,4.990051,0.002177,-1.197063,19.989797,4.990051,7.488626
TotalCharges,748.167669,520.782838,649.385029,5.019144,0.700727,-0.235782,2376.235183,5.019144,752.305448
ViewingHoursPerWeek,20.489914,11.243173,20.472305,1.000528,0.001338,-1.194886,39.999296,1.000528,19.428556
AverageViewingDuration,92.646128,50.631406,92.533168,5.000985,-0.000561,-1.206439,179.999785,5.000985,88.067953
ContentDownloadsPerMonth,24.4509,14.451309,25.0,4.0,2e-05,-1.203933,49.0,0.0,25.0
UserRating,3.000958,1.154689,2.997293,1.000016,0.001929,-1.2011,4.99993,1.000016,2.000094
SupportTicketsPerMonth,4.507705,2.8767,5.0,0.0,-0.009551,-1.227612,9.0,0.0,5.0
WatchlistSize,12.0404,7.204115,12.0,4.0,-0.00771,-1.201794,24.0,0.0,12.0


In [14]:
# Absolute and relative frequency of categorical variables in the training set
for j in train_categorical_vars.columns:
    print(f"--- {j} ---")
    print(train_df[j].value_counts())
    print("Percentage:")
    print(train_df[j].value_counts(normalize=True) * 100)
    print("\n")

--- SubscriptionType ---
SubscriptionType
Standard    81920
Basic       81050
Premium     80817
Name: count, dtype: int64
Percentage:
SubscriptionType
Standard    33.603104
Basic       33.246235
Premium     33.150660
Name: proportion, dtype: float64


--- PaymentMethod ---
PaymentMethod
Electronic check    61313
Credit card         60924
Bank transfer       60797
Mailed check        60753
Name: count, dtype: int64
Percentage:
PaymentMethod
Electronic check    25.150234
Credit card         24.990668
Bank transfer       24.938573
Mailed check        24.920525
Name: proportion, dtype: float64


--- PaperlessBilling ---
PaperlessBilling
No     121980
Yes    121807
Name: count, dtype: int64
Percentage:
PaperlessBilling
No     50.035482
Yes    49.964518
Name: proportion, dtype: float64


--- ContentType ---
ContentType
Both        81737
TV Shows    81145
Movies      80905
Name: count, dtype: int64
Percentage:
ContentType
Both        33.528039
TV Shows    33.285204
Movies      33.186757
Name:

In [15]:
# Absolute and relative frequency of categorical variables in the testing set
for j in test_categorical_vars.columns:
    print(f"--- {j} ---")
    print(test_df[j].value_counts())
    print("Percentage:")
    print(test_df[j].value_counts(normalize=True) * 100)
    print("\n")

--- SubscriptionType ---
SubscriptionType
Basic       34956
Standard    34833
Premium     34691
Name: count, dtype: int64
Percentage:
SubscriptionType
Basic       33.457121
Standard    33.339395
Premium     33.203484
Name: proportion, dtype: float64


--- PaymentMethod ---
PaymentMethod
Mailed check        26166
Credit card         26134
Electronic check    26127
Bank transfer       26053
Name: count, dtype: int64
Percentage:
PaymentMethod
Mailed check        25.044028
Credit card         25.013400
Electronic check    25.006700
Bank transfer       24.935873
Name: proportion, dtype: float64


--- PaperlessBilling ---
PaperlessBilling
No     52564
Yes    51916
Name: count, dtype: int64
Percentage:
PaperlessBilling
No     50.310107
Yes    49.689893
Name: proportion, dtype: float64


--- ContentType ---
ContentType
Both        35087
Movies      34848
TV Shows    34545
Name: count, dtype: int64
Percentage:
ContentType
Both        33.582504
Movies      33.353752
TV Shows    33.063744
Name: p

In [16]:
# Chi-square test for independence between categorical variables and the target variable
results = []
for j in train_categorical_vars:
    contingency_table = pd.crosstab(train_categorical_vars[j], train_df['Churn'])
    chi2, p, dof, ex = scipy.stats.chi2_contingency(contingency_table)
    results.append({
        'Variable': j,
        'Chi2': chi2,
        'p-value': p,
    })
results_df = pd.DataFrame(results).set_index('Variable').T
results_df

Variable,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,GenrePreference,Gender,ParentalControl,SubtitlesEnabled
Chi2,318.4494,239.8638,0.202776,43.454,0.396155,2.839135,169.2967,13.554239,7.429245,37.25163
p-value,7.072955999999999e-70,1.018485e-51,0.652489,3.665087e-10,0.529082,0.417098,1.480532e-35,0.000232,0.006417,1.03828e-09


In [17]:
# Rate of churn for each category of categorical variables
for col in train_categorical_vars.columns:
    churn_rate = train_df.groupby(col)['Churn'].mean()
    print(f"Tasa de churn por {col}:")
    print(churn_rate)
    print("\n")

Tasa de churn por SubscriptionType:
SubscriptionType
Basic       0.196521
Premium     0.162775
Standard    0.184314
Name: Churn, dtype: float64


Tasa de churn por PaymentMethod:
PaymentMethod
Bank transfer       0.179269
Credit card         0.162251
Electronic check    0.192471
Mailed check        0.190888
Name: Churn, dtype: float64


Tasa de churn por PaperlessBilling:
PaperlessBilling
No     0.181587
Yes    0.180876
Name: Churn, dtype: float64


Tasa de churn por ContentType:
ContentType
Both        0.188470
Movies      0.177764
TV Shows    0.177398
Name: Churn, dtype: float64


Tasa de churn por MultiDeviceAccess:
MultiDeviceAccess
No     0.181727
Yes    0.180736
Name: Churn, dtype: float64


Tasa de churn por DeviceRegistered:
DeviceRegistered
Computer    0.181350
Mobile      0.182372
TV          0.179044
Tablet      0.182147
Name: Churn, dtype: float64


Tasa de churn por GenrePreference:
GenrePreference
Action     0.165866
Comedy     0.193416
Drama      0.178709
Fantasy    0.17

##### Cleaning the data

In [20]:
# Standardization
# Analysis 

for j in train_numerical_vars.columns:
    print(f"{j}: Min = {train_numerical_vars[j].min()}, Max = {train_numerical_vars[j].max()}")

for j in test_numerical_vars.columns:
    print(f"{j}: Min = {test_numerical_vars[j].min()}, Max = {test_numerical_vars[j].max()}")

AccountAge: Min = 1, Max = 119
MonthlyCharges: Min = 4.990061546582933, Max = 19.989956867323198
TotalCharges: Min = 4.991154354368685, Max = 2378.723844106135
ViewingHoursPerWeek: Min = 1.000065389097874, Max = 39.99972313614872
AverageViewingDuration: Min = 5.0005474861951456, Max = 179.99927511771548
ContentDownloadsPerMonth: Min = 0, Max = 49
UserRating: Min = 1.0000073778257992, Max = 4.999989412151973
SupportTicketsPerMonth: Min = 0, Max = 9
WatchlistSize: Min = 0, Max = 24
AccountAge: Min = 1, Max = 119
MonthlyCharges: Min = 4.99005093760967, Max = 19.9897968072357
TotalCharges: Min = 5.019143887697532, Max = 2376.2351831192104
ViewingHoursPerWeek: Min = 1.000527715302754, Max = 39.99929642968343
AverageViewingDuration: Min = 5.000984528483458, Max = 179.99978525468714
ContentDownloadsPerMonth: Min = 0, Max = 49
UserRating: Min = 1.0000162379660091, Max = 4.999929926415726
SupportTicketsPerMonth: Min = 0, Max = 9
WatchlistSize: Min = 0, Max = 24


In [21]:
# Standardization
scaler = StandardScaler()
train_df[train_numerical_vars.columns] = scaler.fit_transform(train_numerical_vars)

test_df[test_numerical_vars.columns] = scaler.fit_transform(test_numerical_vars)

In [22]:
train_df.head(5)

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,...,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID,Churn
0,-1.169131,-0.331703,-1.01255,Premium,Mailed check,No,Both,No,Mobile,1.445777,...,-1.005712,Sci-Fi,-0.715179,-0.175519,Male,-1.253786,No,No,CB6SXPNVZA,0
1,-0.089945,-1.690423,-0.871303,Basic,Credit card,Yes,Movies,No,Tablet,1.062671,...,-0.450971,Action,0.41196,1.216976,Male,1.526687,No,Yes,S7R2G87O09,0
2,0.376731,-0.088741,0.254353,Basic,Mailed check,Yes,Movies,No,Computer,-1.165718,...,-0.104258,Fantasy,1.069988,0.520728,Male,-1.531833,Yes,Yes,EASDC20BDT,0
3,-0.819125,-1.207816,-0.990879,Basic,Electronic check,No,TV Shows,No,Tablet,0.663322,...,0.381141,Drama,1.102179,-0.871766,Male,1.665711,Yes,Yes,NPF69NT69N,0
4,-0.089945,1.031143,0.41215,Premium,Electronic check,Yes,TV Shows,No,TV,-0.037246,...,-0.312285,Comedy,0.531014,-0.175519,Female,-1.670857,No,No,4LGYPK7VOL,0


In [23]:
# Categorical encoding

le = LabelEncoder()
train_df['SubscriptionType'] = le.fit_transform(train_df['SubscriptionType'])
test_df['SubscriptionType'] = le.fit_transform(test_df['SubscriptionType'])

encoder = OneHotEncoder(sparse_output=False) 
for j in train_categorical_vars.columns:
    train_encoded = encoder.fit_transform(train_df[[j]])
    train_encoded = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out([j]))
    train_df = pd.concat([train_df, train_encoded], axis=1)
    train_df.drop(columns=[j], inplace=True)

for j in test_categorical_vars.columns:
    test_encoded = encoder.fit_transform(test_df[[j]])
    test_encoded = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out([j]))
    test_df = pd.concat([test_df, test_encoded], axis=1)
    test_df.drop(columns=[j], inplace=True)

In [24]:
train_df.head(5)

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,CustomerID,...,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Female,Gender_Male,ParentalControl_No,ParentalControl_Yes,SubtitlesEnabled_No,SubtitlesEnabled_Yes
0,-1.169131,-0.331703,-1.01255,1.445777,-0.568906,-1.005712,-0.715179,-0.175519,-1.253786,CB6SXPNVZA,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,-0.089945,-1.690423,-0.871303,1.062671,-1.317459,-0.450971,0.41196,1.216976,1.526687,S7R2G87O09,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,0.376731,-0.088741,0.254353,-1.165718,-0.691019,-0.104258,1.069988,0.520728,-1.531833,EASDC20BDT,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,-0.819125,-1.207816,-0.990879,0.663322,0.777613,0.381141,1.102179,-0.871766,1.665711,NPF69NT69N,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,-0.089945,1.031143,0.41215,-0.037246,-0.928765,-0.312285,0.531014,-0.175519,-1.670857,4LGYPK7VOL,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [25]:
summary_stats = train_df.describe()
summary_stats.loc[["min", "max"]]

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,Churn,...,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Female,Gender_Male,ParentalControl_No,ParentalControl_Yes,SubtitlesEnabled_No,SubtitlesEnabled_Yes
min,-1.723308,-1.733206,-1.425711,-1.734488,-1.727815,-1.699138,-1.733559,-1.568014,-1.670857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.718423,1.732889,3.112348,1.734081,1.737154,1.698651,1.72886,1.565099,1.665711,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
summary_stats = test_df.describe()
summary_stats.loc[["min", "max"]]

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,SubscriptionType_0,...,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Female,Gender_Male,ParentalControl_No,ParentalControl_Yes,SubtitlesEnabled_No,SubtitlesEnabled_Yes
min,-1.722763,-1.727791,-1.42699,-1.73345,-1.731051,-1.691959,-1.732893,-1.566978,-1.671331,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.718989,1.734983,3.126208,1.735229,1.725294,1.698754,1.731186,1.561622,1.660115,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
train_df.head(5)

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,CustomerID,...,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Female,Gender_Male,ParentalControl_No,ParentalControl_Yes,SubtitlesEnabled_No,SubtitlesEnabled_Yes
0,-1.169131,-0.331703,-1.01255,1.445777,-0.568906,-1.005712,-0.715179,-0.175519,-1.253786,CB6SXPNVZA,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,-0.089945,-1.690423,-0.871303,1.062671,-1.317459,-0.450971,0.41196,1.216976,1.526687,S7R2G87O09,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,0.376731,-0.088741,0.254353,-1.165718,-0.691019,-0.104258,1.069988,0.520728,-1.531833,EASDC20BDT,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,-0.819125,-1.207816,-0.990879,0.663322,0.777613,0.381141,1.102179,-0.871766,1.665711,NPF69NT69N,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,-0.089945,1.031143,0.41215,-0.037246,-0.928765,-0.312285,0.531014,-0.175519,-1.670857,4LGYPK7VOL,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [28]:
test_df.head(5)

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,SupportTicketsPerMonth,WatchlistSize,CustomerID,...,GenrePreference_Comedy,GenrePreference_Drama,GenrePreference_Fantasy,GenrePreference_Sci-Fi,Gender_Female,Gender_Male,ParentalControl_No,ParentalControl_Yes,SubtitlesEnabled_No,SubtitlesEnabled_Yes
0,-0.64357,1.245472,-0.132746,0.768149,0.585171,1.214366,0.451869,-0.871734,1.521304,O1W6BHP6RM,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.493959,-0.591335,0.029037,1.45723,-0.702192,1.283565,-0.848209,-0.871734,1.382494,LFR4X92X8H,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,-1.606093,0.58745,-1.292431,-1.146318,0.943448,-0.723184,1.563345,-0.871734,1.382494,QM5GBIYODA,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
3,0.8148,0.66557,1.158423,1.342734,1.666094,-0.723184,1.682662,-1.566978,1.521304,D9RXTK2K9F,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.902302,-0.015771,0.731175,0.268052,-0.441186,-1.276769,-0.133438,0.518755,-1.671331,ENTCCHR1LR,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [29]:
train_df.to_csv('train_cleaned.csv', index=False)
test_df.to_csv('test_cleaned.csv', index=False)