**1. Importing the dependencies**

In [1]:
# Mount Google Drive into the Colab filesystem; the CSV files read later in
# this notebook live under /content/drive/My Drive/.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

**2. Data Loading and Understanding**

In [3]:
# NOTE(review): the drive import and mount below repeat the first cell of the
# notebook; the recorded output ("Drive already mounted") shows the second
# call does nothing new.
from google.colab import drive
drive.mount('/content/drive')

# Check the actual path of the dataset directory by listing its contents.
!ls "/content/drive/My Drive/Churn Prediction/dataset/"



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
data_descriptions.csv  test.csv  train.csv


In [4]:
# Lift pandas' column-width cap before displaying the data dictionary so the
# long free-text Description entries are shown in full. This option is global
# and stays in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)

descriptions_path = '/content/drive/My Drive/Churn Prediction/dataset/data_descriptions.csv'
data_descriptions = pd.read_csv(descriptions_path)
data_descriptions

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,AccountAge,Feature,integer,The age of the user's account in months.
1,MonthlyCharges,Feature,float,The amount charged to the user on a monthly basis.
2,TotalCharges,Feature,float,The total charges incurred by the user over the account's lifetime.
3,SubscriptionType,Feature,object,"The type of subscription chosen by the user (Basic, Standard, or Premium)."
4,PaymentMethod,Feature,string,The method of payment used by the user.
5,PaperlessBilling,Feature,string,Indicates whether the user has opted for paperless billing (Yes or No).
6,ContentType,Feature,string,"The type of content preferred by the user (Movies, TV Shows, or Both)."
7,MultiDeviceAccess,Feature,string,Indicates whether the user has access to the service on multiple devices (Yes or No).
8,DeviceRegistered,Feature,string,"The type of device registered by the user (TV, Mobile, Tablet, or Computer)."
9,ViewingHoursPerWeek,Feature,float,The number of hours the user spends watching content per week.


In [5]:
# Read both splits from Drive first, then report their dimensions.
# The recorded shapes show the training split has one more column than the
# test split (21 vs 20).
train_df = pd.read_csv("/content/drive/My Drive/Churn Prediction/dataset/train.csv")
test_df = pd.read_csv("/content/drive/My Drive/Churn Prediction/dataset/test.csv")

print('train_df Shape:', train_df.shape)
print('test_df Shape:', test_df.shape)

train_df Shape: (243787, 21)
test_df Shape: (104480, 20)


In [6]:
# --- Data-quality checks -----------------------------------------------------
# Per-column missing-value counts and the number of fully duplicated rows.
# Neither value is printed by this cell; inspect the variables to verify.
null_count = train_df.isnull().sum()
duplicate_count = train_df.duplicated().sum()
# No missing or duplicate values found

# --- Cardinality overview ----------------------------------------------------
# Number of distinct values per column; low counts identify the categorical
# features that are one-hot encoded below.
unique_values = train_df.nunique()
print("Table variables and the number of their unique values:")
print(unique_values)
print("\nDataframe shape:", train_df.shape)

# --- Categorical feature encoding (one-hot encoding) -------------------------
columns = ["SubscriptionType", "PaymentMethod", "PaperlessBilling", "ContentType",
           "MultiDeviceAccess", "DeviceRegistered", "GenrePreference", "Gender",
           "ParentalControl", "SubtitlesEnabled"]
# NOTE: train_df / test_df are overwritten with their encoded versions, so the
# raw categorical columns no longer exist afterwards. Re-run the loading cell
# before re-running this one.
train_df = pd.get_dummies(train_df, columns=columns)
test_df = pd.get_dummies(test_df, columns=columns)

# --- Feature matrices and target ---------------------------------------------
# CustomerID is an identifier and Churn is the label; neither is a feature.
xtrain = train_df[train_df.columns.drop(["CustomerID", "Churn"])]
ytrain = train_df["Churn"]

xtest = test_df[test_df.columns.drop(["CustomerID"])]
# The two splits were encoded independently, so a category that is absent from
# one of them yields a different set (or order) of dummy columns. Force xtest
# onto xtrain's exact column layout: dummy columns missing from the test split
# are added as all-zero, and columns unknown to the training split are dropped.
# When the layouts already agree this is a no-op.
xtest = xtest.reindex(columns=xtrain.columns, fill_value=0)

Table variables and the number of their unique values:
AccountAge                     119
MonthlyCharges              243787
TotalCharges                243787
SubscriptionType                 3
PaymentMethod                    4
PaperlessBilling                 2
ContentType                      3
MultiDeviceAccess                2
DeviceRegistered                 4
ViewingHoursPerWeek         243787
AverageViewingDuration      243787
ContentDownloadsPerMonth        50
GenrePreference                  5
UserRating                  243787
SupportTicketsPerMonth          10
Gender                           2
WatchlistSize                   25
ParentalControl                  2
SubtitlesEnabled                 2
CustomerID                  243787
Churn                            2
dtype: int64

Dataframe shape: (243787, 21)


In [7]:
# Baseline: ridge classifier with default settings, scored by cross-validated
# ROC AUC (cross_validate's default fold strategy, folds run in parallel).
model = RidgeClassifier()
scores = cross_validate(estimator=model, X=xtrain, y=ytrain, scoring=["roc_auc"], n_jobs=-1)

fold_auc = scores['test_roc_auc']  # one ROC AUC value per fold
print("Cross validation ROC AUC: mean={0}, std={1}".format(fold_auc.mean(), fold_auc.std()))

Cross validation ROC AUC: mean=0.7493729539102307, std=0.004512814739921477


In [8]:
# Logistic regression with a smaller-than-default C (1/9), scored by
# cross-validated ROC AUC.
model = LogisticRegression(C=1/9, n_jobs=-1)
scores = cross_validate(estimator=model, X=xtrain, y=ytrain, scoring=["roc_auc"], n_jobs=-1)

fold_auc = scores['test_roc_auc']  # one ROC AUC value per fold
print("Cross validation ROC AUC: mean={0}, std={1}".format(fold_auc.mean(), fold_auc.std()))

Cross validation ROC AUC: mean=0.7476563803909656, std=0.0035201062471400097


In [9]:
# Single decision tree with default hyper-parameters, scored by cross-validated
# ROC AUC. The tree is seeded so that the reported score can be reproduced
# from a fresh kernel.
model = DecisionTreeClassifier(random_state=42)
scores = cross_validate(model, xtrain, ytrain, scoring=["roc_auc"], n_jobs=-1)
print("Cross validation ROC AUC: mean={0}, std={1}".format(np.mean(scores['test_roc_auc']), np.std(scores['test_roc_auc'])))

Cross validation ROC AUC: mean=0.5602388487737329, std=0.002377478353007538


In [10]:
# Random forest of 150 trees, scored by cross-validated ROC AUC.
# random_state fixes the bootstrap samples and feature sub-sampling so the
# reported score can be reproduced from a fresh kernel.
model = RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=42)
scores = cross_validate(model, xtrain, ytrain, scoring=["roc_auc"], n_jobs=-1)
print("Cross validation ROC AUC: mean={0}, std={1}".format(np.mean(scores['test_roc_auc']), np.std(scores['test_roc_auc'])))



Cross validation ROC AUC: mean=0.7319057890945222, std=0.004463812277019149


In [11]:
# Bagging ensemble of 150 default base estimators, scored by cross-validated
# ROC AUC. random_state fixes the bootstrap resampling so a completed run can
# be reproduced.
# NOTE(review): the recorded run of this cell was stopped by hand
# (KeyboardInterrupt), so the notebook holds no score for this model.
model = BaggingClassifier(n_estimators=150, n_jobs=-1, random_state=42)
scores = cross_validate(model, xtrain, ytrain, scoring=["roc_auc"], n_jobs=-1)
print("Cross validation ROC AUC: mean={0}, std={1}".format(np.mean(scores['test_roc_auc']), np.std(scores['test_roc_auc'])))

KeyboardInterrupt: 

In [12]:
# Deep, narrow multi-layer perceptron (8 units per hidden layer) with early
# stopping on a 10% internal validation split, scored by cross-validated
# ROC AUC. random_state fixes the weight initialisation and the internal
# validation split so the reported score can be reproduced.
model = MLPClassifier(hidden_layer_sizes=(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ), max_iter=400, early_stopping=True, validation_fraction=0.1, random_state=42)
scores = cross_validate(model, xtrain, ytrain, scoring=["roc_auc"], n_jobs=-1)
print("Cross validation ROC AUC: mean={0}, std={1}".format(np.mean(scores['test_roc_auc']), np.std(scores['test_roc_auc'])))


Cross validation ROC AUC: mean=0.7493899220109864, std=0.004431584868550593


In [13]:
# Final model: the deep, narrow MLP that scored best in cross-validation.
# A wider (256, 256, 256) network was also tried and gave the same performance.
# random_state makes the persisted model reproducible from a fresh kernel.
model = MLPClassifier(hidden_layer_sizes=(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ), max_iter=400, early_stopping=True, validation_fraction=0.1, random_state=42)
model.fit(xtrain, ytrain)

# Persist the fitted network to the current working directory.
joblib.dump(model, 'NN.sav')

# Probabilities for the second class in model.classes_
# (presumably Churn == 1 -- verify against the label encoding).
y_score = model.predict_proba(xtrain)[:, 1]

# Per-customer prediction table for the test split, built in one step instead
# of overwriting the class-0 probability column and renaming in place.
test_proba = model.predict_proba(xtest)
prediction_df = pd.DataFrame({
    "CustomerID": test_df["CustomerID"],
    "PredictionProbability": test_proba[:, 1],
})

# NOTE: both metrics below are computed on the data the model was fitted on,
# so they describe training fit, not generalisation; the cross-validated score
# from the previous cell is the out-of-sample estimate.
preds = model.predict(xtrain)
print("ROC AUC =", roc_auc_score(ytrain, y_score))
print("Accuracy =", accuracy_score(ytrain, preds))

ROC AUC = 0.7500516106336405
Accuracy = 0.8241128526131418
