In [1]:
import pandas as pdas  # CSV files In/Output, data processing (e.g. pdas.read_csv)
import numpy as npy  # Evaluating mathematical lib, linear algebra
import matplotlib.pyplot as matplot_plt  # Draw chart lib
import plotly.express as plotly_ex  # Draw chart lib

# For Saving best model
import joblib

In [2]:
# Auto detect-width to set data to fit the width of the terminal
pdas.set_option("display.max_columns", None)
# Input the dataset
dts_dtframe = pdas.read_csv("../data/churn.csv")

In [3]:
# Get overview of the data
def dataoveriew(dataframe, message):
    print(f"{message}:")
    print("Tổng số hàng: ", dataframe.shape[0])
    print("Tổng số thuộc tính:", dataframe.shape[1])
    print("Các thuộc tính:")
    print(dataframe.columns.tolist())
    print("Số kiểu giá trị của từng thuộc tính:")
    print(dataframe.nunique())
    print("Giá trị null:", dataframe.isnull().sum().values.sum())


dataoveriew(dts_dtframe, "Tổng quan về dataset:")

Tổng quan về dataset::
Tổng số hàng:  7043
Tổng số thuộc tính: 21
Các thuộc tính:
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
Số kiểu giá trị của từng thuộc tính:
customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64
Giá trị null: 0


### Explore Target variable

In [26]:
# Return the Object in descending order containing counts of unique values
churn_instance = dts_dtframe["Churn"].value_counts()
# Convert series, object to DataFrame
churn_instance = churn_instance.to_frame()
# Reset index of the DataFrame
churn_instance = churn_instance.reset_index()
# Rename the column 'index'
churn_instance = churn_instance.rename(columns={"index": "Category"})
# Use plotly.express lib to draw the chart
churn_pie_chart = plotly_ex.pie(
    churn_instance,
    values="Churn",
    names="Category",
    color_discrete_sequence=["blue", "red"],
    title="Distribution of Churn",
    width=800
)
churn_pie_chart.show()

### Explore Categorical features

In [97]:
# Defining bar chart function
def bar_chart(feature, df=dts_dtframe):
    # Groupby the categorical feature
    temp_df = df.groupby([feature, "Churn"]).size().reset_index()
    print(temp_df)
    temp_df = temp_df.rename(columns={0: "Count"})
    print(temp_df)
    # Calculate the value counts of each distribution and it's corresponding Percentages
    value_counts_df = df[feature].value_counts().to_frame().reset_index()
    print(value_counts_df)
    categories = [cat[1][0] for cat in value_counts_df.iterrows()]
    print(categories)
    # Calculate the value counts of each distribution and it's corresponding Percentages
    num_list = [num[1][1] for num in value_counts_df.iterrows()]
    print(num_list)
    div_list = [element / sum(num_list) for element in num_list]
    print(div_list)
    percentage = [round(element * 100, 1) for element in div_list]
    print(percentage)
    # Defining string formatting for graph annotation
    # Numeric section
    def num_format(list_instance):
        print(list_instance)
        print(len(list_instance))
        formatted_str = ""
        for index, num in enumerate(list_instance):
            if index < len(list_instance) - 2:
                formatted_str = (
                    formatted_str + f"{num}%, "
                )  # append to empty string(formatted_str)
            elif index == len(list_instance) - 2:
                formatted_str = formatted_str + f"{num}% & "
            else:
                formatted_str = formatted_str + f"{num}%"
        return formatted_str

    # Categorical section
    def str_format(list_instance):
        print(list_instance)
        print(len(list_instance))
        formatted_str = ""
        for index, cat in enumerate(list_instance):
            if index < len(list_instance) - 2:
                formatted_str = formatted_str + f"{cat}, "
            elif index == len(list_instance) - 2:
                formatted_str = formatted_str + f"{cat} & "
            else:
                formatted_str = formatted_str + f"{cat}"
        return formatted_str

    # Running the formatting functions
    num_str = num_format(percentage)
    cat_str = str_format(categories)

    # Setting graph framework
    chart = plotly_ex.bar(
        temp_df,
        x=feature,
        y="Count",
        color="Churn",
        title=f"Churn rate by {feature}",
        barmode="group",
        color_discrete_sequence=["blue", "red"],
        width=900
    )
    chart.add_annotation(
        text=f"Value count of distribution of {cat_str} are<br>{num_str} percentage respectively.",
        align="left",
        showarrow=False,
        xref="paper",
        yref="paper",
        x=1,
        y=1.3,
        bordercolor="black",
        borderwidth=1
    )

    return chart.show()

In [76]:
# Gender feature plot
bar_chart("gender")
# SeniorCitizen feature plot
dts_dtframe.loc[
    dts_dtframe.SeniorCitizen == 0, "SeniorCitizen"
] = "No"  # convert 0 to No in all data instances
dts_dtframe.loc[
    dts_dtframe.SeniorCitizen == 1, "SeniorCitizen"
] = "Yes"  # convert 1 to Yes in all data instances
bar_chart("SeniorCitizen")
# Partner feature plot
bar_chart("Partner")
# Dependents feature plot
bar_chart("Dependents")

   gender Churn     0
0  Female    No  2549
1  Female   Yes   939
2    Male    No  2625
3    Male   Yes   930
   gender Churn  Count
0  Female    No   2549
1  Female   Yes    939
2    Male    No   2625
3    Male   Yes    930
    index  gender
0    Male    3555
1  Female    3488
['Male', 'Female']
[3555, 3488]
[0.504756495811444, 0.495243504188556]
[50.5, 49.5]
[50.5, 49.5]
2
['Male', 'Female']
2


  SeniorCitizen Churn     0
0            No    No  4508
1            No   Yes  1393
2           Yes    No   666
3           Yes   Yes   476
  SeniorCitizen Churn  Count
0            No    No   4508
1            No   Yes   1393
2           Yes    No    666
3           Yes   Yes    476
  index  SeniorCitizen
0    No           5901
1   Yes           1142
['No', 'Yes']
[5901, 1142]
[0.8378531875621185, 0.1621468124378816]
[83.8, 16.2]
[83.8, 16.2]
2
['No', 'Yes']
2


  Partner Churn     0
0      No    No  2441
1      No   Yes  1200
2     Yes    No  2733
3     Yes   Yes   669
  Partner Churn  Count
0      No    No   2441
1      No   Yes   1200
2     Yes    No   2733
3     Yes   Yes    669
  index  Partner
0    No     3641
1   Yes     3402
['No', 'Yes']
[3641, 3402]
[0.5169672014766434, 0.4830327985233565]
[51.7, 48.3]
[51.7, 48.3]
2
['No', 'Yes']
2


  Dependents Churn     0
0         No    No  3390
1         No   Yes  1543
2        Yes    No  1784
3        Yes   Yes   326
  Dependents Churn  Count
0         No    No   3390
1         No   Yes   1543
2        Yes    No   1784
3        Yes   Yes    326
  index  Dependents
0    No        4933
1   Yes        2110
['No', 'Yes']
[4933, 2110]
[0.7004117563538265, 0.2995882436461735]
[70.0, 30.0]
[70.0, 30.0]
2
['No', 'Yes']
2


In [77]:
bar_chart("PhoneService")
bar_chart("MultipleLines")
bar_chart("InternetService")
bar_chart("OnlineSecurity")
bar_chart("OnlineBackup")
bar_chart("DeviceProtection")
bar_chart("TechSupport")
bar_chart("StreamingTV")
bar_chart("StreamingMovies")

  PhoneService Churn     0
0           No    No   512
1           No   Yes   170
2          Yes    No  4662
3          Yes   Yes  1699
  PhoneService Churn  Count
0           No    No    512
1           No   Yes    170
2          Yes    No   4662
3          Yes   Yes   1699
  index  PhoneService
0   Yes          6361
1    No           682
['Yes', 'No']
[6361, 682]
[0.9031662643759761, 0.09683373562402385]
[90.3, 9.7]
[90.3, 9.7]
2
['Yes', 'No']
2


      MultipleLines Churn     0
0                No    No  2541
1                No   Yes   849
2  No phone service    No   512
3  No phone service   Yes   170
4               Yes    No  2121
5               Yes   Yes   850
      MultipleLines Churn  Count
0                No    No   2541
1                No   Yes    849
2  No phone service    No    512
3  No phone service   Yes    170
4               Yes    No   2121
5               Yes   Yes    850
              index  MultipleLines
0                No           3390
1               Yes           2971
2  No phone service            682
['No', 'Yes', 'No phone service']
[3390, 2971, 682]
[0.48132897912821243, 0.42183728524776376, 0.09683373562402385]
[48.1, 42.2, 9.7]
[48.1, 42.2, 9.7]
3
['No', 'Yes', 'No phone service']
3


  InternetService Churn     0
0             DSL    No  1962
1             DSL   Yes   459
2     Fiber optic    No  1799
3     Fiber optic   Yes  1297
4              No    No  1413
5              No   Yes   113
  InternetService Churn  Count
0             DSL    No   1962
1             DSL   Yes    459
2     Fiber optic    No   1799
3     Fiber optic   Yes   1297
4              No    No   1413
5              No   Yes    113
         index  InternetService
0  Fiber optic             3096
1          DSL             2421
2           No             1526
['Fiber optic', 'DSL', 'No']
[3096, 2421, 1526]
[0.4395854039471816, 0.34374556297032516, 0.21666903308249325]
[44.0, 34.4, 21.7]
[44.0, 34.4, 21.7]
3
['Fiber optic', 'DSL', 'No']
3


        OnlineSecurity Churn     0
0                   No    No  2037
1                   No   Yes  1461
2  No internet service    No  1413
3  No internet service   Yes   113
4                  Yes    No  1724
5                  Yes   Yes   295
        OnlineSecurity Churn  Count
0                   No    No   2037
1                   No   Yes   1461
2  No internet service    No   1413
3  No internet service   Yes    113
4                  Yes    No   1724
5                  Yes   Yes    295
                 index  OnlineSecurity
0                   No            3498
1                  Yes            2019
2  No internet service            1526
['No', 'Yes', 'No internet service']
[3498, 2019, 1526]
[0.4966633536845094, 0.2866676132329973, 0.21666903308249325]
[49.7, 28.7, 21.7]
[49.7, 28.7, 21.7]
3
['No', 'Yes', 'No internet service']
3


          OnlineBackup Churn     0
0                   No    No  1855
1                   No   Yes  1233
2  No internet service    No  1413
3  No internet service   Yes   113
4                  Yes    No  1906
5                  Yes   Yes   523
          OnlineBackup Churn  Count
0                   No    No   1855
1                   No   Yes   1233
2  No internet service    No   1413
3  No internet service   Yes    113
4                  Yes    No   1906
5                  Yes   Yes    523
                 index  OnlineBackup
0                   No          3088
1                  Yes          2429
2  No internet service          1526
['No', 'Yes', 'No internet service']
[3088, 2429, 1526]
[0.43844952435041884, 0.3448814425670879, 0.21666903308249325]
[43.8, 34.5, 21.7]
[43.8, 34.5, 21.7]
3
['No', 'Yes', 'No internet service']
3


      DeviceProtection Churn     0
0                   No    No  1884
1                   No   Yes  1211
2  No internet service    No  1413
3  No internet service   Yes   113
4                  Yes    No  1877
5                  Yes   Yes   545
      DeviceProtection Churn  Count
0                   No    No   1884
1                   No   Yes   1211
2  No internet service    No   1413
3  No internet service   Yes    113
4                  Yes    No   1877
5                  Yes   Yes    545
                 index  DeviceProtection
0                   No              3095
1                  Yes              2422
2  No internet service              1526
['No', 'Yes', 'No internet service']
[3095, 2422, 1526]
[0.43944341899758627, 0.3438875479199205, 0.21666903308249325]
[43.9, 34.4, 21.7]
[43.9, 34.4, 21.7]
3
['No', 'Yes', 'No internet service']
3


           TechSupport Churn     0
0                   No    No  2027
1                   No   Yes  1446
2  No internet service    No  1413
3  No internet service   Yes   113
4                  Yes    No  1734
5                  Yes   Yes   310
           TechSupport Churn  Count
0                   No    No   2027
1                   No   Yes   1446
2  No internet service    No   1413
3  No internet service   Yes    113
4                  Yes    No   1734
5                  Yes   Yes    310
                 index  TechSupport
0                   No         3473
1                  Yes         2044
2  No internet service         1526
['No', 'Yes', 'No internet service']
[3473, 2044, 1526]
[0.4931137299446259, 0.2902172369728809, 0.21666903308249325]
[49.3, 29.0, 21.7]
[49.3, 29.0, 21.7]
3
['No', 'Yes', 'No internet service']
3


           StreamingTV Churn     0
0                   No    No  1868
1                   No   Yes   942
2  No internet service    No  1413
3  No internet service   Yes   113
4                  Yes    No  1893
5                  Yes   Yes   814
           StreamingTV Churn  Count
0                   No    No   1868
1                   No   Yes    942
2  No internet service    No   1413
3  No internet service   Yes    113
4                  Yes    No   1893
5                  Yes   Yes    814
                 index  StreamingTV
0                   No         2810
1                  Yes         2707
2  No internet service         1526
['No', 'Yes', 'No internet service']
[2810, 2707, 1526]
[0.3989777083629135, 0.38435325855459324, 0.21666903308249325]
[39.9, 38.4, 21.7]
[39.9, 38.4, 21.7]
3
['No', 'Yes', 'No internet service']
3


       StreamingMovies Churn     0
0                   No    No  1847
1                   No   Yes   938
2  No internet service    No  1413
3  No internet service   Yes   113
4                  Yes    No  1914
5                  Yes   Yes   818
       StreamingMovies Churn  Count
0                   No    No   1847
1                   No   Yes    938
2  No internet service    No   1413
3  No internet service   Yes    113
4                  Yes    No   1914
5                  Yes   Yes    818
                 index  StreamingMovies
0                   No             2785
1                  Yes             2732
2  No internet service             1526
['No', 'Yes', 'No internet service']
[2785, 2732, 1526]
[0.39542808462303, 0.3879028822944768, 0.21666903308249325]
[39.5, 38.8, 21.7]
[39.5, 38.8, 21.7]
3
['No', 'Yes', 'No internet service']
3


In [89]:
bar_chart("Contract")
bar_chart("PaperlessBilling")
bar_chart("PaymentMethod")

         Contract Churn     0
0  Month-to-month    No  2220
1  Month-to-month   Yes  1655
2        One year    No  1307
3        One year   Yes   166
4        Two year    No  1647
5        Two year   Yes    48
         Contract Churn  Count
0  Month-to-month    No   2220
1  Month-to-month   Yes   1655
2        One year    No   1307
3        One year   Yes    166
4        Two year    No   1647
5        Two year   Yes     48
            index  Contract
0  Month-to-month      3875
1        Two year      1695
2        One year      1473
['Month-to-month', 'Two year', 'One year']
[3875, 1695, 1473]
[0.5501916796819537, 0.24066448956410622, 0.20914383075394008]
[55.0, 24.1, 20.9]
[55.0, 24.1, 20.9]
3
['Month-to-month', 'Two year', 'One year']
3


  PaperlessBilling Churn     0
0               No    No  2403
1               No   Yes   469
2              Yes    No  2771
3              Yes   Yes  1400
  PaperlessBilling Churn  Count
0               No    No   2403
1               No   Yes    469
2              Yes    No   2771
3              Yes   Yes   1400
  index  PaperlessBilling
0   Yes              4171
1    No              2872
['Yes', 'No']
[4171, 2872]
[0.5922192247621753, 0.4077807752378248]
[59.2, 40.8]
[59.2, 40.8]
2
['Yes', 'No']
2


               PaymentMethod Churn     0
0  Bank transfer (automatic)    No  1286
1  Bank transfer (automatic)   Yes   258
2    Credit card (automatic)    No  1290
3    Credit card (automatic)   Yes   232
4           Electronic check    No  1294
5           Electronic check   Yes  1071
6               Mailed check    No  1304
7               Mailed check   Yes   308
               PaymentMethod Churn  Count
0  Bank transfer (automatic)    No   1286
1  Bank transfer (automatic)   Yes    258
2    Credit card (automatic)    No   1290
3    Credit card (automatic)   Yes    232
4           Electronic check    No   1294
5           Electronic check   Yes   1071
6               Mailed check    No   1304
7               Mailed check   Yes    308
                       index  PaymentMethod
0           Electronic check           2365
1               Mailed check           1612
2  Bank transfer (automatic)           1544
3    Credit card (automatic)           1522
['Electronic check', 'Mailed chec

### Explore Numeric features

In [15]:
dts_dtframe.dtypes

customerID           object
gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [16]:
try:
    dts_dtframe["TotalCharges"] = dts_dtframe["TotalCharges"].astype(float)
except ValueError as val_err:
    print(val_err)

could not convert string to float: ''


In [17]:
dts_dtframe["TotalCharges"] = pdas.to_numeric(
    dts_dtframe["TotalCharges"], errors="coerce"
)
# Fill the missing values with with the median value
dts_dtframe["TotalCharges"] = dts_dtframe["TotalCharges"].fillna(
    dts_dtframe["TotalCharges"].median()
)

In [92]:
# Defining the histogram plotting function
def hist(feature):
    group_df = dts_dtframe.groupby([feature, "Churn"]).size().reset_index()
    print(group_df)
    group_df = group_df.rename(columns={0: "Count"})
    print(group_df)
    chart = plotly_ex.histogram(
        group_df,
        x=feature,
        y="Count",
        color="Churn",
        marginal="box",
        title=f"Churn rate frequency to {feature} distribution",
        color_discrete_sequence=["blue", "red"],
        width=900
    )
    chart.show()

In [93]:
hist("tenure")
hist("MonthlyCharges")
hist("TotalCharges")

     tenure Churn    0
0         0    No   11
1         1    No  233
2         1   Yes  380
3         2    No  115
4         2   Yes  123
..      ...   ...  ...
140      70   Yes   11
141      71    No  164
142      71   Yes    6
143      72    No  356
144      72   Yes    6

[145 rows x 3 columns]
     tenure Churn  Count
0         0    No     11
1         1    No    233
2         1   Yes    380
3         2    No    115
4         2   Yes    123
..      ...   ...    ...
140      70   Yes     11
141      71    No    164
142      71   Yes      6
143      72    No    356
144      72   Yes      6

[145 rows x 3 columns]


      MonthlyCharges Churn  0
0              18.25    No  1
1              18.40    No  1
2              18.55    No  1
3              18.70    No  2
4              18.75    No  1
...              ...   ... ..
2365          118.20    No  1
2366          118.35   Yes  1
2367          118.60    No  2
2368          118.65    No  1
2369          118.75    No  1

[2370 rows x 3 columns]
      MonthlyCharges Churn  Count
0              18.25    No      1
1              18.40    No      1
2              18.55    No      1
3              18.70    No      2
4              18.75    No      1
...              ...   ...    ...
2365          118.20    No      1
2366          118.35   Yes      1
2367          118.60    No      2
2368          118.65    No      1
2369          118.75    No      1

[2370 rows x 3 columns]


      TotalCharges Churn  0
0            18.80    No  1
1            18.85    No  1
2            18.85   Yes  1
3            18.90    No  1
4            19.00    No  1
...            ...   ... ..
6693       8564.75    No  1
6694       8594.40    No  1
6695       8670.10    No  1
6696       8672.45    No  1
6697       8684.80   Yes  1

[6698 rows x 3 columns]
      TotalCharges Churn  Count
0            18.80    No      1
1            18.85    No      1
2            18.85   Yes      1
3            18.90    No      1
4            19.00    No      1
...            ...   ...    ...
6693       8564.75    No      1
6694       8594.40    No      1
6695       8670.10    No      1
6696       8672.45    No      1
6697       8684.80   Yes      1

[6698 rows x 3 columns]


In [98]:
# Create an empty DataFrame
bin_df = pdas.DataFrame()

# Update the binning DataFrame
bin_df["tenure_bins"] = pdas.qcut(
    dts_dtframe["tenure"], q=3, labels=["low", "medium", "high"]
)
bin_df["MonthlyCharges_bins"] = pdas.qcut(
    dts_dtframe["MonthlyCharges"], q=3, labels=["low", "medium", "high"]
)
bin_df["TotalCharges_bins"] = pdas.qcut(
    dts_dtframe["TotalCharges"], q=3, labels=["low", "medium", "high"]
)
bin_df["Churn"] = dts_dtframe["Churn"]

# Plot the bar chart of the binned variables
bar_chart("tenure_bins", bin_df)
bar_chart("MonthlyCharges_bins", bin_df)
bar_chart("TotalCharges_bins", bin_df)

  tenure_bins Churn     0
0         low    No  1272
1         low   Yes  1099
2      medium    No  1821
3      medium   Yes   548
4        high    No  2081
5        high   Yes   222
  tenure_bins Churn  Count
0         low    No   1272
1         low   Yes   1099
2      medium    No   1821
3      medium   Yes    548
4        high    No   2081
5        high   Yes    222
    index  tenure_bins
0     low         2371
1  medium         2369
2    high         2303
['low', 'medium', 'high']
[2371, 2369, 2303]
[0.336646315490558, 0.3363623455913673, 0.3269913389180747]
[33.7, 33.6, 32.7]
[33.7, 33.6, 32.7]
3
['low', 'medium', 'high']
3


  MonthlyCharges_bins Churn     0
0                 low    No  1978
1                 low   Yes   373
2              medium    No  1649
3              medium   Yes   696
4                high    No  1547
5                high   Yes   800
  MonthlyCharges_bins Churn  Count
0                 low    No   1978
1                 low   Yes    373
2              medium    No   1649
3              medium   Yes    696
4                high    No   1547
5                high   Yes    800
    index  MonthlyCharges_bins
0     low                 2351
1    high                 2347
2  medium                 2345
['low', 'high', 'medium']
[2351, 2347, 2345]
[0.3338066164986511, 0.3332386767002698, 0.3329547068010791]
[33.4, 33.3, 33.3]
[33.4, 33.3, 33.3]
3
['low', 'high', 'medium']
3


  TotalCharges_bins Churn     0
0               low    No  1422
1               low   Yes   926
2            medium    No  1792
3            medium   Yes   555
4              high    No  1960
5              high   Yes   388
  TotalCharges_bins Churn  Count
0               low    No   1422
1               low   Yes    926
2            medium    No   1792
3            medium   Yes    555
4              high    No   1960
5              high   Yes    388
    index  TotalCharges_bins
0     low               2348
1    high               2348
2  medium               2347
['low', 'high', 'medium']
[2348, 2348, 2347]
[0.33338066164986513, 0.33338066164986513, 0.3332386767002698]
[33.3, 33.3, 33.3]
[33.3, 33.3, 33.3]
3
['low', 'high', 'medium']
3


### Data preprocessing

In [None]:
# The customerID column is not useful as the feature us used for identification of customers.
dts_dtframe.drop(["customerID"], axis=1, inplace=True)

# Encode categorical features
# Defining the map function
def binary_map(feature):
    return feature.map({"Yes": 1, "No": 0})


# Encoding target feature
dts_dtframe["Churn"] = dts_dtframe[["Churn"]].apply(binary_map)

# Encoding gender category
dts_dtframe["gender"] = dts_dtframe["gender"].map({"Male": 1, "Female": 0})

# Encoding other binary category
binary_list = [
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
]
dts_dtframe[binary_list] = dts_dtframe[binary_list].apply(binary_map)

# Encoding the other categoric features with more than two categories
dts_dtframe = pdas.get_dummies(dts_dtframe, drop_first=True)

In [None]:
# Checking the correlation between features
corr = dts_dtframe.corr()

chart = plotly_ex.imshow(corr, width=1000, height=1000)
chart.show()

Correlation is a statistical term is a measure on linear relationship with two variables. Features with high correlation are more linearly dependent and have almost the same effect on the dependent variable. So when two features have a high correlation, we can drop one of the two features.

In [None]:
import statsmodels.api as sm_api
import statsmodels.formula.api as sm_f_api

# Change variable name seperators to '_'
all_columns = [
    column.replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_")
    for column in dts_dtframe.columns
]

# Effect the change to the DataFrame column names
dts_dtframe.columns = all_columns

# Prepare it for the GLM formula
glm_columns = [e for e in all_columns if e not in ["customerID", "Churn"]]
glm_columns = " + ".join(map(str, glm_columns))

# Fiting it to the Generalized Linear Model
glm_model = sm_f_api.glm(
    formula=f"Churn ~ {glm_columns}",
    data=dts_dtframe,
    family=sm_api.families.Binomial(),
)
res = glm_model.fit()
print(res.summary())

In [None]:
npy.exp(res.params)

In [None]:
# Feature scaling
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler()
dts_dtframe["tenure"] = sc.fit_transform(dts_dtframe[["tenure"]])
dts_dtframe["MonthlyCharges"] = sc.fit_transform(dts_dtframe[["MonthlyCharges"]])
dts_dtframe["TotalCharges"] = sc.fit_transform(dts_dtframe[["TotalCharges"]])

#### Creating a baseline model

In [None]:
# Import Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Import metric for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split data into train and test sets
from sklearn.model_selection import train_test_split

X = dts_dtframe.drop("Churn", axis=1)
y = dts_dtframe["Churn"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=50
)  

In [None]:
def modeling(alg, alg_name, params={}):
    model = alg(
        **params
    )  # Instantiating the algorithm class and unpacking parameters if any
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Performance evaluation
    def print_scores(alg, y_true, y_pred):
        print(alg_name)
        acc_score = accuracy_score(y_true, y_pred)
        print("accuracy: ", acc_score)
        pre_score = precision_score(y_true, y_pred)
        print("precision: ", pre_score)
        rec_score = recall_score(y_true, y_pred)
        print("recall: ", rec_score)
        f_score = f1_score(y_true, y_pred, average="weighted")
        print("f1_score: ", f_score)

    print_scores(alg, y_test, y_pred)
    return model

In [None]:
# Running logistic regression model
log_model = modeling(LogisticRegression, "Logistic Regression")

In [None]:
# Feature selection to improve model building
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

log = LogisticRegression()
rfecv = RFECV(
    estimator=log,
    cv=StratifiedKFold(10, random_state=50, shuffle=True),
    scoring="accuracy",
)
rfecv.fit(X, y)

In [None]:
matplot_plt.figure(figsize=(8, 6))
matplot_plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
matplot_plt.grid()
matplot_plt.xticks(range(1, X.shape[1] + 1))
matplot_plt.xlabel("Number of Selected Features")
matplot_plt.ylabel("CV Score")
matplot_plt.title("Recursive Feature Elimination (RFE)")
matplot_plt.show()

print("The optimal number of features: {}".format(rfecv.n_features_))

In [None]:
# Saving DataFrame with optimal features
X_rfe = X.iloc[:, rfecv.support_]

# Overview of the optimal features in comparison with the intial DataFrame
print('"X" dimension: {}'.format(X.shape))
print('"X" column list:', X.columns.tolist())
print('"X_rfe" dimension: {}'.format(X_rfe.shape))
print('"X_rfe" column list:', X_rfe.columns.tolist())

In [None]:
# Splitting data with optimal features
X_train, X_test, y_train, y_test = train_test_split(
    X_rfe, y, test_size=0.3, random_state=50
) 

In [None]:
# Running logistic regression model
log_model = modeling(LogisticRegression, "Logistic Regression Classification")

Trying other machine learning algorithms:

In [None]:
# SVC
svc_model = modeling(SVC, "SVC Classification")

In [None]:
# Random forest
rf_model = modeling(RandomForestClassifier, "Random Forest Classification")

In [None]:
# Decision tree
dt_model = modeling(DecisionTreeClassifier, "Decision Tree Classification")

In [None]:
# Naive bayes
nb_model = modeling(GaussianNB, "Naive Bayes Classification")

In [None]:
# Improve best model by hyperparameter tuning
# Define model
model = LogisticRegression()

# Define evaluation
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Define search space
from scipy.stats import loguniform

space = dict()
space["solver"] = ["newton-cg", "lbfgs", "liblinear"]
space["penalty"] = ["none", "l1", "l2", "elasticnet"]
space["C"] = loguniform(1e-5, 1000)

# Define search
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(
    model, space, n_iter=500, scoring="accuracy", n_jobs=-1, cv=cv, random_state=1
)

# Execute search
result = search.fit(X_rfe, y)
# Summarize result
print("Best Score: %s" % result.best_score_)
print("Best Hyperparameters: %s" % result.best_params_)

In [None]:
params = result.best_params_
params

In [None]:
# Improving the Logistic Regression model
log_model = modeling(
    LogisticRegression, "Logistic Regression Classification", params=params
)

In [None]:
# Save the model to device
filename = "model.sav"
joblib.dump(log_model, filename)