In [1]:
import pandas as pd
import torch
from torch import optim, nn
from pathlib import Path

# Viewing the raw data

In [2]:
raw_data = pd.read_csv(Path('./WA_Fn-UseC_-Telco-Customer-Churn.csv'), low_memory=False)
raw_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
def print_unique(df: pd.DataFrame) -> None:
    """Prints the column names and their unique values."""
    if isinstance(df, pd.core.series.Series):
        print(f"{df.unique()}")
    elif isinstance(df, pd.core.frame.DataFrame):
        for col in df:
            print(f"{col}: {df[col].unique()}")
    else:
        raise TypeError(f"Expected DataFrame or Series, recieved {type(df)}")

In [4]:
print_unique(raw_data)

customerID: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)

## Drop irrelevant columns

In [5]:
cleaned_data = raw_data.drop(labels=['customerID'], axis=1)

## Reduce unessecary variable cardinality

In [6]:
cleaned_data['TechSupport'].replace('No internet service', 'No', inplace=True)
cleaned_data['StreamingTV'].replace('No internet service', 'No', inplace=True)
cleaned_data['MultipleLines'].replace('No phone service', 'No', inplace=True)
cleaned_data['OnlineSecurity'].replace('No internet service', 'No', inplace=True)
cleaned_data['OnlineBackup'].replace('No internet service', 'No', inplace=True)
cleaned_data['DeviceProtection'].replace('No internet service', 'No', inplace=True)
cleaned_data['StreamingMovies'].replace('No internet service', 'No', inplace=True)
print_unique(cleaned_data)

gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: ['No' 'Yes']
MultipleLines: ['No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes']
OnlineBackup: ['Yes' 'No']
DeviceProtection: ['No' 'Yes']
TechSupport: ['No' 'Yes']
StreamingTV: ['No' 'Yes']
StreamingMovies: ['No' 'Yes']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: ['No' 'Yes']


## These columns will be encoded as 1 - Yes or 0 - No

In [7]:
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
           'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

In [8]:
cleaned_data['gender'].replace({"Male": 1, "Female": 0}, inplace=True)
print_unique(cleaned_data['gender'])

[0 1]


In [9]:
for column in binary_cols:
    cleaned_data[column].replace({"Yes": 1, "No": 0}, inplace=True)
print_unique(cleaned_data)

gender: [0 1]
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: [0 1]
MultipleLines: [0 1]
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: [1 0]
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: [0 1]


## One-Hot Encode Non-Binary Categoricals

In [10]:
one_hot_vars = ["Contract", "PaymentMethod", "InternetService"]
cleaned_data = pd.get_dummies(cleaned_data, columns=one_hot_vars)
print_unique(cleaned_data)

gender: [0 1]
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: [0 1]
MultipleLines: [0 1]
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
PaperlessBilling: [1 0]
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: [0 1]
Contract_Month-to-month: [1 0]
Contract_One year: [0 1]
Contract_Two year: [0 1]
PaymentMethod_Bank transfer (automatic): [0 1]
PaymentMethod_Credit card (automatic): [0 1]
PaymentMethod_Electronic check: [1 0]
PaymentMethod_Mailed check: [0 1]
InternetService_DSL: [1 0]
InternetService_Fiber optic: [0 1]
InternetService_No: [0 1]


## Investigate the TotalCharges variable

In [11]:
cleaned_data['TotalCharges'].value_counts()[' ']

11

In [12]:
pd.set_option("display.max_columns", None)
odd_rows_raw = raw_data[cleaned_data['TotalCharges'].map(lambda x: x=='' or x==' ')]
odd_rows = cleaned_data[cleaned_data['TotalCharges'].map(lambda x: x=='' or x==' ')]
display(odd_rows_raw)    # To see unencoded, human readable data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


## Fill the empty cells

In [13]:
# None of them actually churned, so value replacement is easy
odd_rows.loc[odd_rows['Contract_One year'] == 1, 'TotalCharges'] = odd_rows[odd_rows['Contract_One year'] == 1]['MonthlyCharges'] * 12
odd_rows.loc[odd_rows['Contract_Two year'] == 1, 'TotalCharges'] = odd_rows[odd_rows['Contract_Two year'] == 1]['MonthlyCharges'] * 24
display(odd_rows)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,InternetService_DSL,InternetService_Fiber optic,InternetService_No
488,0,0,1,1,0,0,0,1,0,1,1,1,0,1,52.55,1261.2,0,0,0,1,1,0,0,0,1,0,0
753,1,0,0,1,0,1,0,0,0,0,0,0,0,0,20.25,486.0,0,0,0,1,0,0,0,1,0,0,1
936,0,0,1,1,0,1,0,1,1,1,0,1,1,0,80.85,1940.4,0,0,0,1,0,0,0,1,1,0,0
1082,1,0,1,1,0,1,1,0,0,0,0,0,0,0,25.75,618.0,0,0,0,1,0,0,0,1,0,0,1
1340,0,0,1,1,0,0,0,1,1,1,1,1,0,0,56.05,1345.2,0,0,0,1,0,1,0,0,1,0,0
3331,1,0,1,1,0,1,0,0,0,0,0,0,0,0,19.85,476.4,0,0,0,1,0,0,0,1,0,0,1
3826,1,0,1,1,0,1,1,0,0,0,0,0,0,0,25.35,608.4,0,0,0,1,0,0,0,1,0,0,1
4380,0,0,1,1,0,1,0,0,0,0,0,0,0,0,20.0,480.0,0,0,0,1,0,0,0,1,0,0,1
5218,1,0,1,1,0,1,0,0,0,0,0,0,0,1,19.7,236.4,0,0,1,0,0,0,0,1,0,0,1
6670,0,0,1,1,0,1,1,0,1,1,1,1,0,0,73.35,1760.4,0,0,0,1,0,0,0,1,1,0,0


In [14]:
cleaned_data.update(odd_rows)
cleaned_data.iloc[488]['TotalCharges']

1261.1999999999998

## Looks good, now convert all strs to floats

In [15]:
print_unique(cleaned_data)

gender: [0. 1.]
SeniorCitizen: [0. 1.]
Partner: [1. 0.]
Dependents: [0. 1.]
tenure: [ 1. 34.  2. 45.  8. 22. 10. 28. 62. 13. 16. 58. 49. 25. 69. 52. 71. 21.
 12. 30. 47. 72. 17. 27.  5. 46. 11. 70. 63. 43. 15. 60. 18. 66.  9.  3.
 31. 50. 64. 56.  7. 42. 35. 48. 29. 65. 38. 68. 32. 55. 37. 36. 41.  6.
  4. 33. 67. 23. 57. 61. 14. 20. 53. 40. 59. 24. 44. 19. 54. 51. 26.  0.
 39.]
PhoneService: [0. 1.]
MultipleLines: [0. 1.]
OnlineSecurity: [0. 1.]
OnlineBackup: [1. 0.]
DeviceProtection: [0. 1.]
TechSupport: [0. 1.]
StreamingTV: [0. 1.]
StreamingMovies: [0. 1.]
PaperlessBilling: [1. 0.]
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: [0. 1.]
Contract_Month-to-month: [1. 0.]
Contract_One year: [0. 1.]
Contract_Two year: [0. 1.]
PaymentMethod_Bank transfer (automatic): [0. 1.]
PaymentMethod_Credit card (automatic): [0. 1.]
PaymentMethod_Electronic check: [1. 0.]
PaymentMethod_Mailed check: [0. 1.]
Inte

In [16]:
diff_df = cleaned_data.loc[:, cleaned_data.columns.difference(['MonthlyCharges', 'TotalCharges', 'tenure'])]

for col in diff_df:
    cleaned_data[col] = cleaned_data[col].astype('int')
print_unique(cleaned_data)

gender: [0 1]
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1. 34.  2. 45.  8. 22. 10. 28. 62. 13. 16. 58. 49. 25. 69. 52. 71. 21.
 12. 30. 47. 72. 17. 27.  5. 46. 11. 70. 63. 43. 15. 60. 18. 66.  9.  3.
 31. 50. 64. 56.  7. 42. 35. 48. 29. 65. 38. 68. 32. 55. 37. 36. 41.  6.
  4. 33. 67. 23. 57. 61. 14. 20. 53. 40. 59. 24. 44. 19. 54. 51. 26.  0.
 39.]
PhoneService: [0 1]
MultipleLines: [0 1]
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
PaperlessBilling: [1 0]
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: [0 1]
Contract_Month-to-month: [1 0]
Contract_One year: [0 1]
Contract_Two year: [0 1]
PaymentMethod_Bank transfer (automatic): [0 1]
PaymentMethod_Credit card (automatic): [0 1]
PaymentMethod_Electronic check: [1 0]
PaymentMethod_Mailed check: [0 1]
InternetService_DSL: [1 0]
InternetService_Fib

In [17]:
print(f"TotalCharges dtype before {cleaned_data['TotalCharges'].dtype}")
cleaned_data['TotalCharges'] = pd.to_numeric(cleaned_data['TotalCharges'])
print(f"TotalCharges dtype after {cleaned_data['TotalCharges'].dtype}")

TotalCharges dtype before object
TotalCharges dtype after float64


In [18]:
print_unique(cleaned_data)

gender: [0 1]
SeniorCitizen: [0 1]
Partner: [1 0]
Dependents: [0 1]
tenure: [ 1. 34.  2. 45.  8. 22. 10. 28. 62. 13. 16. 58. 49. 25. 69. 52. 71. 21.
 12. 30. 47. 72. 17. 27.  5. 46. 11. 70. 63. 43. 15. 60. 18. 66.  9.  3.
 31. 50. 64. 56.  7. 42. 35. 48. 29. 65. 38. 68. 32. 55. 37. 36. 41.  6.
  4. 33. 67. 23. 57. 61. 14. 20. 53. 40. 59. 24. 44. 19. 54. 51. 26.  0.
 39.]
PhoneService: [0 1]
MultipleLines: [0 1]
OnlineSecurity: [0 1]
OnlineBackup: [1 0]
DeviceProtection: [0 1]
TechSupport: [0 1]
StreamingTV: [0 1]
StreamingMovies: [0 1]
PaperlessBilling: [1 0]
MonthlyCharges: [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges: [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]
Churn: [0 1]
Contract_Month-to-month: [1 0]
Contract_One year: [0 1]
Contract_Two year: [0 1]
PaymentMethod_Bank transfer (automatic): [0 1]
PaymentMethod_Credit card (automatic): [0 1]
PaymentMethod_Electronic check: [1 0]
PaymentMethod_Mailed check: [0 1]
InternetService_DSL: [1 0]
InternetService_Fiber o

## Training/Validation split

In [19]:
X,y = cleaned_data.loc[:, cleaned_data.columns != 'Churn'], cleaned_data['Churn']
display(X)
display(y)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,InternetService_DSL,InternetService_Fiber optic,InternetService_No
0,0,0,1,0,1.0,0,0,0,1,0,0,0,0,1,29.85,29.85,1,0,0,0,0,1,0,1,0,0
1,1,0,0,0,34.0,1,0,1,0,1,0,0,0,0,56.95,1889.50,0,1,0,0,0,0,1,1,0,0
2,1,0,0,0,2.0,1,0,1,1,0,0,0,0,1,53.85,108.15,1,0,0,0,0,0,1,1,0,0
3,1,0,0,0,45.0,0,0,1,0,1,1,0,0,0,42.30,1840.75,0,1,0,1,0,0,0,1,0,0
4,0,0,0,0,2.0,1,0,0,0,0,0,0,0,1,70.70,151.65,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24.0,1,1,1,0,1,1,1,1,1,84.80,1990.50,0,1,0,0,0,0,1,1,0,0
7039,0,0,1,1,72.0,1,1,0,1,1,0,1,1,1,103.20,7362.90,0,1,0,0,1,0,0,0,1,0
7040,0,0,1,1,11.0,0,0,1,0,0,0,0,0,1,29.60,346.45,1,0,0,0,0,1,0,1,0,0
7041,1,1,1,0,4.0,1,1,0,0,0,0,0,0,1,74.40,306.60,1,0,0,0,0,0,1,0,1,0


0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

# Note
It's important to split the data before applying normalization techniques because we want to keep out test/validation set data completely separate from our training information to avoid introducing future information into our model. `sklearn`'s scalers, like `MinMaxScaler` for example, scale in two distinct steps. First, `fit()` which computes nessecary parameters like `scaler.min_` and `scaler.data_max_`. Then, `transform()` which actually transforms the passed data. So, we'll `fit(X_train)` then `transform()` both.

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Normalization via scaling between 0-1

In [21]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

In [22]:
scale_cols = ['tenure', 'TotalCharges', 'MonthlyCharges']
min_max_scaler.fit(X_train[scale_cols])
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[scale_cols] = min_max_scaler.transform(X_train[scale_cols])
X_test_scaled[scale_cols] = min_max_scaler.transform(X_test[scale_cols])
display(X_test_scaled)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,InternetService_DSL,InternetService_Fiber optic,InternetService_No
185,0,0,1,0,0.013889,0,0,0,0,0,0,0,0,1,0.065272,0.000692,1,0,0,0,0,1,0,1,0,0
2715,1,0,0,0,0.569444,1,1,0,0,0,0,0,0,1,0.069756,0.112814,1,0,0,1,0,0,0,0,0,1
3825,0,0,1,1,0.722222,1,0,0,0,0,0,0,0,0,0.010962,0.116882,0,0,1,0,0,0,1,0,0,1
1807,0,0,0,0,0.013889,1,0,0,0,1,0,0,0,0,0.578974,0.006641,1,0,0,0,0,1,0,0,1,0
132,1,0,0,0,0.930556,1,0,0,0,0,1,0,0,0,0.321873,0.374025,0,0,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6366,0,0,1,0,0.888889,1,0,0,1,1,1,0,1,1,0.498754,0.503116,0,0,1,0,0,0,1,1,0,0
315,1,0,1,1,0.708333,1,1,1,1,0,1,1,1,0,0.914798,0.654004,0,1,0,0,1,0,0,0,1,0
2439,1,0,1,1,0.236111,1,0,0,0,0,0,0,0,0,0.016442,0.035882,0,1,0,1,0,0,0,0,0,1
5002,0,0,1,1,0.958333,0,0,1,0,1,0,0,1,1,0.256104,0.339407,0,0,1,0,1,0,0,1,0,0


# Tensor Preprocessing Pipeline

In [23]:
train_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_train_scaled.values, y_train.values)]
train_ds[0][0].shape, train_ds[0]


(torch.Size([26]),
 (tensor([0.0000, 0.0000, 0.0000, 1.0000, 0.2917, 1.0000, 0.0000, 1.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.4644, 0.1521, 0.0000, 1.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000],
         requires_grad=True),
  tensor(0., requires_grad=True)))

In [24]:
validation_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_test_scaled.values, y_test.values)]
validation_ds[0][0].shape, validation_ds[0]


(torch.Size([26]),
 (tensor([0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.3889e-02, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 1.0000e+00, 6.5272e-02, 6.9236e-04, 1.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
          0.0000e+00, 0.0000e+00], requires_grad=True),
  tensor(1., requires_grad=True)))

In [25]:
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=len(train_ds) // 10, shuffle=False)
val_dataloader = torch.utils.data.DataLoader(validation_ds, batch_size=len(validation_ds) // 10, shuffle=False)

# Model

In [159]:
input_neuron_count = len(X.columns)
loss_fcn = nn.BCEWithLogitsLoss()

In [160]:
model = nn.Sequential(
    nn.Linear(input_neuron_count, input_neuron_count // 2),
    nn.ReLU(),
    nn.Linear(input_neuron_count // 2, 1)
)

In [161]:
optimizer = optim.SGD(model.parameters(), lr=1e-1)

In [92]:
def print_t_info(ts: list[torch.tensor], msg: list[str]=None) -> None:
    "Print all relevant tensor info in a pretty interface"
    if isinstance(ts, torch.Tensor): ts = [ts]
    if isinstance(msg, str): msg = [msg]
    if not isinstance(ts, list): raise TypeError(f"Expected tensor or list of tensors got {type(ts)}")
    if not isinstance(msg, list): raise TypeError(f"Expected str or list of strs, go {type(msg)}")
    for count, tensor in enumerate(ts):
        if msg:
            try: print(f"{msg[count]}", end='')
            except IndexError:
                print(f"{msg[-1]}", end='')
            print(f"\tShape: {tensor.shape}\tdtype: {tensor.dtype}\tContiguous: {tensor.is_contiguous()}")
        else:
            print(f"Tensor #{count:02}\tShape: {tensor.shape}\tdtype: {tensor.dtype}\tContiguous: {tensor.is_contiguous()}")
        

# Custom training loop

In [162]:
def training_loop(epochs, model, loss_fcn, optimizer, train_dataloader, val_dataloader,
                  *, save_best=True, metrics=True, logging=True):
    """
    Custom training loop
    Parameters:
        epochs: int, number of epochs to train for
        model: nn.Module or subclass thereof, from which to obtain predictions
        loss_fcn: any pytorch loss function
        optimizer: any optimizer
        train_dataloader: PyTorch dataloader from which to pull data
        val_dataloader: "
        save_best: bool, WARNING only use on smaller models, cache and serialize best model at end of training
        metrics: bool, control calculation and printing of numbers to the screen
        logging: bool, control printing of tensor info to screen after each step
    Returns: 
        Trained model
    """
    
    highest_accuracy = 0
    cached_model = None
    for epoch in range(epochs):
        for features, labels in train_dataloader:
            labels.unsqueeze_(1)
            if logging: print_t_info([features, labels], ["Train Feats: ", "Train Labels: "])
            
            train_predictions = model(features)
            
            train_loss = loss_fcn(train_predictions, labels)
            
            if logging: print_t_info([train_predictions, train_loss], ["Train Preds:", "Train Loss:"])
            
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            
            
        total = 0
        correct = 0
        
        for features, labels in val_dataloader:
            labels.unsqueeze_(1)
            if logging: print_t_info([features, labels], ['Val Feats: ', 'Val Labels:'])
            
            val_predictions = model(features)
            
            val_loss = loss_fcn(val_predictions, labels)
            
            if logging: print_t_info([val_predictions, val_loss], ['Val Preds:', 'Val Loss:'])
            
            break
            
        if metrics:
            total += val_predictions.shape[0]
            correct = int(((val_predictions > 0.5) == labels.type(torch.BoolTensor)).sum())
            print(f"Epoch {epoch:03}\tTrain Loss: {train_loss:.4}\tVal Loss: {val_loss:.4}\tAccuracy: {correct/total:%}")
        if save_best:
            latest_accuracy = correct/total
            if latest_accuracy > highest_accuracy:
                highest_accuracy = latest_accuracy
                cached_model = model
                
    model_pth = Path(f'./models/{highest_accuracy * 100:.5}_model')
    torch.save(cached_model, model_pth)
    print(f"Model saved to {model_pth}")
        
    return model 
            

In [163]:
training_loop(100, model, loss_fcn, optimizer, train_dataloader, val_dataloader, logging=False)

Epoch 000	Train Loss: 0.63	Val Loss: 0.6318	Accuracy: 69.285714%
Epoch 001	Train Loss: 0.5742	Val Loss: 0.6058	Accuracy: 69.285714%
Epoch 002	Train Loss: 0.5367	Val Loss: 0.5905	Accuracy: 69.285714%
Epoch 003	Train Loss: 0.4973	Val Loss: 0.5728	Accuracy: 69.285714%
Epoch 004	Train Loss: 0.4536	Val Loss: 0.5521	Accuracy: 69.285714%
Epoch 005	Train Loss: 0.4061	Val Loss: 0.5303	Accuracy: 69.285714%
Epoch 006	Train Loss: 0.3631	Val Loss: 0.5111	Accuracy: 69.285714%
Epoch 007	Train Loss: 0.3289	Val Loss: 0.4961	Accuracy: 69.285714%
Epoch 008	Train Loss: 0.3028	Val Loss: 0.4807	Accuracy: 69.285714%
Epoch 009	Train Loss: 0.2774	Val Loss: 0.466	Accuracy: 69.285714%
Epoch 010	Train Loss: 0.2515	Val Loss: 0.4531	Accuracy: 69.285714%
Epoch 011	Train Loss: 0.2279	Val Loss: 0.4421	Accuracy: 69.285714%
Epoch 012	Train Loss: 0.2071	Val Loss: 0.4333	Accuracy: 70.000000%
Epoch 013	Train Loss: 0.1896	Val Loss: 0.4258	Accuracy: 72.142857%
Epoch 014	Train Loss: 0.1745	Val Loss: 0.4198	Accuracy: 71.428571

Sequential(
  (0): Linear(in_features=26, out_features=13, bias=True)
  (1): ReLU()
  (2): Linear(in_features=13, out_features=1, bias=True)
)

# 80% Accuracy, not bad
I noticed something strange about the number of samples available for each class.

In [164]:
y.value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

Class imbalance isn't unusual in datasets, so I want to try and balance these two out, using some for of data augmentation, here I'll be using Synthetic Minority Oversampling (SMOTE) from `imblearn.oversampling`.

In [165]:
from imblearn.over_sampling import SMOTE
sampler = SMOTE(random_state=42)

In [166]:
X_res, y_res = sampler.fit_resample(X,y)

In [167]:
y_res.value_counts()

0    5174
1    5174
Name: Churn, dtype: int64

## Much better :)
Let's proceed to train our next model with oversampled data.

### Split

In [172]:
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

### Scale

In [169]:
res_min_max_scaler = MinMaxScaler()

In [174]:
res_min_max_scaler.fit(X_train_res[scale_cols])
X_train_scaled_res, X_test_scaled_res = X_train_res.copy(), X_test_res.copy()
X_train_scaled_res[scale_cols] = res_min_max_scaler.transform(X_train_res[scale_cols])
X_test_scaled_res[scale_cols] = res_min_max_scaler.transform(X_test_res[scale_cols])

### Same tensor pre-processing pipeline

In [176]:
train_res_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_train_scaled_res.values, y_train_res.values)]
train_res_ds[0][0].shape, train_res_ds[0]

validation_res_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_test_scaled_res.values, y_test_res.values)]
validation_res_ds[0][0].shape, validation_res_ds[0]


(torch.Size([26]),
 (tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.6632, 1.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.7523, 0.5185, 1.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
         requires_grad=True),
  tensor(1., requires_grad=True)))

In [177]:
train_res_dataloader = torch.utils.data.DataLoader(train_res_ds, batch_size=len(train_res_ds) // 10, shuffle=False)
val_res_dataloader = torch.utils.data.DataLoader(validation_res_ds, batch_size=len(validation_res_ds) // 10, shuffle=False)


## I'll use the exact same architecutre & hyper-parameters to ensure fair comparison

In [178]:
input_neuron_count = len(X_res.columns)
loss_fcn = nn.BCEWithLogitsLoss()

In [179]:
model = nn.Sequential(
    nn.Linear(input_neuron_count, input_neuron_count // 2),
    nn.ReLU(),
    nn.Linear(input_neuron_count // 2, 1)
)

In [180]:
optimizer = optim.SGD(model.parameters(), lr=1e-1)

In [181]:
training_loop(100, model, loss_fcn, optimizer, train_res_dataloader, val_res_dataloader, logging=False)

Epoch 000	Train Loss: 0.7094	Val Loss: 0.6685	Accuracy: 42.028986%
Epoch 001	Train Loss: 0.6467	Val Loss: 0.6501	Accuracy: 42.028986%
Epoch 002	Train Loss: 0.5938	Val Loss: 0.6241	Accuracy: 42.028986%
Epoch 003	Train Loss: 0.5452	Val Loss: 0.5892	Accuracy: 42.028986%
Epoch 004	Train Loss: 0.4966	Val Loss: 0.549	Accuracy: 55.555556%
Epoch 005	Train Loss: 0.4496	Val Loss: 0.5085	Accuracy: 66.183575%
Epoch 006	Train Loss: 0.4073	Val Loss: 0.4731	Accuracy: 73.913043%
Epoch 007	Train Loss: 0.3711	Val Loss: 0.4449	Accuracy: 74.879227%
Epoch 008	Train Loss: 0.3416	Val Loss: 0.423	Accuracy: 76.811594%
Epoch 009	Train Loss: 0.3189	Val Loss: 0.4076	Accuracy: 77.777778%
Epoch 010	Train Loss: 0.3012	Val Loss: 0.396	Accuracy: 77.294686%
Epoch 011	Train Loss: 0.2872	Val Loss: 0.3872	Accuracy: 78.260870%
Epoch 012	Train Loss: 0.2762	Val Loss: 0.3803	Accuracy: 78.260870%
Epoch 013	Train Loss: 0.2675	Val Loss: 0.3748	Accuracy: 78.260870%
Epoch 014	Train Loss: 0.2605	Val Loss: 0.3702	Accuracy: 79.227053

Sequential(
  (0): Linear(in_features=26, out_features=13, bias=True)
  (1): ReLU()
  (2): Linear(in_features=13, out_features=1, bias=True)
)

# 86.95% Accurate!
Now I'll run it for an absurd amount of epochs, until it clearly overfits, just to ensure we have the best accuracy.

In [None]:
training_loop(1000, model, loss_fcn, optimizer, train_res_dataloader, val_res_dataloader, logging=False)