# Logistic Regression for telco data
- y = Target (churn)
- X = Features 

## Data preparation

In [1]:
# Import needed libraries
import pandas as pd 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read data and show first 5 rows
df = pd.read_csv('data/Telco_Customer_Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Convert column names to lower case and replace spaces with underscores

In [3]:
# Restructure the data columns to make it easier to work with
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head(0)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn


Do simillar to the other categorical variables.

In [4]:
# Get colums of data type 'object'
cat_cols = list(df.dtypes[df.dtypes == 'object'].index)
cat_cols

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

In [5]:
# Loop through the columns and apply the lower case and replace the spaces with underscores
for col in cat_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')

Convert churn column values to binary (1 and 0)

In [6]:
# Transform yes and no to 1 and 0 respectively
df.churn = (df.churn == 'yes').astype(int)

In [7]:
df['churn'].head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int32

Crosscheck all the data types of the columns

In [8]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                 int32
dtype: object

The 'totalcharges' column seems to be a float but with a data type of object. Hence it will be converted to numeric with the help pandas. Any missing values that may result will be imputed

In [9]:
# Convert totalcharges to numeric
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [10]:
# Fill null values to the mean of the column
df.totalcharges = df.totalcharges.fillna(df.totalcharges.mean())

In [11]:
# Check for missing values in each column
df.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

### Setting up our validation framework

In [12]:
# Import train_test_split from scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

Split the dataset into training, validation, and testing sets.

In [13]:
# Split the data into 20% for testing and 80% for training and testing.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the training data further into 80% for training and 20% for validation.
# To getting 20% of the entire data, we'll need 25% of the remaining training data
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

In [14]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [15]:
# Get the target values for the training, validation, and test sets.
y_train = df_train.churn.values
y_valid = df_val.churn.values
y_test = df_test.churn.values

In [17]:
# Delete Target variable from the train dataset
del df_train['churn']
del df_test['churn']
del df_val['churn']

### Feature Importance: Mutual Information
It tells us how much we can learn about one variable if we know the value of the other
- Usually done for categorical variables

In [18]:
# import library
from sklearn.metrics import mutual_info_score

Get all categorical variables

In [19]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 
               'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 
               'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 
               'contract', 'paperlessbilling', 'paymentmethod']

numerical = ['monthlycharges', 'tenure', 'totalcharges']

In [20]:
# Fuction to get the mutual information score for each column
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_train_full.churn)

The higher the mutual information score the more important that variable is to our model

In [21]:
# Apply the mutual information to the categorical variables
mi = df_train_full[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

### Feature importance : Correlation
We use correlation to find the feature importance. A positive correlation means that if one feature increases, the other feature also increases. A negative correlation means that if one feature increases, the other feature decreases.
- For numerical varialbles

In [22]:
# Correlation between numerical variable and churn
df_train_full[numerical].corrwith(df_train_full.churn)

monthlycharges    0.196805
tenure           -0.351885
totalcharges     -0.197365
dtype: float64

To check for the most important variable regardless of the direction (positive or negative), you can use absolute. Here the higher the value the more important the variable is.

### One-Hot Encoding

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
dv = DictVectorizer(sparse=False)

In [26]:
# Transform datasets in to dictionaries
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')
test_dict = df_test[categorical + numerical].to_dict(orient='records')

In [27]:
# Fit and transform the training data
X_train = dv.fit_transform(train_dict)
# Only transform the validation and test data
X_valid = dv.transform(val_dict)
X_test = dv.transform(test_dict)

### Model Training
Logistic regression

In [125]:
# Import libraries
from sklearn.linear_model import LogisticRegression

In [126]:
# Define the model and fit or train the model 
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


This is called 'hard prediction' as it predicts weither the output is 0 or 1 (ie, not churn or churn).

In [127]:
# Hard predictions
model.predict(X_train)

array([0, 1, 1, ..., 1, 0, 1])

This is also called soft prediction as it provides the probability of not churning and the probability of churning. The first column is the probability of not churning (0) and the second column is the probability of churning (1).
- Also, by default when the probability is more than 0.5, the customer is predicted to churn, and when the probability is less than 0.5, the customer is predicted to not churn. That is how it uses it to hard predict the results.

In [128]:
# Soft predictions
model.predict_proba(X_train)

array([[0.90889029, 0.09110971],
       [0.31954396, 0.68045604],
       [0.36569921, 0.63430079],
       ...,
       [0.46883538, 0.53116462],
       [0.95692347, 0.04307653],
       [0.30080001, 0.69919999]])

To check if our results match

In [129]:
# Make predictions on validation data, selecting the second variable
y_pred = model.predict_proba(X_valid)[:, 1]
y_pred

array([0.00918802, 0.20171875, 0.21017809, ..., 0.13717159, 0.80015174,
       0.83819462])

In [130]:
churn_pred = (y_pred >= 0.5)
# churn_pred.astype(int)

The model is 80% correct

In [131]:
# Tho churn_pred returns True and False, 
# since False is 0 and True is 1, 
# we can use the mean() method to calculate the accuracy of our model.
(y_valid == churn_pred).mean()

0.8041163946061036

Prediction on the test data

In [None]:
# Hard prediction
test_prediction = model.predict(X_test)
# Soft prediction (ie., with probability), focusing on the second column
test_prediction_prob = model.predict_proba(X_test)[:, 1]

To make it simpler to understand lets view the data in a table format

In [132]:
# Create a datafram
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_pred.astype(int)
df_pred['actual_val'] = y_valid
df_pred['is_correct'] = df_pred['prediction'] == df_pred['actual_val']

In [133]:
df_pred

Unnamed: 0,probability,prediction,actual_val,is_correct
0,0.009188,0,0,True
1,0.201719,0,0,True
2,0.210178,0,0,True
3,0.542262,1,1,True
4,0.210844,0,0,True
...,...,...,...,...
1404,0.319738,0,0,True
1405,0.040036,0,1,False
1406,0.137172,0,0,True
1407,0.800152,1,1,True


Checking the accuracy of the data using mean_absolute_error and mean_squared_error

In [134]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [135]:
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')

MAE: 0.276645282678803
MSE: 0.1382404420506404


Test the model on the test dataset

In [140]:
# Testing the accuracy with mean_absolute_error
mae_test = mean_absolute_error(y_test, test_prediction_prob)
print(mae_test)

0.2643840833159707
