In [318]:
import numpy as np
import pandas as pd

### 1. Load the dataset and explore the variables.


In [319]:
df = pd.read_csv('customer_churn.csv')

In [320]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [321]:
# There are no null/0 values
# df.isna().sum()

### 2. We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.

In [322]:
data = df[['tenure','SeniorCitizen', 'MonthlyCharges','Churn']]

In [323]:
data.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.3,No
4,2,0,70.7,Yes


### 3. Extract the target variable. X-y Split


+ I am going to bin the SeniorCitizen variable because I think it will be more useful 

In [324]:
y = data['Churn']
X = data.drop(['Churn'], axis=1)

### 4. Extract the independent variables and scale them.


In [325]:
X.dtypes

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
dtype: object

### Train-test Split

In [326]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [327]:
X_train.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
dtype: int64

In [328]:
X_train.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
1421,9,1,84.4
3234,24,0,19.7
2763,64,0,81.05
5981,38,0,20.2
4999,16,0,19.7


### We are gonna separate between numerical and categorical variables

In [329]:
X_train_num = X_train.select_dtypes(include = np.number)
# X_train_cat = X_train.select_dtypes(include = object)

In [330]:
X_train_num.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
1421,9,1,84.4
3234,24,0,19.7
2763,64,0,81.05
5981,38,0,20.2
4999,16,0,19.7


In [331]:
X_train_num.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
dtype: int64

In [332]:
# X_train_num.max()

### We are going to normalize the numerical data

In [333]:
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_train_num)
X_train_num_normalized = MinMaxtransformer.transform(X_train_num)
X_train_num_normalized = pd.DataFrame(X_train_num_normalized,columns=X_train_num.columns)
display(X_train_num_normalized.head())

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,0.125,1.0,0.658209
1,0.333333,0.0,0.014428
2,0.888889,0.0,0.624876
3,0.527778,0.0,0.019403
4,0.222222,0.0,0.014428


### Concatenate the treated data and the target data

In [334]:
# X_train_normalized = pd.concat([X_train_num_normalized, y_train], axis=1)  # np.concatenate()

In [335]:
X_train_num_normalized.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
dtype: int64

In [336]:
X_train_num_normalized = X_train_num_normalized.fillna(0)

In [337]:
X_train_num_normalized.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
dtype: int64

In [338]:
display(X_train_num_normalized)

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,0.125000,1.0,0.658209
1,0.333333,0.0,0.014428
2,0.888889,0.0,0.624876
3,0.527778,0.0,0.019403
4,0.222222,0.0,0.014428
...,...,...,...
5629,0.222222,0.0,0.310448
5630,1.000000,0.0,0.673134
5631,0.361111,0.0,0.015423
5632,0.555556,0.0,0.381592


### 5. Build the logistic regression model.


In [339]:
X.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
dtype: int64

In [340]:
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='saga',
                multi_class='multinomial').fit(X_train_num_normalized, y_train)

### 6. Evaluate the model


In [341]:
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.48261178140525196