In [69]:
import numpy as np
import pandas as pd

### 1. Load the dataset and explore the variables.


In [70]:
df = pd.read_csv('customer_churn.csv')

In [71]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [72]:
# There are no null/0 values
# df.isna().sum()

### 2. We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.

In [73]:
data = df[['tenure','SeniorCitizen', 'MonthlyCharges','Churn']]

In [74]:
data.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.3,No
4,2,0,70.7,Yes


### 3. Extract the target variable. X-y Split


+ I am going to bin the SeniorCitizen variable because I think it will be more useful 

In [75]:
data['SeniorCitizen'] = data['SeniorCitizen'].astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SeniorCitizen'] = data['SeniorCitizen'].astype(object)


In [76]:
y = data['Churn']
X = data.drop(['Churn'], axis=1)

### 4. Extract the independent variables and scale them.


In [77]:
X.dtypes

tenure              int64
SeniorCitizen      object
MonthlyCharges    float64
dtype: object

### Train-test Split

In [78]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
X_train.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
2142,21,0,64.85
1623,54,0,97.2
6074,1,0,23.45
1362,4,0,70.2
6754,0,0,61.9


### We are gonna separate between numerical and categorical variables

In [80]:
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)

In [81]:
# X_train_num.max()

In [88]:
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_train_num)
X_num_normalized = MinMaxtransformer.transform(X_train_num)
print(type(X_num_normalized))
X_num_normalized = pd.DataFrame(X_num_normalized,columns=X_train_num.columns)
display(X_num_normalized.head())
print(type(X_num_normalized))

<class 'numpy.ndarray'>


Unnamed: 0,tenure,MonthlyCharges
0,0.291667,0.464375
1,0.75,0.786746
2,0.013889,0.051819
3,0.055556,0.517688
4,0.0,0.434978


<class 'pandas.core.frame.DataFrame'>


In [83]:
# X_train_num.head()

In [84]:
# X_train_cat.head()

In [89]:
X = pd.concat([X_num_normalized, y], axis=1)  # np.concatenate()

In [90]:
X.head()

Unnamed: 0,tenure,MonthlyCharges,Churn
0,0.291667,0.464375,No
1,0.75,0.786746,No
2,0.013889,0.051819,Yes
3,0.055556,0.517688,No
4,0.0,0.434978,Yes
