# loading libraries

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# Loading Dataset

In [64]:
df= pd.read_csv("DATA_Customer-Churn.csv")
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [65]:
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [70]:
df["TotalCharges"][488]

' '

In [73]:
#convert total charges column to numerical
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"],errors="coerce")

In [74]:
numericals= df._get_numeric_data()
numericals

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.50
2,0,2,53.85,108.15
3,0,45,42.30,1840.75
4,0,2,70.70,151.65
...,...,...,...,...
7038,0,24,84.80,1990.50
7039,0,72,103.20,7362.90
7040,0,11,29.60,346.45
7041,1,4,74.40,306.60


In [75]:
#distribution of numerical data
numericals.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


In [77]:
categoricals=  df.select_dtypes(include=['object'])
categoricals

Unnamed: 0,gender,Partner,Dependents,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,Churn
0,Female,Yes,No,No,No,Yes,No,No,No,No,Month-to-month,No
1,Male,No,No,Yes,Yes,No,Yes,No,No,No,One year,No
2,Male,No,No,Yes,Yes,Yes,No,No,No,No,Month-to-month,Yes
3,Male,No,No,No,Yes,No,Yes,Yes,No,No,One year,No
4,Female,No,No,Yes,No,No,No,No,No,No,Month-to-month,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,No
7039,Female,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,One year,No
7040,Female,Yes,Yes,No,Yes,No,No,No,No,No,Month-to-month,No
7041,Male,Yes,No,Yes,No,No,No,No,No,No,Month-to-month,Yes


## hot encoding categorical features

In [78]:
categoricals_with_dummies = pd.get_dummies(categoricals, sparse=True)
categoricals_with_dummies

Unnamed: 0,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,OnlineSecurity_No,OnlineSecurity_No internet service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Churn_No,Churn_Yes
0,1,0,0,1,1,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,1,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
2,0,1,1,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
3,0,1,1,0,1,0,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0
4,1,0,1,0,1,0,0,1,1,0,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,1,0,1,0,0,...,0,1,0,0,1,0,1,0,1,0
7039,1,0,0,1,0,1,0,1,1,0,...,0,1,0,0,1,0,1,0,1,0
7040,1,0,0,1,0,1,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
7041,0,1,0,1,1,0,0,1,1,0,...,0,0,1,0,0,1,0,0,0,1


## Scalling numerical features 

In [83]:
# define standard scaler
scaler = StandardScaler()
# transform data
scaled = scaler.fit_transform(numericals)
print(scaled)

[[-0.43991649 -1.27744458 -1.16032292 -0.99419409]
 [-0.43991649  0.06632742 -0.25962894 -0.17373982]
 [-0.43991649 -1.23672422 -0.36266036 -0.95964911]
 ...
 [-0.43991649 -0.87024095 -1.1686319  -0.85451414]
 [ 2.27315869 -1.15528349  0.32033821 -0.87209546]
 [-0.43991649  1.36937906  1.35896134  2.01234407]]


In [86]:
#splitting data into train and test 
df_train, df_test = train_test_split(df, test_size=0.25, random_state=42)

In [97]:
x=df.drop(['Churn'], axis=1)
y=['Churn']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

ValueError: Found input variables with inconsistent numbers of samples: [7043, 1]

In [98]:
x.shape

(7043, 15)

In [101]:
df['Churn']

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [96]:
model = LinearRegression()
model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

NameError: name 'X' is not defined