In [None]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the Telco customer-churn CSV (path is relative to the notebook's working directory)
data = pd.read_csv('datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Show the column labels as the cell output
data.columns

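- The dataset has no explicit missing values.
- `TotalCharges` is a numeric column, but it is stored as an object (string) column because some rows contain a blank space instead of a number. Those rows need to be removed.
- Apply label encoding and one-hot encoding to the categorical features before splitting the data.
- Scale the 3 numeric features (`tenure`, `MonthlyCharges`, `TotalCharges`); this is done after splitting.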

In [None]:
# TotalCharges is stored as strings because some entries are blank instead of
# numeric. Coerce every entry that cannot be parsed as a number to NaN (this
# covers a single space as well as empty or multi-space strings), then drop
# only the rows where TotalCharges itself is missing so that a NaN in any
# other column never silently removes a row here.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data = data.dropna(subset=['TotalCharges'])

In [None]:
# Print every column label followed by the distinct values found in that column
for col, column_values in data.items():
    print(col, column_values.unique())

In [None]:
# Remove the customerID column before encoding. Rebinding `data` (instead of
# dropping in place) together with errors='ignore' keeps the cell re-runnable:
# a second run finds the column already gone instead of raising KeyError.
data = data.drop(columns='customerID', errors='ignore')
data.head()

## encoding the categorical data

In [None]:
def label_encode_features(dataFrame, features, label_encoder=None):
    """Replace each listed column with integer codes and return the encoded frame.

    The encoder is refit on every column via ``fit_transform``, so the codes of
    one column are independent of the others.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    features : iterable
        Labels of the columns to encode.
    label_encoder : object, optional
        Any object exposing ``fit_transform(column)``. When omitted, a fresh
        ``LabelEncoder`` is created for this call, so no encoder state is
        shared between calls through the default argument.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataFrame`` with the listed columns replaced by their codes.
    """
    if label_encoder is None:
        label_encoder = LabelEncoder()

    # Work on a copy so the caller's frame stays untouched, matching
    # one_hot_encode_features, which also returns a new frame.
    encoded = dataFrame.copy()
    for feature in features:
        encoded[feature] = label_encoder.fit_transform(encoded[feature])
    return encoded

In [None]:
def one_hot_encode_features(dataFrame, feature_names, one_hot_encoder=None):
    """Replace each listed column with integer indicator columns.

    For every feature the encoder is refit on that single column, and the
    resulting indicator columns are named ``<feature>_<i>`` where ``i`` is the
    position of the column in the encoder output. The original feature columns
    are removed and the indicator columns are appended after the remaining
    columns, in the order the features were given.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    feature_names : iterable
        Labels of the columns to one-hot encode.
    one_hot_encoder : object, optional
        Any object exposing ``fit_transform`` on an ``(n_rows, 1)`` array. When
        omitted, a fresh ``OneHotEncoder(sparse_output=False)`` is created for
        this call, so no encoder state is shared between calls through the
        default argument.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns replaced by int64 indicator columns,
        indexed like ``dataFrame``.
    """
    if one_hot_encoder is None:
        one_hot_encoder = OneHotEncoder(sparse_output=False)

    feature_names = list(feature_names)
    indicator_frames = []
    for feature in feature_names:
        column_2d = dataFrame[feature].to_numpy().reshape(-1, 1)
        encoded = one_hot_encoder.fit_transform(column_2d)
        # A caller-supplied encoder may return a sparse matrix; densify it so
        # it can be wrapped in a DataFrame.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        encoded = np.asarray(encoded).astype(np.int64)
        column_labels = [f"{feature}_{i}" for i in range(encoded.shape[1])]
        indicator_frames.append(
            pd.DataFrame(encoded, columns=column_labels, index=dataFrame.index)
        )

    # Drop the raw columns once and concatenate once, rather than rebuilding
    # the whole frame on every loop iteration.
    remaining = dataFrame.drop(columns=feature_names)
    return pd.concat([remaining, *indicator_frames], axis=1)

In [None]:
# Columns passed to label_encode_features (integer codes)
label_features = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    "Churn",
]

# Columns passed to one_hot_encode_features (indicator columns)
one_hot_features = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

In [None]:
# Run both encoding passes in sequence: label encoding first, then one-hot encoding
data = (
    data
    .pipe(label_encode_features, label_features)
    .pipe(one_hot_encode_features, one_hot_features)
)

In [None]:
# Preview the first rows of the frame after encoding
data.head()

## Removing Outliers

In [None]:
# visualize the data
# The plots read from `data`: `X` is only created further down, in the
# train/test split cell, so referencing it here fails on a fresh kernel.
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(1, len(numeric_columns), figsize=(15, 4))
for ax, column in zip(axes, numeric_columns):
    # histplot with a KDE overlay and density scaling replaces the deprecated
    # distplot, so no blanket warning filter is needed to keep the output clean.
    sns.histplot(data[column], kde=True, stat='density', ax=ax)
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

In [None]:
def remove_outliers(dataFrame, columns, n_std=3):
    """Keep only rows within ``n_std`` standard deviations of each column's mean.

    For every listed column the accepted range is
    ``[mean - n_std * std, mean + n_std * std]``. Mean and standard deviation
    are computed on the frame as passed in, so the result does not depend on
    the order of ``columns``. A row is kept only if it lies inside the range
    for all listed columns; rows with NaN in a listed column are dropped.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable
        Labels of the numeric columns to check.
    n_std : float, optional
        Half-width of the accepted range in standard deviations (default 3).

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataFrame`` that pass the check for every column.
    """
    keep = np.ones(len(dataFrame), dtype=bool)
    for column in columns:
        values = dataFrame[column]
        spread = values.std()
        if pd.isna(spread):
            # The standard deviation is undefined (e.g. fewer than two values),
            # so there is no range to test against; leave the rows untouched.
            continue
        center = values.mean()
        # Centre the band on the mean: a band centred on zero would discard
        # ordinary values of any column whose mean is far from zero.
        within = values.between(center - n_std * spread, center + n_std * spread)
        keep &= within.to_numpy()
    return dataFrame[keep]

# Filter the three numeric columns with remove_outliers, then show the resulting (rows, columns) shape
data = remove_outliers(data, ['tenure', 'MonthlyCharges', 'TotalCharges'])
data.shape

# Test and Training split

In [None]:
# Separate the predictors from the Churn target; the target stays a one-column DataFrame
X = data.drop(columns=['Churn'])
y = data[['Churn']]

# Hold out 20% of the rows as the test set, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=74,
    stratify=y,
)

## scaling the data

In [None]:
# Summary statistics of the training predictors before scaling
X_train.describe()

In [None]:
# scale X
# Standardise the three numeric columns; the scaler is fitted on X_train only.
# NOTE(review): X_test is not transformed in this cell. It needs
# scaler.transform(X_test[numeric_data]) before it is used for evaluation;
# confirm that this happens elsewhere.
scaler = StandardScaler()
numeric_data = ['tenure', 'MonthlyCharges', 'TotalCharges']
X_train[numeric_data] = scaler.fit_transform(X_train[numeric_data])


In [None]:
# Count missing values per column in the scaled training predictors
X_train.isnull().sum()

# training 
## training and validation split

In [None]:
# Carve a validation set out of the training data (20%, stratified, fixed seed).
# NOTE: X_train / y_train are rebound to the reduced training part, so running
# this cell a second time splits the already reduced set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=74, stratify=y_train
)

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so flatten it for fitting.
model = LogisticRegression()
model.fit(X_train, y_train.to_numpy().ravel())

# Accuracy on the validation set is the cell output
y_pred = model.predict(X_val)
accuracy_score(y_val, y_pred)
