In [32]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option("display.max_columns", None)
from pathlib import Path

df = pd.read_csv(r"C:\Users\utkar\Desktop\ProjectX\ProjectX\Notebook\Telco_Customer_Churn.csv")



# Print shape of dataset
print(df.shape)

(7043, 21)


In [33]:
##these are the features with nan value
features_with_na=[features for features in df.columns if df[features].isnull().sum()>=1]
for feature in features_with_na:
    print(feature,np.round(df[feature].isnull().mean()*100,5), '% missing values')

In [34]:
features_with_na

[]

In [35]:

df.duplicated().sum()

np.int64(0)

In [36]:
df.columns.tolist()

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [37]:
# To drop multiple columns in-place
df.drop(['customerID', 'PhoneService','gender','TotalCharges','StreamingMovies'], inplace=True, axis=1)

In [38]:

df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,0,Yes,No,1,No phone service,DSL,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,29.85,No
1,0,No,No,34,No,DSL,Yes,No,Yes,No,No,One year,No,Mailed check,56.95,No
2,0,No,No,2,No,DSL,Yes,Yes,No,No,No,Month-to-month,Yes,Mailed check,53.85,Yes
3,0,No,No,45,No phone service,DSL,Yes,No,Yes,Yes,No,One year,No,Bank transfer (automatic),42.3,No
4,0,No,No,2,No,Fiber optic,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,Yes


In [39]:
df['SeniorCitizen'] = df['SeniorCitizen'].astype('object')

In [40]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 2


In [41]:
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', cat_features)

Num of Categorical Features : ['SeniorCitizen', 'Partner', 'Dependents', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [42]:
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_features))

Num of Discrete Features : 0


In [43]:

continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :',len(continuous_features))

Num of Continuous Features : 2


In [44]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [45]:
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

In [46]:
# If the target column has No it is encoded as 1 others as 0
y= np.where(y=='No', 1,0)

In [47]:
y

array([1, 1, 0, ..., 1, 0, 1], shape=(7043,))

In [48]:
X.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges
0,0,Yes,No,1,No phone service,DSL,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,29.85
1,0,No,No,34,No,DSL,Yes,No,Yes,No,No,One year,No,Mailed check,56.95
2,0,No,No,2,No,DSL,Yes,Yes,No,No,No,Month-to-month,Yes,Mailed check,53.85
3,0,No,No,45,No phone service,DSL,Yes,No,Yes,Yes,No,One year,No,Bank transfer (automatic),42.3
4,0,No,No,2,No,Fiber optic,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7


In [49]:
for features in cat_features:
    print(features,':',df[features].unique())

SeniorCitizen : [0 1]
Partner : ['Yes' 'No']
Dependents : ['No' 'Yes']
MultipleLines : ['No phone service' 'No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes' 'No internet service']
OnlineBackup : ['Yes' 'No' 'No internet service']
DeviceProtection : ['No' 'Yes' 'No internet service']
TechSupport : ['No' 'Yes' 'No internet service']
StreamingTV : ['No' 'Yes' 'No internet service']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['Yes' 'No']
PaymentMethod : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn : ['No' 'Yes']


In [50]:
df['SeniorCitizen'].unique()

array([0, 1], dtype=object)

In [51]:
num_features = list(X.select_dtypes(exclude="object").columns)

In [52]:
num_features

['tenure', 'MonthlyCharges']

In [53]:
df.columns.tolist()

['SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'Churn']

In [54]:
# Assuming you have your dataframe 'df' loaded
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

# Define column lists
or_columns = ['Contract', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'Partner',
 'Dependents','PaperlessBilling','TechSupport', 'StreamingTV']
oh_columns = ['MultipleLines', 'InternetService', 'PaymentMethod']
num_features = ['tenure', 'SeniorCitizen','MonthlyCharges']  # SeniorCitizen is already 0/1

 


# Create transformers
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first', sparse_output=False)
ordinal_encoder = OrdinalEncoder()

# Create ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, oh_columns),
        ("Ordinal_Encoder", ordinal_encoder, or_columns),
        ("StandardScaler", numeric_transformer, num_features)
    ],
    remainder='drop'  # or 'passthrough' if you want to keep other columns
)



In [55]:
X = preprocessor.fit_transform(X)

In [56]:
X

array([[ 1.        ,  0.        ,  0.        , ..., -1.27744458,
        -0.43991649, -1.16032292],
       [ 0.        ,  0.        ,  0.        , ...,  0.06632742,
        -0.43991649, -0.25962894],
       [ 0.        ,  0.        ,  0.        , ..., -1.23672422,
        -0.43991649, -0.36266036],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.87024095,
        -0.43991649, -1.1686319 ],
       [ 0.        ,  1.        ,  1.        , ..., -1.15528349,
         2.27315869,  0.32033821],
       [ 0.        ,  0.        ,  1.        , ...,  1.36937906,
        -0.43991649,  1.35896134]], shape=(7043, 19))

In [57]:
from sklearn.preprocessing import StandardScaler
final_scaler = StandardScaler()
X = final_scaler.fit_transform(X)

In [58]:
from imblearn.combine import SMOTETomek, SMOTEENN

# Resampling the minority class. The strategy can be changed as required.
smt = SMOTEENN(random_state=42,sampling_strategy='minority' )
# Fit the model to generate the data.
X_res, y_res = smt.fit_resample(X, y)