In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## <span style = 'background :lightblue'>STEP 1 : Data Collection</span>

In [2]:
# Location of the raw Telco customer-churn CSV.
# NOTE(review): absolute, machine-specific Windows path - the notebook cannot run on
# another machine without editing this line; consider a path relative to a data folder.
filepath_telecom = r"E:\STUDY\Pianalytix\Project assignment\Telco-Customer-Churn.csv"

In [3]:
# Load the raw data; header = 0 takes the column names from the first row of the file.
telecom = pd.read_csv(filepath_telecom , header = 0 )

In [4]:
# Preview the first rows of the raw data.
telecom.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## <span style = 'background :lightblue'>STEP 2 : Feature Engineering (FE)</span>

In [5]:
# Work on a copy so the raw frame `telecom` is left untouched by the steps below.
df = telecom.copy()

In [6]:
# Column dtypes and non-null counts before feature engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
def tenure_attr(DF):
    """Bucket the 'tenure' column (months) into whole years, stored as strings.

    A tenure of 0 is first replaced by 1 so that it lands in the first bucket;
    the value is then divided by 12 and rounded up, giving '1' for 1-12 months,
    '2' for 13-24 months, and so on.

    The column is overwritten on ``DF`` itself and the same frame is returned.
    """
    months = DF['tenure'].replace(to_replace = [0] , value = [1])
    year_bucket = np.ceil(months / 12).astype(np.int64)

    # Stored as text so that later steps treat the bucket as a category.
    DF['tenure'] = year_bucket.astype(str)
    return DF

In [8]:
def sen_citz_encode(DF):
    """Turn the 0/1 'SeniorCitizen' flag into the text labels 'NO' / 'Yes'.

    The column is overwritten on ``DF`` itself and the same frame is returned.
    """
    # NOTE(review): the label for 0 is upper-case 'NO' and becomes part of the
    # one-hot column name later on - keep it unless downstream names change too.
    flag_to_label = {1: 'Yes', 0: 'NO'}

    labelled = DF['SeniorCitizen'].replace(flag_to_label)
    DF['SeniorCitizen'] = labelled.astype(str)
    return DF
  

In [9]:
def TotalCharges_to_num(DF):
    """Convert the 'TotalCharges' column to a numeric dtype.

    Entries that cannot be parsed as numbers become NaN (errors = 'coerce').
    The column is overwritten on ``DF`` itself and the same frame is returned.
    """
    raw_charges = DF['TotalCharges']
    DF['TotalCharges'] = pd.to_numeric(raw_charges , errors = 'coerce')
    return DF

In [10]:
def classlabel_encode(DF):
    """Encode the target column 'Churn' as integers: 'Yes' -> 1, 'No' -> 0.

    The column is overwritten on ``DF`` itself (dtype int64) and the same frame
    is returned.
    """
    label_to_code = {'Yes': 1, 'No': 0}

    coded = DF['Churn'].replace(label_to_code)
    DF['Churn'] = coded.astype(np.int64)
    return DF

In [11]:
def FE(DataFrame):
    """Run all feature-engineering steps and return the engineered frame.

    Steps, in order:
      1. tenure_attr          - bucket 'tenure' into year strings
      2. sen_citz_encode      - relabel the 'SeniorCitizen' flag as text
      3. TotalCharges_to_num  - make 'TotalCharges' numeric
      4. classlabel_encode    - encode 'Churn' as 0/1

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Raw frame containing the four columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, engineered frame; the argument itself is not modified.
    """
    # Copy once so the caller's frame is never altered by the in-place steps.
    engineered = DataFrame.copy()

    # Feed every step the result of the previous one, so the chain does not
    # depend on the helpers mutating a shared object.
    engineered = tenure_attr(DF = engineered)
    engineered = sen_citz_encode(DF = engineered)
    engineered = TotalCharges_to_num(DF = engineered)
    engineered = classlabel_encode(DF = engineered)

    return engineered
 

In [12]:
# Always start from a fresh copy of the raw frame so this cell can be re-run:
# feeding an already-engineered `df` back into FE fails once 'tenure' holds strings.
df = FE(DataFrame=telecom.copy())

In [13]:
# Dtypes after feature engineering: 'SeniorCitizen' and 'tenure' are now object (string) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   object 
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [14]:
# df['SeniorCitizen'].value_counts()

In [15]:
# df['tenure'].value_counts()

In [16]:
# df['Churn'].value_counts()

## <span style = 'background :lightblue'>STEP 3 : Package importing</span>

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

## <span style = 'background :lightblue'>STEP 4 : Pipeline</span>

In [18]:
# Copy of the engineered frame, used below to derive the numeric / categorical column lists.
# NOTE(review): the later pipeline_trans call is given `df`, not `df2`.
df2 = df.copy()

In [19]:
def num_cat_col_list(DF):
    """Split the column names of ``DF`` into numeric and string columns.

    Returns
    -------
    tuple(list, list)
        ``(num_list, cat_list)`` in column order. A column is tested for a
        numeric dtype first and for a string dtype only if it is not numeric;
        columns that are neither appear in neither list.
    """
    from pandas.api.types import is_numeric_dtype, is_string_dtype

    numeric_cols = [name for name in DF if is_numeric_dtype(DF[name])]

    string_cols = [
        name
        for name in DF
        if not is_numeric_dtype(DF[name]) and is_string_dtype(DF[name])
    ]

    return (numeric_cols , string_cols)

In [20]:
# Derive the numeric and string column names from the engineered data and print both lists.
num_list , cat_list = num_cat_col_list(DF = df2)

print("Numerical column : ",num_list)
print()
print("Categorical column : ",cat_list)

Numerical column :  ['MonthlyCharges', 'TotalCharges', 'Churn']

Categorical column :  ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


### Building num_feat and cat_feat: drop the target (Churn) and the ID column (customerID)

In [21]:
# Numeric model features: every numeric column except the target 'Churn'.
# (list.remove raises ValueError if the name is not present.)
num_feat = num_list.copy()
num_feat.remove('Churn')

# Categorical model features: every string column except the identifier 'customerID'.
cat_feat = cat_list.copy()
cat_feat.remove('customerID')

In [22]:
def pipeline_trans(DF,num_feat ,cat_feat, test_size = 0.2, random_state = 100):
    """Split ``DF`` into train / test sets and return both as transformed frames.

    The preprocessing is fitted on the training split only and then applied to
    the test split:

    * numeric features  - median imputation followed by standard scaling
    * nominal features  - one-hot encoding
    * 'customerID'      - dropped

    Parameters
    ----------
    DF : pandas.DataFrame
        Engineered data containing the target column 'Churn', the column
        'customerID' and every column named in ``num_feat`` / ``cat_feat``.
    num_feat : list of str
        Numeric feature columns.
    cat_feat : list of str
        Nominal (string) feature columns.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 100
        Seed of the train / test split.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train_trans, df_test_trans)``: transformed features with 'Churn'
        appended as the last column, both indexed 0..n-1.

    NOTE(review): the split is not stratified on 'Churn'.
    """
    # Identifier column, dropped explicitly by the column transformer.
    drop_columns = ["customerID"]

    # Separating independent and dependent variables
    X  = DF.drop(labels = ['Churn' ], axis = 1)
    y  = DF['Churn']

    # Train-test split
    X_train,X_test,y_train,y_test = train_test_split(
        X,y , random_state = random_state ,test_size = test_size )

    # Pipelines
    num_pipeline = Pipeline([('Imputer' , SimpleImputer(strategy='median')) ,
                             ("scaling", StandardScaler())
                            ])

    cat_pipeline = Pipeline([('Nominal Encoding' , OneHotEncoder())])

    # sparse_threshold = 0 forces a dense array. With the default threshold the
    # result turns into a scipy sparse matrix whenever the one-hot block makes
    # the overall density low, and pd.DataFrame(...) below cannot consume that.
    pipeline = ColumnTransformer([("Num pipeline" , num_pipeline , num_feat),
                                  ("Nominal pipeline" , cat_pipeline , cat_feat),
                                  ("drop cols trans" , 'drop' ,drop_columns )
                                 ],
                                 sparse_threshold = 0)

    # Fit on the training split only, then apply the fitted transform to the test split.
    X_train_tr = pipeline.fit_transform(X_train)
    X_test_tr =  pipeline.transform(X_test)

    # Output columns after transformation: strip only the leading
    # "<transformer name>__" prefix, so a feature name that itself contains
    # "__" is kept intact.
    out_cols = [col.split("__", 1)[-1] for col in pipeline.get_feature_names_out()]

    # Conversion to train and test DataFrames
    X_train_tr_df = pd.DataFrame(X_train_tr, columns = out_cols)
    X_test_tr_df = pd.DataFrame(X_test_tr , columns = out_cols)

    # Re-index the targets 0..n-1 so they align row-by-row with the new frames.
    y_train = y_train.reset_index(drop = True)
    y_test = y_test.reset_index(drop = True)

    df_train_trans = pd.concat([X_train_tr_df,y_train] ,axis = 1 )
    df_test_trans = pd.concat([X_test_tr_df,y_test] ,axis = 1 )

    return df_train_trans,df_test_trans

In [25]:
# Split the engineered data and run both splits through the preprocessing pipeline.
df_train_tr ,df_test_tr = pipeline_trans(DF = df , num_feat = num_feat , cat_feat = cat_feat)

In [26]:
# Preview of the transformed training split (scaled numerics, one-hot columns, target last).
df_train_tr.head(3)

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_NO,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,-0.338701,0.049061,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0
1,1.030607,0.139714,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
2,0.147022,-0.956056,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [28]:
# Preview of the transformed test split.
df_test_tr.head(3)

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_NO,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,-1.457688,-0.536472,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0
1,-1.480896,-0.374264,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,0.142049,0.81081,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0


In [48]:
def attr_dtype_to_int(DF, float_cols = ('MonthlyCharges', 'TotalCharges')):
    """Return a copy of ``DF`` with every non-float column downcast to integers.

    Each column not listed in ``float_cols`` is passed through
    ``pd.to_numeric(..., errors = 'coerce', downcast = 'integer')``: values that
    cannot be parsed become NaN, and columns holding only whole numbers
    (such as 0.0 / 1.0 indicator columns) end up with the smallest integer
    dtype that fits.

    Parameters
    ----------
    DF : pandas.DataFrame
        Transformed data; left unmodified.
    float_cols : iterable of str, default ('MonthlyCharges', 'TotalCharges')
        Columns to leave untouched. All of them must exist in ``DF``.

    Returns
    -------
    pandas.DataFrame
        New frame with the converted columns.

    Raises
    ------
    ValueError
        If a name in ``float_cols`` is not a column of ``DF``.
    """
    keep_as_is = set(float_cols)

    missing = [name for name in float_cols if name not in DF.columns]
    if missing:
        raise ValueError(f"columns not found in DF: {missing}")

    # Work on a copy so the frame handed in by the caller keeps its dtypes.
    converted = DF.copy()

    for attr in converted.columns:
        if attr in keep_as_is:
            continue
        converted[attr] = pd.to_numeric(converted[attr] , errors = 'coerce' , downcast = 'integer')

    return converted
        

In [51]:
# Convert every column except MonthlyCharges / TotalCharges to integer dtype in both splits.
d_train = attr_dtype_to_int(DF = df_train_tr)
d_test = attr_dtype_to_int(DF = df_test_tr)

In [52]:
# Check: indicator columns and the target now display as integers in the training split.
d_train.head(2)

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_NO,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,-0.338701,0.049061,0,1,1,0,0,1,1,0,...,0,0,1,1,0,1,0,0,0,0
1,1.030607,0.139714,1,0,1,0,1,0,1,0,...,1,0,0,0,1,1,0,0,0,0


In [53]:
# Same check for the test split.
d_test.head(2)

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_NO,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,-1.457688,-0.536472,0,1,1,0,0,1,1,0,...,0,0,1,1,0,0,0,0,1,0
1,-1.480896,-0.374264,0,1,1,0,1,0,1,0,...,0,0,1,1,0,1,0,0,0,0


## <span style = 'background :lightblue'>STEP 5 : Export the DF as a .csv file.</span>

In [54]:
# Independent copies of the final train / test frames, used for the export below.
D_train = d_train.copy()
D_test = d_test.copy()

In [56]:
# Write both splits to CSV files in the current working directory.
# NOTE(review): `index` is not set, so pandas also writes the row index as an
# unnamed first column; pass index=False if downstream readers do not expect it.
D_train.to_csv(r"Part2_test_imbalance_Train_DS.csv")
D_test.to_csv(r"Part2_test_imbalance_Test_DS.csv")