# Feature Engineering

## Importing the required Libraries

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib
from scipy import sparse
import os
from dotenv import load_dotenv

# Load the .env file so the path variables read later with os.getenv are available
load_dotenv()

True

## Load the cleaned data

In [2]:
# Location of the cleaned dataset, supplied through the environment (.env)
clean_path = os.getenv('TELCO_DFCLEAN_PATH')
if not clean_path:
    # Fail with an actionable message; pd.read_csv(None) would otherwise raise
    # a generic "invalid file path or buffer" error that hides the real cause.
    raise ValueError("Environment variable 'TELCO_DFCLEAN_PATH' is not set")

# Read the cleaned data into a DataFrame
df = pd.read_csv(clean_path)

# Show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)

# Preview the first rows of the DataFrame
df.head()



Unnamed: 0,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,City,Zip Code,Number of Referrals,Tenure in Months,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Churn Label,Churn Score,CLTV
0,Male,78,No,Yes,No,No,0,Los Angeles,90022,0,1,No,0.0,No,Yes,8,No,No,Yes,No,No,Yes,No,No,Month-to-Month,39.65,39.65,0.0,20,0.0,59.65,3,Yes,91,5433
1,Female,74,No,Yes,Yes,Yes,1,Los Angeles,90063,1,8,Yes,48.85,Yes,Yes,17,No,Yes,No,No,No,No,No,Yes,Month-to-Month,80.65,633.3,0.0,0,390.8,1024.1,3,Yes,69,5302
2,Male,71,No,Yes,No,Yes,3,Los Angeles,90065,0,18,Yes,11.33,Yes,Yes,52,No,No,No,No,Yes,Yes,Yes,Yes,Month-to-Month,95.45,1752.55,45.61,0,203.94,1910.88,2,Yes,81,3179
3,Female,78,No,Yes,Yes,Yes,1,Inglewood,90303,1,25,Yes,19.76,No,Yes,12,No,Yes,Yes,No,Yes,Yes,No,Yes,Month-to-Month,98.5,2514.5,13.43,0,494.0,2995.07,2,Yes,88,5337
4,Female,80,No,Yes,Yes,Yes,1,Whittier,90602,1,37,Yes,6.33,Yes,Yes,14,No,No,No,No,No,No,No,Yes,Month-to-Month,76.5,2868.15,0.0,0,234.21,3102.36,2,Yes,67,2793


## Identifying and Splitting the Features and the Target

In [3]:
# The target is the churn label; keep it as its own Series in 'y'
y = df.loc[:, 'Churn Label']

# Every remaining column is a feature; collect them in 'X'
X = df.drop('Churn Label', axis='columns')

## Preprocessing Data

In [4]:
# Resolve both output locations first so a missing environment variable fails
# immediately, before any fitting work is done.
preprocessor_path = os.getenv('PREPROCESSOR_MODEL_PATH')
processed_df_path = os.getenv('TELCO_DFPROC_PATH')
if not preprocessor_path:
    raise ValueError("Environment variable 'PREPROCESSOR_MODEL_PATH' is not set")
if not processed_df_path:
    # to_csv(None) does not write a file: it silently returns the CSV text instead
    raise ValueError("Environment variable 'TELCO_DFPROC_PATH' is not set")

# Identifying numerical and categorical features by dtype
# NOTE(review): dtype-based selection treats 'Zip Code' as a numerical feature
# (see printed output) -- confirm that scaling it as a quantity is intended.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
print(numerical_features)
categorical_features = X.select_dtypes(include=['object']).columns
print(categorical_features)

# Creating the pipeline for the numerical data features
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())                 # Standardize features by removing the mean and scaling to unit variance
])

# Creating the pipeline for the categorical data features
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode; categories unseen at fit time are ignored at transform time
])

# Combining both pipelines into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Fit and transform the features
# NOTE(review): the preprocessor is fitted on every row of X; if a train/test
# split happens downstream, the scaler statistics will have seen the test rows.
X_processed = preprocessor.fit_transform(X)

# ColumnTransformer only returns a sparse matrix when the combined output is
# sparse enough (its sparse_threshold); otherwise it is already a dense array
# and has no .toarray(). Densify only when needed.
if sparse.issparse(X_processed):
    X_processed = X_processed.toarray()

# Convert X_processed to a DataFrame (columns stay positional: 0, 1, 2, ...).
# Reusing X's index keeps the rows aligned with y in the concat below.
X_processed_df = pd.DataFrame(X_processed, index=X.index)

# Saving the fitted preprocessor so the same transformation can be reused later
joblib.dump(preprocessor, preprocessor_path)

# Save the processed features together with the target to a CSV file
processed_df = pd.concat([X_processed_df, y.to_frame(name='Churn Label')], axis=1)
processed_df.to_csv(processed_df_path, index=False)

Index(['Age', 'Number of Dependents', 'Zip Code', 'Number of Referrals',
       'Tenure in Months', 'Avg Monthly Long Distance Charges',
       'Avg Monthly GB Download', 'Monthly Charge', 'Total Charges',
       'Total Refunds', 'Total Extra Data Charges',
       'Total Long Distance Charges', 'Total Revenue', 'Satisfaction Score',
       'Churn Score', 'CLTV'],
      dtype='object')
Index(['Gender', 'Under 30', 'Senior Citizen', 'Married', 'Dependents', 'City',
       'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract'],
      dtype='object')
