In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os


In [2]:
# Load the cleaned dataset
data_path = "../data/processed/cleaned_data.csv"
data_cleaned = pd.read_csv(data_path)

# Display the first few rows of the dataset to confirm it loaded correctly
data_cleaned.head()


Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,...,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,Male,78,No,Yes,No,No,0,United States,California,...,20,0.0,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,United States,California,...,0,390.8,1024.1,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,Male,71,No,Yes,No,Yes,3,United States,California,...,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,United States,California,...,0,494.0,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,United States,California,...,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges


In [3]:
# Check for missing values
missing_values = data_cleaned.isnull().sum()
print(f"Missing Values in Each Column:\n{missing_values}")

# Handle missing values
# Impute missing values for numerical columns with the median
numerical_cols = ['Tenure in Months', 'Monthly Charge', 'Total Charges']  # Update if needed
for col in numerical_cols:
    data_cleaned[col].fillna(data_cleaned[col].median(), inplace=True)

# Impute missing values for categorical columns with the mode
categorical_cols = ['Gender', 'Senior Citizen', 'Married', 'Phone Service']  # Update if needed
for col in categorical_cols:
    data_cleaned[col].fillna(data_cleaned[col].mode()[0], inplace=True)

# Verify if there are any missing values after imputation
missing_values_after_imputation = data_cleaned.isnull().sum()
print(f"Missing Values After Imputation:\n{missing_values_after_imputation}")


Missing Values in Each Column:
Customer ID                             0
Gender                                  0
Age                                     0
Under 30                                0
Senior Citizen                          0
Married                                 0
Dependents                              0
Number of Dependents                    0
Country                                 0
State                                   0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Population                              0
Referred a Friend                       0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                        0
Int

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[col].fillna(data_cleaned[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[col].fillna(data_cleaned[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [4]:
# Encode categorical variables using LabelEncoder
label_enc = LabelEncoder()

# Encode each categorical column
for col in categorical_cols:
    data_cleaned[col] = label_enc.fit_transform(data_cleaned[col])

# Verify the encoding
print(f"Categorical columns after encoding:\n{data_cleaned[categorical_cols].head()}")


Categorical columns after encoding:
   Gender  Senior Citizen  Married  Phone Service
0       1               1        0              0
1       0               1        1              1
2       1               1        0              1
3       0               1        1              1
4       0               1        1              1


In [5]:
# Scale numerical variables using StandardScaler
scaler = StandardScaler()

# Scale the numerical columns
data_cleaned[numerical_cols] = scaler.fit_transform(data_cleaned[numerical_cols])

# Verify the scaling
print(f"Scaled numerical columns:\n{data_cleaned[numerical_cols].head()}")


Scaled numerical columns:
   Tenure in Months  Monthly Charge  Total Charges
0         -1.278988       -0.834611      -0.988823
1         -0.993743        0.528063      -0.726848
2         -0.586250        1.019955      -0.232929
3         -0.301005        1.121324       0.103315
4          0.187986        0.390134       0.259379


In [6]:
# Save the processed and scaled data
scaled_path = "../data/processed/scaled_data.csv"
os.makedirs(os.path.dirname(scaled_path), exist_ok=True)  # Ensure the directory exists
data_cleaned.to_csv(scaled_path, index=False)

print(f"Processed data saved to {scaled_path}")


Processed data saved to ../data/processed/scaled_data.csv
