In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
# Step 2: Read the dataset written by the earlier cleaning stage (assumed to be cleaned already)
# NOTE(review): the missing-value report further down shows 3877 NaNs in 'Offer'. pandas'
# default NA parsing turns literal strings such as "None" or "NA" into NaN — confirm these
# are genuine gaps and not a category value swallowed at read time.
cleaned_data_path = "../data/processed/cleaned_data.csv"
data_cleaned = pd.read_csv(cleaned_data_path)


In [4]:
# Normalize the header: remove leading/trailing whitespace from every column name
data_cleaned.columns = [column_name.strip() for column_name in data_cleaned.columns]


In [6]:
# Step 3: Label-encode the categorical columns
# The list intentionally excludes 'Partner'.
categorical_cols = ['Gender', 'Senior Citizen', 'Married', 'Phone Service', 'Internet Service']  # Updated list

# Restrict to the columns that actually exist so a missing column neither breaks the
# encoding loop nor the preview below.
encoded_cols = [col for col in categorical_cols if col in data_cleaned.columns]

# Create the encoder inside this cell: no earlier cell defines `label_enc`, so relying on it
# raised NameError on a fresh kernel (Restart & Run All).
# One fitted encoder is kept per column so the integer codes can be mapped back later with
# label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in encoded_cols:
    label_enc = LabelEncoder()
    data_cleaned[col] = label_enc.fit_transform(data_cleaned[col])
    label_encoders[col] = label_enc

print("Encoded Categorical Variables:")
print(data_cleaned[encoded_cols])  # Print the encoded columns


Encoded Categorical Variables:
      Gender  Senior Citizen  Married  Phone Service  Internet Service
0          1               1        0              0                 1
1          0               1        1              1                 1
2          1               1        0              1                 1
3          0               1        1              1                 1
4          0               1        1              1                 1
...      ...             ...      ...            ...               ...
7038       0               0        0              1                 0
7039       1               0        1              1                 1
7040       0               0        1              1                 1
7041       0               0        1              0                 1
7042       1               0        0              1                 1

[7043 rows x 5 columns]


In [8]:
# Step 4: Handle missing values before scaling
# NOTE(review): the label-encoding cell above has already run on the categorical columns,
# so this imputation happens after encoding rather than before it — confirm the intended order.

# Count the missing entries in every column
missing_values = data_cleaned.isnull().sum()
print(f"Missing Values in Each Column:\n{missing_values}")

# Numerical columns: fill gaps (if any) with the column median
for num_col in ('Total Charges', 'Monthly Charge', 'Tenure in Months'):
    column_median = data_cleaned[num_col].median()
    data_cleaned[num_col] = data_cleaned[num_col].fillna(column_median)

# Categorical columns: fill gaps (if any) with the most frequent value (mode)
categorical_cols_with_missing = ['Gender', 'Senior Citizen', 'Married', 'Phone Service']
for col in categorical_cols_with_missing:
    most_frequent = data_cleaned[col].mode()[0]
    data_cleaned[col] = data_cleaned[col].fillna(most_frequent)

# Re-count after imputation. Only the seven columns listed above are filled, so any other
# column with gaps (e.g. 'Offer' in the first report) still shows a non-zero count here.
missing_values_after_imputation = data_cleaned.isnull().sum()
print(f"Missing Values After Imputation:\n{missing_values_after_imputation}")


Missing Values in Each Column:
Customer ID                             0
Gender                                  0
Age                                     0
Under 30                                0
Senior Citizen                          0
Married                                 0
Dependents                              0
Number of Dependents                    0
Country                                 0
State                                   0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Population                              0
Referred a Friend                       0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                        0
Int

In [9]:
# Step 5: Standardize the numerical variables with StandardScaler
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split performed
# later in the notebook, so test rows influence the learned mean/std. Consider fitting on the
# training split only.
numerical_cols = ['Tenure in Months', 'Monthly Charge', 'Total Charges']

# Learn the per-column statistics first, then overwrite the columns with their scaled values
scaler = StandardScaler()
scaler.fit(data_cleaned[numerical_cols])
data_cleaned[numerical_cols] = scaler.transform(data_cleaned[numerical_cols])

# Preview the first rows of the scaled columns
print("Scaled Numerical Variables:")
print(data_cleaned[numerical_cols].head())


Scaled Numerical Variables:
   Tenure in Months  Monthly Charge  Total Charges
0         -1.278988       -0.834611      -0.988823
1         -0.993743        0.528063      -0.726848
2         -0.586250        1.019955      -0.232929
3         -0.301005        1.121324       0.103315
4          0.187986        0.390134       0.259379


In [10]:
# Step 6: Feature Engineering - Create additional features (optional but can be useful)
# 'Total Spend' = 'Tenure in Months' x 'Monthly Charge'.
# Both inputs were standardized in place by `scaler` in the previous step, so multiplying the
# current column values would multiply z-scores (two below-average values would yield a
# positive "spend"). Undo the scaling on a temporary frame and build the feature from the
# original units instead; the scaled columns in data_cleaned are left untouched.
raw_numerical = pd.DataFrame(
    scaler.inverse_transform(data_cleaned[numerical_cols]),
    columns=numerical_cols,
    index=data_cleaned.index,
)
# NOTE(review): 'Total Spend' is stored in raw units (not standardized) — scale it separately
# if the downstream model needs comparable feature magnitudes.
data_cleaned['Total Spend'] = raw_numerical['Tenure in Months'] * raw_numerical['Monthly Charge']


In [11]:
# Step 7: Separate the feature matrix (X) from the target vector (y)
# Assuming the target column is 'Churn Label'
# NOTE(review): X keeps every other column, including the 'Customer ID' identifier and
# unencoded text columns such as 'Country' and 'State' — confirm these are wanted as features.
target_col = 'Churn Label'
X = data_cleaned.drop(columns=[target_col])
y = data_cleaned[target_col]


In [12]:
# Step 7 (repeated): rebuild the features (X) and the target (y)
# Assuming the target column is 'Churn Label'
# NOTE(review): this cell recomputes exactly what the preceding cell already produced; it can
# be removed without changing any result.
X, y = data_cleaned.drop(columns=['Churn Label']), data_cleaned['Churn Label']


In [13]:
# Step 8: Hold out 20% of the rows for testing; the remaining 80% is the training set
# random_state is fixed so the shuffle is reproducible between runs.
# NOTE(review): the split is not stratified on y — if the churn classes are imbalanced,
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Step 9: Persist the processed and scaled dataset as CSV
scaled_data_path = "../data/processed/scaled_data.csv"
data_cleaned.to_csv(path_or_buf=scaled_data_path, index=False)  # index=False: do not write the row index


In [15]:
# Step 10: Write the four train/test pieces to disk (optional, for reuse in later notebooks)
train_data_path = "../data/processed/train_data.csv"
test_data_path = "../data/processed/test_data.csv"

# (object to save, destination) pairs, written in this order without the row index
split_outputs = [
    (X_train, train_data_path),
    (X_test, test_data_path),
    (y_train, "../data/processed/y_train.csv"),
    (y_test, "../data/processed/y_test.csv"),
]
for split_part, split_path in split_outputs:
    split_part.to_csv(split_path, index=False)


In [16]:
# Step 11: Confirm where the outputs were written and preview the processed data
print(
    f"\nProcessed data saved to {scaled_data_path}",
    f"Training and Testing data saved to {train_data_path} and {test_data_path}",
    "\nSample of the final processed data:",
    sep="\n",
)
print(data_cleaned.head())



Processed data saved to ../data/processed/scaled_data.csv
Training and Testing data saved to ../data/processed/train_data.csv and ../data/processed/test_data.csv

Sample of the final processed data:
  Customer ID  Gender  Age Under 30  Senior Citizen  Married Dependents  \
0  8779-QRDMV       1   78       No               1        0         No   
1  7495-OOKFY       0   74       No               1        1        Yes   
2  1658-BYGOY       1   71       No               1        0        Yes   
3  4598-XLKNJ       0   78       No               1        1        Yes   
4  4846-WHAFZ       0   80       No               1        1        Yes   

   Number of Dependents        Country       State  ...  \
0                     0  United States  California  ...   
1                     1  United States  California  ...   
2                     3  United States  California  ...   
3                     1  United States  California  ...   
4                     1  United States  California  ..