In [4]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load and preview data
df = pd.read_csv(r"C:\Users\Admin\OneDrive\Documents\Data Science Portfolio\Customer Churn\Data\Telco-Customer-Churn.csv")
print("Data Shape: ", df.shape)
print(df.head())

Data Shape:  (7043, 21)
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMov

In [6]:
# Clean missing values
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors = 'coerce')
df.dropna(inplace = True)

#Rename columns to fintech context
df.rename(columns = {
    "tenure": "months_active",
    "tenure": "months_active",
    "MonthlyCharges": "monthly_spend",
    "TotalCharges": "total_transaction_value",
    "Contract": "subscription_type",
    "PhoneService": "has_cash_card",
    "InternetService": "digital_access_level",
    "Churn": "churned"
}, inplace = True)

# Encode churn column to binary
df["churned"] = df["churned"].map({"Yes": 1, "No": 0})

#Preview cleaned data
print("Cleaned Dataset Preview:")
display(df.head())

Cleaned Dataset Preview:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,months_active,has_cash_card,MultipleLines,digital_access_level,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,subscription_type,PaperlessBilling,PaymentMethod,monthly_spend,total_transaction_value,churned
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [10]:
# Encode all categorical variables
cat_cols = df.select_dtypes(include = "object").columns.tolist()
cat_cols.remove("customerID") # exclude ID columns if present

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

#Create new feature - average spend per month
df["avg_spend_per_month"] = df["total_transaction_value"] / df["months_active"]
df["avg_spend_per_month"].fillna(0, inplace = True)

#Scale numerical features
scaler = StandardScaler()
# Fix: Use a list for multiple columns instead of tuple syntax
num_cols = ["total_transaction_value", "months_active", "avg_spend_per_month"]
df[num_cols] = scaler.fit_transform(df[num_cols])

#Save processed data
df.to_csv(r"C:\Users\Admin\OneDrive\Documents\Data Science Portfolio\Customer Churn\Data\Processed Fintech Churn.csv")

print("Feature engineering complete. Data saved for modeling.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["avg_spend_per_month"].fillna(0, inplace = True)


Feature engineering complete. Data saved for modeling.
