In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("Preprocessed_Telco_Churn.csv")
print("‚úÖ Dataset Loaded:", df.shape)
df.head()

‚úÖ Dataset Loaded: (7043, 26)


Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,Churn,Column1,gender_Male,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,0,-1.277445,0,No phone service,No,Yes,No,...,0,,False,False,False,False,False,False,True,False
1,5575-GNVDE,0,0,0,0.066327,1,No,Yes,No,Yes,...,0,,True,False,False,True,False,False,False,True
2,3668-QPYBK,0,0,0,-1.236724,1,No,Yes,Yes,No,...,1,,True,False,False,False,False,False,False,True
3,7795-CFOCW,0,0,0,0.514251,0,No phone service,Yes,No,Yes,...,0,,True,False,False,True,False,False,False,False
4,9237-HQITU,0,0,0,-1.236724,1,No,No,No,No,...,1,,False,True,False,False,False,False,True,False


In [3]:
# missing values
print("üîç Missing Values:")
print(df.isna().sum().sort_values(ascending=False).head(10))

üîç Missing Values:
Column1           7043
customerID           0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
OnlineSecurity       0
OnlineBackup         0
SeniorCitizen        0
dtype: int64


In [4]:
#data types
print("\nüìä Data Types:")
print(df.dtypes.head(10))


üìä Data Types:
customerID           object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure              float64
PhoneService          int64
MultipleLines        object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
dtype: object


In [5]:
# Encode all categorical features
le = LabelEncoder()
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

print("‚úÖ All categorical columns encoded successfully!")

‚úÖ All categorical columns encoded successfully!


In [6]:
df.head()

Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,Churn,Column1,gender_Male,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,5375,0,1,0,-1.277445,0,1,0,2,0,...,0,,False,False,False,False,False,False,True,False
1,3962,0,0,0,0.066327,1,0,2,0,2,...,0,,True,False,False,True,False,False,False,True
2,2564,0,0,0,-1.236724,1,0,2,2,0,...,1,,True,False,False,False,False,False,False,True
3,5535,0,0,0,0.514251,0,1,2,0,2,...,0,,True,False,False,True,False,False,False,False
4,6511,0,0,0,-1.236724,1,0,0,0,0,...,1,,False,True,False,False,False,False,True,False


In [7]:
# Define features and target
x = df.drop(columns=['Churn', 'customerID'], errors='ignore')
y = df['Churn']

# Train Random Forest for feature importance
rf = RandomForestClassifier(random_state=42)
rf.fit(x, y)

# Compute feature importances
importances = pd.Series(rf.feature_importances_, index=x.columns).sort_values(ascending=False)
top_features = importances.head(10)

# Display top features
print("Top Features Influencing Churn:")
print(top_features)


Top Features Influencing Churn:
TotalCharges                      0.194886
MonthlyCharges                    0.169107
tenure                            0.165403
OnlineSecurity                    0.045199
TechSupport                       0.040564
InternetService_Fiber optic       0.038514
PaymentMethod_Electronic check    0.033012
Contract_Two year                 0.029331
gender_Male                       0.028432
PaperlessBilling                  0.025533
dtype: float64


In [8]:
# Estimate Customer Lifetime Value (LTV)
df['LTV'] = df['MonthlyCharges'] * df['tenure']

# Define high-value and at-risk segments
high_value = df['LTV'] > df['LTV'].quantile(0.75)
at_risk = df['Churn'] == 1

# Filter high-value customers who are also churning
high_value_risk = df[high_value & at_risk]

print("High-Value Customers at Risk of Churn: {len(high_value_risk)}")
display(high_value_risk[['customerID', 'tenure', 'MonthlyCharges', 'LTV']].head(10))


High-Value Customers at Risk of Churn: {len(high_value_risk)}


Unnamed: 0,customerID,tenure,MonthlyCharges,LTV
20,6207,-1.277445,-0.834611,1.066169
22,723,-1.277445,-1.482712,1.894082
27,6119,-1.277445,-1.14869,1.467388
97,173,-1.114563,-1.452799,1.619236
104,2233,1.45082,1.503538,2.181362
110,327,0.921455,1.063161,0.979656
133,2158,-1.277445,-0.974202,1.244489
180,4476,-1.277445,-1.156999,1.478003
185,692,-1.277445,-1.328164,1.696656
252,6700,-1.277445,-0.816331,1.042818
