In [69]:
import pandas as pd

In [70]:
df = pd.read_csv('churn.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [71]:
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [72]:
df.info(
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [73]:
for column in df.select_dtypes(include=['object']).columns:
    print(f"Unique values in '{column}': {df[column].unique()}")

Unique values in 'Geography': ['France' 'Spain' 'Germany']
Unique values in 'Gender': ['Female' 'Male']


In [74]:
df.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [75]:
df.duplicated().sum()

np.int64(0)

In [76]:
df['Gender'] = df['Gender'].map({'Male':1,'Female':0})

In [77]:
dummy_df = pd.get_dummies(df['Geography'], drop_first=True,dtype=int)
df = pd.concat([df, dummy_df], axis=1)
df = df.drop(columns=['Geography'])

In [78]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,1


# Feature Engineering


In [None]:
# 1. Customer Value Score (if Balance and Salary present)
if 'Balance' in df.columns and 'Salary' in df.columns:
    df['Balance_to_Salary_Ratio'] = df['Balance'] / (df['Salary'] + 1)  # +1 to avoid division by zero
    df['Is_Wealthy'] = (df['Balance'] > df['Balance'].quantile(0.75)).astype(int)

# 2. Age Group Binning
if 'Age' in df.columns:
    df['Age_Group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 60, 100], labels=['18-30', '31-40', '41-50', '51-60', '60+'])
    df['Age_Group'] = pd.Categorical(df['Age_Group']).codes
    df['Is_Senior'] = (df['Age'] > 60).astype(int)

# 3. Tenure-based Features
if 'Tenure' in df.columns:
    df['Tenure_Group'] = pd.cut(df['Tenure'], bins=[0, 1, 3, 5, 10, float('inf')], labels=['New', 'Developing', 'Mature', 'Loyal', 'Very_Loyal'])
    df['Tenure_Group'] = pd.Categorical(df['Tenure_Group']).codes
    df['Is_New_Customer'] = (df['Tenure'] <= 1).astype(int)

# 4. Credit Score Risk
if 'CreditScore' in df.columns:
    df['Credit_Risk'] = pd.cut(df['CreditScore'], bins=[0, 400, 600, 750, 850, 1000], labels=['Very_High', 'High', 'Medium', 'Low', 'Very_Low'])
    df['Credit_Risk'] = pd.Categorical(df['Credit_Risk']).codes

print("Feature Engineering Applied!")
print(f"New features created. Shape: {df.shape}")
df.head()


In [None]:
# 5. Product Engagement Features (if product-related columns exist)
product_cols = [col for col in df.columns if 'Product' in col or 'Service' in col]
if product_cols:
    df['Num_Products'] = df[product_cols].sum(axis=1) if product_cols else 0
    df['Has_Multiple_Products'] = (df['Num_Products'] > 1).astype(int)

# 6. Customer Activity Interaction Features
if 'IsActiveMember' in df.columns and 'Tenure' in df.columns:
    df['Activity_Tenure_Score'] = df['IsActiveMember'] * df['Tenure']

# 7. Age and Product Interaction
if 'Age' in df.columns and 'Num_Products' in df.columns:
    df['Age_Product_Interaction'] = df['Age'] * df['Num_Products']

# 8. Balance Risk Score
if 'Balance' in df.columns:
    df['Zero_Balance'] = (df['Balance'] == 0).astype(int)
    df['Low_Balance'] = (df['Balance'] < df['Balance'].quantile(0.25)).astype(int)

# 9. Customer Segment Creation
if 'Age' in df.columns and 'Balance' in df.columns:
    df['Customer_Segment'] = pd.qcut(df['Balance'], q=3, labels=['Low_Value', 'Medium_Value', 'High_Value'], duplicates='drop')
    df['Customer_Segment'] = pd.Categorical(df['Customer_Segment']).codes

print("Advanced Feature Engineering Complete!")
print(f"Final dataset shape: {df.shape}")
print(f"\nNew columns added: {df.shape[1] - 10}")  # Approximate
df.head()


In [79]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


In [80]:
X = df.drop(columns=['Exited'])
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
lr_pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr_pipeline.fit(X_train, y_train)

y_pred = lr_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000


Confusion Matrix:
[[1543   64]
 [ 314   79]]


In [82]:

svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))
svm_pipeline.fit(X_train, y_train)

y_pred_svm = svm_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1607
           1       0.77      0.38      0.51       393

    accuracy                           0.86      2000
   macro avg       0.82      0.68      0.71      2000
weighted avg       0.85      0.86      0.84      2000


Confusion Matrix:
[[1562   45]
 [ 243  150]]


In [83]:
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))
rf_pipeline.fit(X_train, y_train)

y_pred_rf = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.76      0.46      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000


Confusion Matrix:
[[1551   56]
 [ 211  182]]


In [84]:
from sklearn.ensemble import GradientBoostingClassifier

gb_pipeline = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))
gb_pipeline.fit(X_train, y_train)

y_pred_gb = gb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_gb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.49      0.59       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000


Confusion Matrix:
[[1543   64]
 [ 201  192]]
