# 📊 Customer Churn Prediction Project
**Author:** Priyasha Nandy  
**Skills:** Python, SQL, Machine Learning

---
### Objective
Predict whether a customer will churn (leave) based on demographics, contract type, and usage behavior.


In [ ]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [ ]:
# Load dataset
# Read the churn CSV (expected in the current working directory) into `df`,
# report its (rows, columns) shape, and preview the first five rows.
csv_path = 'Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
print('Dataset shape:', df.shape)
df.head(5)

In [ ]:
# Data Cleaning
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN
# (errors='coerce') so that the dropna() below removes those rows.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Replace the column, drop every row containing a missing value, and remove
# the customerID column so it is not used as a feature.
df = (
    df.assign(TotalCharges=total_charges)
      .dropna()
      .drop(columns='customerID')
)

# Encode the Churn target as integer class labels.
le = LabelEncoder()
le.fit(df['Churn'])
df['Churn'] = le.transform(df['Churn'])

# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each so the indicator columns are not redundant.
df = pd.get_dummies(df, drop_first=True)
df.shape

In [ ]:
# Split data
# Separate the feature matrix (every column except the target) from the
# Churn target column.
X = df.drop('Churn', axis=1)
y = df['Churn']

# Hold out 20% of the rows for testing. random_state fixes the shuffle so the
# split is reproducible; stratify=y keeps the proportion of each Churn class
# the same in the training and test sets, so the evaluation is not skewed by
# an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)

In [ ]:
# Scale features
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [ ]:
# Model Training
# Fit both classifiers on the scaled training data (fit() returns the fitted
# estimator, so construction and fitting can be chained).
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict class labels for the held-out test split with each model.
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [ ]:
# Evaluation
# Test-set accuracy for each model.
acc_lr = accuracy_score(y_test, y_pred_lr)
acc_rf = accuracy_score(y_test, y_pred_rf)
print('Logistic Regression Accuracy:', acc_lr)
print('Random Forest Accuracy:', acc_rf)

# Per-class precision / recall / F1 for the random forest.
rf_report = classification_report(y_test, y_pred_rf)
print('\nClassification Report (Random Forest):\n', rf_report)

# Confusion matrix for the random forest, drawn as an annotated heatmap
# (rows are actual labels, columns are predicted labels).
rf_cm = confusion_matrix(y_test, y_pred_rf)
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix - Random Forest')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [ ]:
# Feature Importance
# Pair each feature column name with the random forest's importance score,
# then keep the ten highest-scoring features.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_,
})
feat_importances = importance_table.sort_values(by='Importance', ascending=False).head(10)

# Horizontal bar chart of the top ten features.
fi_fig, fi_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=feat_importances, x='Importance', y='Feature', ax=fi_ax)
fi_ax.set_title('Top 10 Important Features')
plt.show()

### 🧠 Insights Summary
1. Contract type, tenure, and monthly charges are top indicators of churn.
2. Customers on month-to-month contracts with higher charges tend to leave sooner.
3. Random Forest accuracy on the held-out 20% test set: around **80–85%**.
