<a href="https://colab.research.google.com/github/Riju0045/ISI-codes/blob/main/churnmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE


In [None]:
# Location of the Excel workbook that holds the input data.
file_path = '/content/drive/MyDrive/internship project/final_data_updated.xlsx'

# Read the workbook into a DataFrame.
df = pd.read_excel(io=file_path)

# Preview the first five rows.
df.head(n=5)


In [None]:
# Flag a row as churned (1) when its refunded amount is positive, otherwise 0.
df['Churn'] = (df['Refunded Amount'] > 0).astype(int)

# Show how many rows fall into each class of the new target column.
df['Churn'].value_counts()


In [None]:
# Target vector: the binary churn flag.
y = df['Churn']

# Feature frame: everything except the target and the column it was derived
# from (keeping 'Refunded Amount' would leak the label into the features).
X = df.drop(columns=['Churn', 'Refunded Amount'])


In [None]:
# Columns that get expanded into one-hot indicator columns.
categorical_cols = ['Shipping Method', 'Payment Method', 'Source']

# Work on a copy so the feature frame X itself is left untouched.
X_selected = X.copy()

# One-hot encode the listed columns, then discard 'Vendor' when it exists
# (errors='ignore' makes the drop a no-op if the column is absent).
X_encoded = pd.get_dummies(X_selected, columns=categorical_cols).drop(
    columns=['Vendor'], errors='ignore'
)


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# SMOTE needs purely numeric input. Only three columns were one-hot encoded,
# so any other object-dtype (e.g. text) column is still present in X_encoded
# and would make fit_resample fail. Remove those columns before splitting so
# the oversampling step below works on the first run.
X = X_encoded.select_dtypes(exclude=['object'])
y = df['Churn']

# Hold out 20% of the rows for testing; stratify so both partitions keep the
# same churn / non-churn proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training partition only; the test partition is left as is.
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Compare the class counts before and after oversampling.
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_resampled.value_counts())


In [None]:
# Collect the labels of every object-dtype column left in the training
# features and report them.
object_frame = X_train.select_dtypes(include='object')
non_numeric = object_frame.columns
print("Non-numeric columns:", non_numeric)


In [None]:
# Remove the object-dtype columns from both partitions so they keep an
# identical set of feature columns.
X_train, X_test = (
    part.drop(columns=non_numeric) for part in (X_train, X_test)
)


In [None]:
# Re-run the oversampling on the current training features.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed, trained on
# the oversampled training data.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train_resampled, y=y_train_resampled)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Class predictions for the held-out test partition.
y_pred = model.predict(X_test)

# Print the confusion matrix first, then the per-class report.
for metric_fn in (confusion_matrix, classification_report):
    print(metric_fn(y_test, y_pred))


In [None]:
# Recompute the class predictions for the test partition.
y_pred = model.predict(X=X_test)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Compute both summaries up front, then print them under their headings.
cm = confusion_matrix(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

print("\n Classification Report:")
print(report_text)


In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Second column of predict_proba: the score used as the churn probability.
y_probs = model.predict_proba(X_test)[:, 1]

# False/true positive rates over all candidate thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# Draw the ROC curve together with the diagonal chance line.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label='ROC Curve')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('AUC-ROC Curve')
ax.legend()
ax.grid(True)
plt.show()

# Area under the ROC curve as a single summary number.
print("AUC Score:", roc_auc_score(y_test, y_probs))


In [None]:
import joblib

# Persist the fitted classifier to disk.
joblib.dump(value=model, filename='churn_model.pkl')


In [None]:
# Read the persisted classifier back from disk.
loaded_model = joblib.load(filename='churn_model.pkl')

# Predict on the test partition with the reloaded model.
y_pred_new = loaded_model.predict(X=X_test)


In [None]:
# Store the training feature names, in order, next to the saved model.
feature_columns = X_train.columns.tolist()
joblib.dump(value=feature_columns, filename='feature_columns.pkl')


In [None]:
from sklearn.metrics import classification_report, accuracy_score, recall_score

# Custom decision threshold, lower than the usual 0.5 cut-off.
threshold = 0.3

# Second column of predict_proba: the score used as the churn probability.
y_probs = model.predict_proba(X_test)[:, 1]

# Predict churn (1) whenever the score reaches the threshold, else 0.
y_pred_custom = np.where(y_probs >= threshold, 1, 0)

# Headline metrics at this threshold.
acc_at_threshold = accuracy_score(y_test, y_pred_custom)
recall_at_threshold = recall_score(y_test, y_pred_custom)
print(f" Accuracy: {acc_at_threshold:.4f}")
print(f" Recall (Churned customers): {recall_at_threshold:.4f}")

print("\n Classification Report:")
print(classification_report(y_test, y_pred_custom))



In [None]:
# Copy of the test features with the true label, the thresholded prediction
# and the predicted probability appended as extra columns.
X_test_with_preds = X_test.assign(
    Actual_Churn=y_test.values,
    Predicted_Churn=y_pred_custom,
    Churn_Probability=y_probs,
)


In [None]:
# Keep only the rows the model flags as churn and preview the first few.
is_flagged = X_test_with_preds['Predicted_Churn'].eq(1)
churned_customers = X_test_with_preds.loc[is_flagged]
print(" High-risk churn customers:\n", churned_customers.head())


In [None]:
from sklearn.metrics import accuracy_score, recall_score, classification_report

# Decision threshold for this run.
threshold = 0.2

# Score every test row, then label it churn (1) when the score reaches the
# threshold and 0 otherwise.
y_probs = model.predict_proba(X_test)[:, 1]
meets_threshold = y_probs >= threshold
y_pred_custom = meets_threshold.astype(int)

# Report accuracy, churn-class recall and the full per-class breakdown.
accuracy_value = accuracy_score(y_test, y_pred_custom)
recall_value = recall_score(y_test, y_pred_custom)
print(f" Accuracy: {accuracy_value:.4f}")
print(f" Recall (Churned customers): {recall_value:.4f}")
print("\n Classification Report:")
print(classification_report(y_test, y_pred_custom))


In [None]:
# Test features plus the true label, thresholded prediction and probability.
X_test_with_preds = X_test.assign(
    Actual_Churn=y_test.values,
    Predicted_Churn=y_pred_custom,
    Churn_Probability=y_probs,
)

# Write the table to CSV without the index column and confirm on stdout.
X_test_with_preds.to_csv(
    path_or_buf='churn_predictions_threshold_0.2.csv', index=False
)
print(" Saved: churn_predictions_threshold_0.2.csv")
