In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [5]:
file_path = 'abc.xlsx'  
data = pd.read_excel(file_path)

In [7]:
data = data.drop(['customer_id', 'phone_no'], axis=1)

# Encode categorical columns (e.g., 'gender', 'multi_screen', 'mail_subscribed')
label_encoders = {}
for column in ['gender', 'multi_screen', 'mail_subscribed']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [9]:
# Define features (X) and target variable (y)
X = data.drop('churn', axis=1)
y = data['churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Initialize the Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_clf.fit(X_train, y_train)

# Make predictions
y_pred = rf_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [13]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.91
Precision: 0.63
Recall: 0.55
F1 Score: 0.59


In [15]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[338  15]
 [ 21  26]]


In [17]:
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       353
           1       0.63      0.55      0.59        47

    accuracy                           0.91       400
   macro avg       0.79      0.76      0.77       400
weighted avg       0.91      0.91      0.91       400



In [19]:
# Feature Importance (Optional)
importances = rf_clf.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(feature_importance)


Feature Importances:
                   Feature  Importance
6      weekly_mins_watched    0.170665
8       maximum_daily_mins    0.157624
12  customer_support_calls    0.128985
7       minimum_daily_mins    0.095286
4             multi_screen    0.083825
10          videos_watched    0.076954
3    no_of_days_subscribed    0.076036
2                      age    0.072979
9    weekly_max_night_mins    0.069360
5          mail_subscribed    0.032804
11   maximum_days_inactive    0.021935
1                   gender    0.013547
0                     year    0.000000
