In [1]:
# 1. Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

print("Libraries imported successfully.")

# 2. Load the Dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
print("Dataset loaded successfully.")

# 3. Preprocess the Data
# Drop customerID as it is not a predictive feature
df = df.drop('customerID', axis=1)

# Convert 'TotalCharges' to a numeric type, coercing errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Fill any resulting NaN values (for new customers) with the median
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Convert binary categorical variables to 0s and 1s for the model
# Using a loop for efficiency
for col in ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 1 if x == 'Yes' or x == 'Male' else 0)

# Use one-hot encoding for multi-category columns. This creates new columns
# for each category, which is essential for the model to understand them.
df = pd.get_dummies(df, columns=[
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod'
], drop_first=True)

print("Data preprocessing complete.")

# 4. Define Features (X) and Target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# 5. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Initialize and Train the Random Forest Model
# n_estimators=100 means the model is built from 100 decision trees
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete.")

# (Optional) Check the model's accuracy on the test data
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

# 7. Save the Trained Model and the Column List
# This is the most crucial step for our app
joblib.dump(model, 'churn_model.pkl')
model_columns = list(X.columns)
joblib.dump(model_columns, 'churn_model_columns.pkl')

print("Model and columns have been saved successfully!")
print("The files 'churn_model.pkl' and 'churn_model_columns.pkl' are now in your project folder.")

Libraries imported successfully.
Dataset loaded successfully.
Data preprocessing complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


Model training complete.
Model Accuracy: 0.79
Model and columns have been saved successfully!
The files 'churn_model.pkl' and 'churn_model_columns.pkl' are now in your project folder.


In [4]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the test data (if you haven't already)
y_pred = model.predict(X_test)

# Print the Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the full Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Churn', 'Churn']))
# (This code goes at the end of your evaluation cell in the notebook)

report = classification_report(y_test, y_pred, target_names=['Not Churn', 'Churn'])

# Save the report to a text file
with open("classification_report.txt", "w") as file:
    file.write(report)

print("\nClassification report saved to 'classification_report.txt'")

Confusion Matrix:
[[945  91]
 [203 170]]

Classification Report:
              precision    recall  f1-score   support

   Not Churn       0.82      0.91      0.87      1036
       Churn       0.65      0.46      0.54       373

    accuracy                           0.79      1409
   macro avg       0.74      0.68      0.70      1409
weighted avg       0.78      0.79      0.78      1409


Classification report saved to 'classification_report.txt'
