### Step 1: Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
url = './churn.csv'
df = pd.read_csv(url)

print("Dataset loaded successfully.")
df.head()

Dataset loaded successfully.


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Step 2: Data Preprocessing

In [2]:
# Drop customerID as it's just an identifier
df_processed = df.drop('customerID', axis=1)

# Convert TotalCharges to numeric, coercing errors to NaN
df_processed['TotalCharges'] = pd.to_numeric(df_processed['TotalCharges'], errors='coerce')
# Impute missing values with the median
df_processed['TotalCharges'].fillna(df_processed['TotalCharges'].median(), inplace=True)

# Ensure SeniorCitizen is treated as a categorical feature
df_processed['SeniorCitizen'] = df_processed['SeniorCitizen'].astype(str)

# Define features (X) and target (y)
X = df_processed.drop('Churn', axis=1)
y = df_processed['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

print("Preprocessing complete. Feature and target variables are ready.")
X.info()

Preprocessing complete. Feature and target variables are ready.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 no

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['TotalCharges'].fillna(df_processed['TotalCharges'].median(), inplace=True)


### Step 3: Define Preprocessing Pipeline


In [3]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=np.number).columns

# Create the preprocessing pipeline for scaling numerical and encoding categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

print("Preprocessing pipeline created.")

Preprocessing pipeline created.


In [4]:
### Step 4: Split Data, Train Model, and Build Final Pipeline

In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Create the full pipeline by combining the preprocessor and the model
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Train the model
print("Training the Random Forest model...")
full_pipeline.fit(X_train, y_train)
print("Model training complete.")

Training the Random Forest model...
Model training complete.


### Step 5: Evaluate the Model

In [6]:
y_pred = full_pipeline.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.7892

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.70      1409
weighted avg       0.78      0.79      0.78      1409



### Step 6: Save the Final Model Pipeline

This is the most important step. We save the entire pipeline (preprocessor + model) to a single file. This ensures that the same transformations are applied to new data in our app.

In [7]:
model_filename = 'churn_model.joblib'
joblib.dump(full_pipeline, model_filename)
print(f"Model pipeline saved successfully to {model_filename}")

Model pipeline saved successfully to churn_model.joblib


In [9]:
!pip install pymongo dnspython

Collecting pymongo
  Using cached pymongo-4.14.1-cp313-cp313-win_amd64.whl.metadata (22 kB)
Collecting dnspython
  Using cached dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Using cached pymongo-4.14.1-cp313-cp313-win_amd64.whl (956 kB)
Using cached dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, pymongo

   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   ---------------------------------------- 0/2 [dnspython]
   -------------------- ------------------- 1/2 [pymongo]
   -------------------- ------------------- 1/2 [pymon

In [11]:
# Import the necessary library
from pymongo import MongoClient

# Paste your connection string here# Paste the string you copied from Atlas here
MONGO_URI = "mongodb://localhost:27017/"
# MONGO_URI = "mongodb://localhost:27017/"

# Create a connection client
try:
    client = MongoClient(MONGO_URI)
    # The ping command is a simple way to test if the connection is successful
    client.admin.command('ping')
    print("✅ Connection to MongoDB successful!")
except Exception as e:
    print(f"❌ Error connecting to MongoDB: {e}")

✅ Connection to MongoDB successful!
