**1. Importing the dependencies**

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

**2. Data Loading and Understanding**

In [None]:
# load the csv data to a pandas dataframe
# NOTE: this is an absolute, machine-specific Windows path; point it at your own copy of
# Telecom_churn.csv before running the notebook.
df = pd.read_csv(r"D:\practice dataset\Telecom_churn.csv")

# New Section

In [None]:
df.shape

In [None]:
df.head()

In [None]:
pd.set_option("display.max_columns", None)

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# dropping customerID column as this is not required for modelling
df = df.drop(columns=["customerID"])

In [None]:
df.head(2)

In [None]:
df.columns

In [None]:
print(df["gender"].unique())

In [None]:
print(df["SeniorCitizen"].unique())

In [None]:
# show the distinct values of every column except the continuous numerical ones

numerical_features_list = ["tenure", "MonthlyCharges", "TotalCharges"]

separator = "-" * 50
for col in df.columns:
  # the numerical columns have too many distinct values to be worth listing
  if col in numerical_features_list:
    continue
  print(col, df[col].unique())
  print(separator)

In [None]:
print(df.isnull().sum())

In [None]:
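# NOTE: kept disabled on purpose - presumably the direct cast fails while TotalCharges still
# contains blank strings (" "); those are replaced further down before the cast is done.
#df["TotalCharges"] = df["TotalCharges"].astype(float)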

In [None]:
df[df["TotalCharges"]==" "]

In [None]:
len(df[df["TotalCharges"]==" "])

In [None]:
# blank-string entries (" ") cannot be cast to float, so replace them with the string "0.0" first
df["TotalCharges"] = df["TotalCharges"].replace({" ": "0.0"})

In [None]:
# cast the column to float now that the blank entries have been replaced
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [None]:
df.info()

In [None]:
# checking the class distribution of target column
print(df["Churn"].value_counts())

**Insights:**
1. Customer ID removed as it is not required for modelling
2. No null (NaN) values were reported by `isnull()` in any column
3. Blank-string entries (" ") in the TotalCharges column were treated as missing and replaced with 0
4. Class imbalance identified in the target

**3. Exploratory Data Analysis (EDA)**

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.head(2)

In [None]:
df.describe()

**Numerical Features - Analysis**

Understand the distribution of the numerical features.

In [None]:
def plot_histogram(df, column_name):
  """Plot a histogram (with KDE curve) of one column and mark its mean and median.

  Args:
    df: table-like object indexable by column name; the selected column must
      provide ``.mean()`` and ``.median()``.
    column_name: name of the column to plot; also used in the figure title.

  Returns:
    None. The figure is rendered through ``plt.show()``.
  """
  values = df[column_name]

  plt.figure(figsize=(5, 3))
  sns.histplot(values, kde=True)
  plt.title(f"Distribution of {column_name}")

  # vertical reference lines: (x position, colour, line style, legend label)
  reference_lines = (
    (values.mean(), "red", "--", "Mean"),
    (values.median(), "green", "-", "Median"),
  )
  for position, colour, style, label in reference_lines:
    plt.axvline(position, color=colour, linestyle=style, label=label)

  plt.legend()

  plt.show()

In [None]:
plot_histogram(df, "tenure")

In [None]:
plot_histogram(df, "MonthlyCharges")

In [None]:
plot_histogram(df, "TotalCharges")

**Box plot for numerical features**

In [None]:
def plot_boxplot(df, column_name):
  """Draw a vertical box plot of one column.

  Args:
    df: table-like object indexable by column name.
    column_name: name of the column to plot; also used for the title and
      the y-axis label.

  Returns:
    None. The figure is rendered through ``plt.show()``.
  """
  plt.figure(figsize=(5, 3))
  sns.boxplot(y=df[column_name])
  plt.title(f"Box Plot of {column_name}")
  plt.ylabel(column_name)
  # the original referenced `plt.show` without calling it, so the figure was never
  # explicitly rendered
  plt.show()

In [None]:
plot_boxplot(df, "tenure")

In [None]:
plot_boxplot(df, "MonthlyCharges")

In [None]:
plot_boxplot(df, "TotalCharges")

**Correlation Heatmap for numerical columns**

In [None]:
# correlation matrix - heatmap
plt.figure(figsize=(8, 4))
sns.heatmap(df[["tenure", "MonthlyCharges", "TotalCharges"]].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

Categorical features - Analysis

In [None]:
df.columns

In [None]:
df.info()

Countplot for categorical columns

In [None]:
# columns to draw count plots for: SeniorCitizen (added by hand because it is not picked up
# by the object-dtype filter) followed by every object-dtype column
object_cols = ["SeniorCitizen", *df.select_dtypes(include="object").columns]

# one small count plot per categorical column
for col in object_cols:
  plt.figure(figsize=(5, 3))
  sns.countplot(x=df[col])
  plt.title(f"Count Plot of {col}")
  plt.show()

**4. Data Preprocessing**

In [None]:
df.head(3)

Label encoding of target column

In [None]:
# Encode the target as integers: Yes -> 1, No -> 0.
# `map` + `astype(int)` is used instead of `replace` because:
#   * `replace` relies on pandas silently downcasting the object column to int, which is
#     deprecated (FutureWarning in pandas 2.2); if the column stays object-typed it would be
#     picked up by the object-dtype label-encoding step below.
#   * any label other than "Yes"/"No" becomes NaN and makes `astype(int)` raise, instead of
#     slipping through unchanged. This also means re-running the cell on already-encoded data
#     fails loudly rather than silently doing nothing.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0}).astype(int)

In [None]:
df.head(3)

In [None]:
print(df["Churn"].value_counts())

Label encoding of categorical features

In [None]:
# identifying columns with object data type
object_columns = df.select_dtypes(include="object").columns

In [None]:
print(object_columns)

In [None]:
# one fitted LabelEncoder per categorical column, keyed by column name
encoders = {}

# fit each encoder on its column, overwrite the column with the integer codes,
# and keep the fitted encoder so the same mapping can be reused at prediction time
for column in object_columns:
  fitted_encoder = LabelEncoder().fit(df[column])
  df[column] = fitted_encoder.transform(df[column])
  encoders[column] = fitted_encoder


# persist the encoders to a pickle file
with open("encoders.pkl", "wb") as encoder_file:
  pickle.dump(encoders, encoder_file)


In [None]:
encoders

In [None]:
df.head()

**Training and test data split**

In [None]:
# splitting the features and target
X = df.drop(columns=["Churn"])
y = df["Churn"]

In [None]:
# split training and test data
# stratify on the target so both splits keep the same churn / no-churn ratio; the classes
# are imbalanced, and an unstratified 20% test split can drift from the true class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
print(y_train.shape)

In [None]:
print(y_train.value_counts())

Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
smote = SMOTE(random_state=42)

In [None]:
# oversample using the training split only; X_test / y_test are never passed to SMOTE
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
print(y_train_smote.shape)

In [None]:
print(y_train_smote.value_counts())

**5. Model Training**

Training with default hyperparameters

In [None]:
# dictionary of models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [None]:
# dictionary to store the cross validation results
cv_scores = {}

# perform 5-fold cross validation for each model
# SMOTE is placed inside an imblearn pipeline and the un-resampled training split is passed in,
# so oversampling is fitted on the training folds only. Cross-validating on data that was
# oversampled beforehand puts synthetic points (interpolated from training rows) into the
# validation folds and inflates the scores.
for model_name, model in models.items():
  print(f"Training {model_name} with default parameters")
  cv_pipeline = ImbPipeline(steps=[
      ("smote", SMOTE(random_state=42)),
      ("model", model),
  ])
  scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring="accuracy")
  cv_scores[model_name] = scores
  print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
  print("-"*70)

In [None]:
cv_scores

Random Forest gives the highest accuracy compared to other models with default parameters

In [None]:
# Hyperparameter search space for the random forest.
# Keys carry the "model__" prefix because the forest is the "model" step of the pipeline below.
param_dist = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__max_depth': [None, 5, 10, 15, 20, 30],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 8],
    'model__max_features': ['sqrt', 'log2', None],
    'model__bootstrap': [True, False]
}

# SMOTE runs as a pipeline step so that, within each CV split, it is fitted on the training
# folds only; searching on pre-oversampled data would leak synthetic samples into the
# validation folds and bias the selected hyperparameters.
search_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("model", RandomForestClassifier(random_state=42)),
])

# RandomizedSearchCV for efficiency
rfc_random = RandomizedSearchCV(
    estimator=search_pipeline,
    param_distributions=param_dist,
    n_iter=50,                # number of random combos to try
    cv=5,                     # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,                # use all CPU cores
    scoring='f1'              # or 'roc_auc' / 'accuracy' depending on your problem
)

# Fit on the original (un-resampled) training split; the pipeline applies SMOTE itself
rfc_random.fit(X_train, y_train)

# Best parameters (reported with the "model__" prefix)
print("Best Parameters:", rfc_random.best_params_)

# Final optimized model: the forest refitted on the SMOTE-resampled full training split.
# Only the classifier step is extracted, so `rfc` is still a plain RandomForestClassifier.
rfc = rfc_random.best_estimator_.named_steps["model"]


In [None]:
print(y_test.value_counts())

**6. Model Evaluation**

In [None]:

# evaluate on test data
y_test_pred = rfc.predict(X_test)

# compute the confusion matrix once and reuse it for both the printout and the heatmap
cm = confusion_matrix(y_test, y_test_pred)

# Metrics
print("Accuracy Score:\n", accuracy_score(y_test, y_test_pred))
# 'weighted' averages the per-class F1 by class support, so the majority class dominates it
print("F1 Score:\n", f1_score(y_test, y_test_pred, average="weighted"))
# F1 of the positive (churn = 1) class - the metric the hyperparameter search optimised
print("F1 Score (churn class):\n", f1_score(y_test, y_test_pred))
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(y_test, y_test_pred))

# Visualization: Confusion Matrix Heatmap
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=["Predicted 0", "Predicted 1"],
            yticklabels=["Actual 0", "Actual 1"])
plt.title("Confusion Matrix Heatmap")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# bundle the tuned model with the training feature names and pickle the bundle
model_data = dict(model=rfc, features_names=list(X.columns))


with open("customer_churn_model.pkl", "wb") as model_file:
  pickle.dump(model_data, model_file)

**7. Load the saved model and build a Predictive System**

In [None]:
# load the saved model and the feature names
# NOTE: pickle.load can execute arbitrary code - only load pickle files you created or trust

with open("customer_churn_model.pkl", "rb") as f:
  model_data = pickle.load(f)

# the pickle holds a dict with the estimator under "model" and the training column
# names under "features_names"
loaded_model = model_data["model"]
feature_names = model_data["features_names"]

In [None]:
print(loaded_model)

In [None]:
print(feature_names)

In [None]:
# a single example customer, using the raw (un-encoded) category labels
input_data = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}


# Select and order the columns by the feature names saved with the model. The model expects
# the training column order, and a missing feature now raises a KeyError here instead of
# depending on how the dict above happens to be written.
input_data_df = pd.DataFrame([input_data])[feature_names]

# NOTE: pickle.load can execute arbitrary code - only load pickle files you created or trust
with open("encoders.pkl", "rb") as f:
  encoders = pickle.load(f)


# encode categorical features using the saved encoders
for column, encoder in encoders.items():
  input_data_df[column] = encoder.transform(input_data_df[column])

# make a prediction
prediction = loaded_model.predict(input_data_df)
pred_prob = loaded_model.predict_proba(input_data_df)

print(prediction)

# results
print(f"Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Prediction Probability: {pred_prob}")

In [None]:
encoders