# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np

import plotly.express as xp
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

import pickle

# **Load Data**

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df

# **Data Preprocessing**

In [3]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are handled by the missing-value step further down.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric.astype(float)

In [4]:
df.info()

In [5]:
df.describe()

In [6]:
df.describe(include=[object])

## Handling Missing Values

In [7]:
# Drop customerID so it does not end up in the feature matrix built from df.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

In [8]:
df

In [9]:
df.isnull().sum()

In [10]:
# Remove every row that still contains a missing value in any column.
df = df.dropna()

In [11]:
df.shape

In [12]:
df.duplicated().sum()

## Data separation as X and y

In [13]:
# Encode the target as 1 (churned) / 0 (stayed).
# The mapping also accepts 1/0 so the cell still works if Churn has already
# been converted to numbers by a later cell. Any other value becomes NaN and
# trips the assertion, instead of being silently counted as "no churn" the way
# `1 if x == 'Yes' else 0` did.
y = df['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
assert y.notna().all(), "Churn contains values other than 'Yes'/'No' (or 1/0)"
y = y.astype(int)
y.value_counts()

In [14]:
# Feature matrix: every column of df except the target.
X = df.drop(columns=['Churn'])
X

## Data Splitting

In [15]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the churn / no-churn ratio identical in both splits, which
# matches the stratified folds used by the cross-validation helper below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
X_train

In [17]:
X_test

## Validate Categorical Data

In [18]:
def validate_test_data_categorical_columns(train_df, test_df):
    """Print whether train and test agree on categorical columns and their values.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        The two frames to compare. Columns of dtype ``object`` or ``category``
        are treated as categorical.

    Returns
    -------
    None
        The outcome is reported with ``print`` only.

    Behaviour
    ---------
    * If the two frames do not have exactly the same set of categorical
      columns, a single message says so and no values are compared.
    * Otherwise every categorical column is compared (missing values are
      ignored) and each column whose set of unique values differs is reported.
    * If no column differs, a confirmation message is printed.
    """
    categorical_dtypes = ['object', 'category']
    train_columns = train_df.select_dtypes(include=categorical_dtypes).columns.tolist()
    test_columns = test_df.select_dtypes(include=categorical_dtypes).columns.tolist()

    # Compare the column *sets*. Checking only for an empty intersection let
    # partially overlapping frames through, and the loop below then failed with
    # KeyError on the first column missing from train_df.
    if set(train_columns) != set(test_columns):
        print('Train and test dataframes have different categorical columns')
        return

    all_consistent = True
    for cat_col in test_columns:
        train_values = set(train_df[cat_col].dropna().unique().tolist())
        test_values = set(test_df[cat_col].dropna().unique().tolist())

        if train_values != test_values:
            # Keep going so every mismatching column is reported, not only the first.
            all_consistent = False
            print(f'{cat_col} column has different unique values in train and test data:')
            print(f'Unique values in train data: {train_values}')
            print(f'Unique values in test data: {test_values}')

    if all_consistent:
        print('All categorical columns have consistent unique values in train and test data.')
    return

validate_test_data_categorical_columns(X_train, X_test)

## Encoding Categorical Data

In [19]:
label_encoder = LabelEncoder()

# Columns of df that hold category labels and get integer-encoded below.
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
    'Contract', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'PaymentMethod',
]

# One fitted encoder per column, keyed by column name, so that the
# label <-> code mapping of every column can be inspected afterwards.
label_encoders = {}

for column in categorical_columns:
    le = LabelEncoder().fit(df[column])
    df[column] = le.transform(df[column])
    label_encoders[column] = le

print("Encoded DataFrame:")
df.head()

In [20]:
# For every encoded column, list which integer code stands for which label.
print("Label Encoding Mapping:")
for column, le in label_encoders.items():
    print(f"\nColumn: {column}")
    mapping_lines = [f"{code}: {label}" for code, label in enumerate(le.classes_)]
    if mapping_lines:
        print("\n".join(mapping_lines))

In [21]:
# Encode the train and test features with ONE encoder per column, fitted on the
# training split only. Fitting a second encoder on X_test can assign different
# integer codes to the same label whenever the two splits do not contain
# exactly the same categories, which silently corrupts the test features.
# A fresh encoder per column also avoids re-fitting an encoder object that is
# stored in `label_encoders`.
for column in categorical_columns:
    split_encoder = LabelEncoder().fit(X_train[column])
    X_train[column] = split_encoder.transform(X_train[column])
    # transform() raises ValueError if X_test holds a label unseen in X_train.
    X_test[column] = split_encoder.transform(X_test[column])

In [22]:
# Category labels each persisted encoder is fitted on, keyed by column name.
# NOTE(review): these lists are written by hand; they are assumed to match the
# categories present in the dataset - confirm against the printed mapping.
yes_no = ['No', 'Yes']
internet_addon = ['No', 'No internet service', 'Yes']

encoder_classes = {
    'gender': ['Female', 'Male'],
    'Partner': yes_no,
    'Dependents': yes_no,
    'PhoneService': yes_no,
    'PaperlessBilling': yes_no,
    'Contract': ['Month-to-month', 'One year', 'Two year'],
    'MultipleLines': ['No', 'No phone service', 'Yes'],
    'InternetService': ['DSL', 'Fiber optic', 'No'],
    'OnlineSecurity': internet_addon,
    'OnlineBackup': internet_addon,
    'DeviceProtection': internet_addon,
    'TechSupport': internet_addon,
    'StreamingTV': internet_addon,
    'StreamingMovies': internet_addon,
    'PaymentMethod': ['Bank transfer (automatic)', 'Credit card (automatic)', 'Electronic check', 'Mailed check'],
}

label_encoders = {
    column: LabelEncoder().fit(classes)
    for column, classes in encoder_classes.items()
}

# Persist the encoders so the prediction class can reload them later.
with open('encoder.pkl', 'wb') as file:
    pickle.dump(label_encoders, file)

# Exploratory Data Analysis

In [23]:
scaler = StandardScaler()

features = ['MonthlyCharges', 'TotalCharges', 'tenure']

# Standardise into a separate frame so that df keeps tenure and charges in
# their original units for the summaries and plots in this section.
# NOTE: the models further down are fitted on X_train / X_test, which this
# cell does not touch, so they are trained on unscaled features either way.
df_scaled = df.copy()
df_scaled[features] = scaler.fit_transform(df[features])

## Data Summary

In [24]:
# Data Summary
SUMMARY_SEPARATOR = '-----------------------------------------------------------'


def _value_counts(column, by=None):
    """Return value counts of df[column], computed per group when `by` is given."""
    selected = df if by is None else df.groupby(by)
    return selected[column].value_counts()


def _print_block(title, result, separator=True):
    """Print a titled result, optionally followed by the separator line."""
    print(f"\n{title}")
    print(result)
    if separator:
        print(SUMMARY_SEPARATOR)


print("---- Data Summary ----")

# Partner and Senior Citizen summary
_print_block("Count of people with or without partners:", _value_counts('Partner'))
_print_block("Count of people with or without partners by gender:", _value_counts('Partner', by='gender'))

# Dependents summary
_print_block("Count of people with and without dependents:", _value_counts('Dependents'))
_print_block("Count of dependents by Senior Citizen status:", _value_counts('Dependents', by='SeniorCitizen'))
_print_block("Count of dependents by Gender:", _value_counts('Dependents', by='gender'))

# Tenure summary
# Read tenure from X: X is split off before any scaling, so it is guaranteed to
# still be in months even if df['tenure'] has been standardised.
tenure_months = X['tenure']
print("\nTenure summary:")
print(f"Minimum Tenure: {tenure_months.min()} months")
print(f"Maximum Tenure: {tenure_months.max()} months")
print(f"Average Tenure: {tenure_months.mean():.1f} months")
print(f"Most Common Tenure: {tenure_months.mode()[0]} months")
print(SUMMARY_SEPARATOR)

# PhoneService summary
_print_block("PhoneService subscription counts:", _value_counts('PhoneService'))
_print_block("PhoneService by Senior Citizen status:", _value_counts('SeniorCitizen', by='PhoneService'))

# MultipleLines summary
_print_block("Multiple Lines subscription counts:", _value_counts('MultipleLines'))

# InternetService counts, overall and by gender / Senior Citizen status
_print_block("Internet Service counts:", _value_counts('InternetService'))
_print_block("Internet Service distribution by gender:", _value_counts('InternetService', by='gender'))
_print_block("Internet Service distribution by Senior Citizen status:", _value_counts('InternetService', by='SeniorCitizen'))

# Streaming Services summary
_print_block("StreamingTV distribution by Dependents:", _value_counts('StreamingTV', by='Dependents'))
_print_block("StreamingMovies distribution:", _value_counts('StreamingMovies'))

# Contract summary
_print_block("Contract distribution:", _value_counts('Contract'))

# Paperless Billing summary
paperless_counts = _value_counts('PaperlessBilling')
_print_block("Paperless Billing distribution:", paperless_counts, separator=False)

total_paperless = paperless_counts.sum()
# PaperlessBilling is label-encoded at this point; with sorted classes
# ['No', 'Yes'] code 0 means "not paperless" (paper bills) and 1 "paperless".
# The shares are multiplied by 100 because they are printed with a % sign.
paperless_use_percentage = paperless_counts[0] / total_paperless * 100
paperless_no_use_percentage = paperless_counts[1] / total_paperless * 100

print(f"{paperless_use_percentage:.2f} % use Paper Billing while {paperless_no_use_percentage:.2f}% do not use Paper Billing")
print(SUMMARY_SEPARATOR)

_print_block("Paperless Billing distribution by Gender:", _value_counts('PaperlessBilling', by='gender'))
_print_block("Paperless Billing distribution by Senior Citizen status:", _value_counts('PaperlessBilling', by='SeniorCitizen'))
_print_block("Paperless Billing distribution by Dependents:", _value_counts('PaperlessBilling', by='Dependents'))
_print_block("Paperless Billing distribution by Contract type:", _value_counts('PaperlessBilling', by='Contract'), separator=False)

## Visualize Distributions

In [25]:
def _show_countplot_row(figsize, panels):
    """Draw one figure containing a single row of count plots of df.

    figsize : (width, height) passed to plt.figure.
    panels  : sequence of (title, x, hue) tuples, one per subplot, drawn
              left to right.
    """
    plt.figure(figsize=figsize)
    for position, (title, x, hue) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        sns.countplot(data=df, x=x, hue=hue)
        plt.grid()
        plt.title(title)
    plt.tight_layout()
    plt.show()


print("---- Visualizations ----")

# Gender Distribution
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='gender')
plt.grid()
plt.title("Gender Distribution")
plt.show()

# SeniorCitizen vs Gender
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='SeniorCitizen', hue='gender')
plt.grid()
plt.title("SeniorCitizen VS Gender")
plt.show()

# Partner vs SeniorCitizen and Partner vs Gender
_show_countplot_row((11, 6), [
    ("Partner VS Senior Citizen", 'Partner', 'SeniorCitizen'),
    ("Partner VS Gender", 'Partner', 'gender'),
])

# Dependents vs SeniorCitizen and Dependents vs Gender
_show_countplot_row((11, 6), [
    ("Dependents VS Senior Citizen", 'Dependents', 'SeniorCitizen'),
    ("Dependents VS Gender", 'Dependents', 'gender'),
])

# Tenure Distribution by Gender
plt.figure(figsize=(8, 5))
sns.histplot(data=df, x='tenure', hue='gender', kde=True, multiple="stack")
plt.title("Tenure Distribution by Gender")
plt.grid()
plt.show()

# PhoneService related plots
_show_countplot_row((16, 5), [
    ("PhoneService VS Dependents", 'PhoneService', 'Dependents'),
    ("PhoneService VS Senior Citizen", 'PhoneService', 'SeniorCitizen'),
    ("PhoneService VS Gender", 'PhoneService', 'gender'),
])

# InternetService related plots
_show_countplot_row((16, 4), [
    ("InternetService VS PhoneService", 'InternetService', 'PhoneService'),
    ("InternetService VS SeniorCitizen", 'InternetService', 'SeniorCitizen'),
    ("InternetService VS Gender", 'InternetService', 'gender'),
])

# StreamingMovies related plots (StreamingMovies is the hue in each panel)
_show_countplot_row((16, 4), [
    ("StreamingMovies VS Dependents", 'Dependents', 'StreamingMovies'),
    ("StreamingMovies VS SeniorCitizen", 'SeniorCitizen', 'StreamingMovies'),
    ("StreamingMovies VS Gender", 'gender', 'StreamingMovies'),
])

# Contract related plots (Contract is the hue in each panel)
_show_countplot_row((16, 4), [
    ("Contract VS Dependents", 'Dependents', 'Contract'),
    ("Contract VS SeniorCitizen", 'SeniorCitizen', 'Contract'),
    ("Contract VS Gender", 'gender', 'Contract'),
])

# PaperlessBilling vs Dependents and SeniorCitizen (PaperlessBilling is the hue)
_show_countplot_row((12, 6), [
    ("PaperlessBilling VS Dependents", 'Dependents', 'PaperlessBilling'),
    ("PaperlessBilling VS SeniorCitizen", 'SeniorCitizen', 'PaperlessBilling'),
])

# PaperlessBilling vs Gender and Contract (PaperlessBilling is the hue)
_show_countplot_row((12, 6), [
    ("PaperlessBilling VS Gender", 'gender', 'PaperlessBilling'),
    ("PaperlessBilling VS Contract", 'Contract', 'PaperlessBilling'),
])

## Correlation Matrix

### Independent Features Correlation

In [26]:
# Select the numeric columns and drop the target explicitly. Churn is only
# excluded by select_dtypes while it is still non-numeric; once it has been
# mapped to 0/1 it would otherwise show up among the independent features.
independent_features_df = (
    df.select_dtypes(include=['number'])
      .drop(columns=['Churn'], errors='ignore')
)

# Compute the correlation matrix
corr_matrix = independent_features_df.corr()

# Create a mask to hide the upper triangle (including the diagonal) of the heatmap
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set the figure size and font scale for better readability
plt.figure(figsize=(18, 8))
sns.set(font_scale=1.2)

# Create the heatmap
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", mask=mask, vmin=-1, vmax=1)

# Set the title and show the plot
plt.title("Independent Features Correlation Heatmap")
plt.show()

### Independent features correlation with prediction labels

In [27]:
# Convert the target to 0/1 once. Skipping the map when Churn is already
# numeric keeps the cell re-runnable: mapping 0/1 through the 'Yes'/'No' dict
# a second time would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Correlation of every numeric column with Churn, kept as a one-column
# DataFrame because sns.heatmap needs 2-D input.
correlation_data = df.select_dtypes(include=['number']).corr()[['Churn']]

# Create the heatmap
plt.figure(figsize=(15, 8))
sns.set(font_scale=1.2)
sns.heatmap(correlation_data, annot=True, cmap="coolwarm", cbar=True, vmin=-1, vmax=1)

plt.title("Correlation Heatmap between Independent Features and Churn")
plt.show()

# **Model Building**

In [28]:
def print_evaluation_metrics(y_true, y_pred):
    """Print and plot classification metrics for one set of predictions.

    Prints accuracy, precision, recall and F1, the scikit-learn classification
    report and the confusion matrix, and draws the confusion matrix with
    display labels 0 and 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` in that order.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    results = [accuracy, precision, recall, f1]

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    # F1 is part of the returned results, so report it alongside the others.
    print(f"F1 Score: {f1:.2f}")
    print()

    class_report = classification_report(y_true, y_pred)
    print("Classification Report:")
    print(class_report)

    conf_matrix = confusion_matrix(y_true, y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=[0, 1])
    print("Confusion Matrix:")
    print(conf_matrix)
    cm_display.plot()
    plt.show()
    print()

    return results


In [29]:
def k_fold_cross_validation_with_metrics(classifier, X, y, k_folds=5):
    """Run stratified k-fold cross-validation and print the mean metrics.

    Parameters
    ----------
    classifier : scikit-learn estimator
        Used only as a template: an unfitted clone is trained on every fold,
        so the object passed in (and any model already fitted on the full
        training set) is left untouched.
    X : pandas.DataFrame
        Feature rows; indexed positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned with ``X``; indexed positionally with ``.iloc``.
    k_folds : int, default 5
        Number of stratified folds (shuffled with a fixed seed of 42).

    Returns
    -------
    None
        Mean accuracy, precision and recall across folds are printed.
    """
    # Local import: the notebook's import cell does not bring `clone` into scope.
    from sklearn.base import clone

    # Initializing stratified k-fold cross-validation
    stratified_kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Per-fold evaluation metrics
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    for train_index, test_index in stratified_kf.split(X, y):
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        # Fit a fresh copy. Calling classifier.fit() here would overwrite the
        # caller's fitted model with one trained on a single fold's subset.
        fold_model = clone(classifier)
        fold_model.fit(X_fold_train, y_fold_train)

        y_fold_pred = fold_model.predict(X_fold_test)

        accuracy_scores.append(accuracy_score(y_fold_test, y_fold_pred))
        precision_scores.append(precision_score(y_fold_test, y_fold_pred))
        recall_scores.append(recall_score(y_fold_test, y_fold_pred))

    # Calculate and print the mean of each metric across all folds
    mean_accuracy = np.mean(accuracy_scores)
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    print("Mean Metrics Across Folds:")
    print(f"Mean Accuracy: {mean_accuracy:.2f}")
    print(f"Mean Precision: {mean_precision:.2f}")
    print(f"Mean Recall: {mean_recall:.2f}")

## **K Nearest Neighbour (KNN)**

### **Training the Model**

In [30]:
# Fit a K-nearest-neighbours classifier, constructed with no arguments
# (library defaults), on the training split.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

### **Applying the Model to make a Prediction**

In [31]:
y_knn_pred = knn.predict(X_test)

In [32]:
y_knn_pred

### **Evaluate Model Performance**

In [33]:
knn_metrics = print_evaluation_metrics(y_test, y_knn_pred)

In [34]:
# One-row summary for the comparison table. Building the row directly (rather
# than transposing a single mixed-type column) keeps the metric columns
# numeric instead of object-typed.
knn_results = pd.DataFrame(
    [['K Nearest Neighbour', knn_metrics[0], knn_metrics[1], knn_metrics[2], knn_metrics[3]]],
    columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [35]:
knn_results

## **Support Vector Machines (SVM)**



### **Training the Model**

In [36]:
# SVC is already imported in the import cell at the top of the notebook, so the
# duplicate mid-notebook import is dropped.
# Fit a support vector classifier with default arguments on the training split.
svm = SVC()
svm.fit(X_train, y_train)

### **Applying the Model to make a Prediction**

In [37]:
y_svm_pred = svm.predict(X_test)

In [38]:
y_svm_pred

### **Evaluate Model Performance**

In [39]:
svm_metrics = print_evaluation_metrics(y_test, y_svm_pred)

In [40]:
# One-row summary for the comparison table. Building the row directly (rather
# than transposing a single mixed-type column) keeps the metric columns
# numeric instead of object-typed.
svm_results = pd.DataFrame(
    [['Support Vector Machines', svm_metrics[0], svm_metrics[1], svm_metrics[2], svm_metrics[3]]],
    columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [41]:
svm_results

## **Logistic Regression**

### **Training the Model**

In [42]:
# Fit logistic regression on the training split.
# TODO: max_iter=1000 is a stop-gap; remove it once the data preprocessing is finished.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

### **Applying the Model to make a Prediction**

In [43]:
y_lr_pred = lr.predict(X_test)

In [44]:
y_lr_pred

### **Evaluate Model Performance**

In [45]:
lr_metrics = print_evaluation_metrics(y_test, y_lr_pred)

In [46]:
# One-row summary for the comparison table. Building the row directly (rather
# than transposing a single mixed-type column) keeps the metric columns
# numeric instead of object-typed.
lr_results = pd.DataFrame(
    [['Logistic Regression', lr_metrics[0], lr_metrics[1], lr_metrics[2], lr_metrics[3]]],
    columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [47]:
lr_results

## **Decision Tree**

### **Training the Model**

In [48]:
# Fit a decision tree on the training split. random_state is fixed (same seed
# as the train/test split) so the tree and its metrics are reproducible from
# run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

### **Applying the Model to make a Prediction**

In [49]:
y_dt_pred = dt.predict(X_test)

In [50]:
y_dt_pred

### **Evaluate Model Performance**

In [51]:
dt_metrics = print_evaluation_metrics(y_test, y_dt_pred)

In [52]:
# One-row summary for the comparison table. Building the row directly (rather
# than transposing a single mixed-type column) keeps the metric columns
# numeric instead of object-typed.
dt_results = pd.DataFrame(
    [['Decision Tree', dt_metrics[0], dt_metrics[1], dt_metrics[2], dt_metrics[3]]],
    columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [53]:
dt_results

## **Random Forest**

### **Training the Model**

In [54]:
# Fit a random forest on the training split. random_state is fixed (same seed
# as the train/test split) so the forest and its metrics are reproducible from
# run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

### **Applying the Model to make a Prediction**

In [55]:
y_rf_pred = rf.predict(X_test)

In [56]:
y_rf_pred

### **Evaluate Model Performance**

In [57]:
rf_metrics = print_evaluation_metrics(y_test, y_rf_pred)

In [58]:
# One-row summary for the comparison table. Building the row directly (rather
# than transposing a single mixed-type column) keeps the metric columns
# numeric instead of object-typed.
rf_results = pd.DataFrame(
    [['Random Forest', rf_metrics[0], rf_metrics[1], rf_metrics[2], rf_metrics[3]]],
    columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

In [59]:
rf_results

# **Model Comparison**

## Cross Validation

In [60]:
print("Cross Validation Mean Metrics")

# (heading, estimator) pairs, cross-validated on the training split in this order.
cv_candidates = [
    ("K Nearest Neighbour", knn),
    ("Support Vector Machine", svm),
    ("Logistic Regression", lr),
    ("Decision Tree", dt),
    ("Random Forest", rf),
]

for heading, estimator in cv_candidates:
    print(f"\n{heading} CV Mean Metrics: ")
    k_fold_cross_validation_with_metrics(estimator, X_train, y_train)


In [61]:
# Stack the per-model summaries into one comparison table. ignore_index=True
# gives df_models itself a clean 0..n-1 index; previously the reset index was
# only displayed and df_models kept the same index label on every row.
df_models = pd.concat(
    [knn_results, svm_results, lr_results, dt_results, rf_results],
    axis=0,
    ignore_index=True,
)
df_models

In [62]:
# Each entry maps a display name to (fitted model, accuracy on the test split).
models = {
    "K Nearest Neighbour": (knn, knn_metrics[0]),
    "Support Vector Machines": (svm, svm_metrics[0]),
    "Logistic Regression": (lr, lr_metrics[0]),
    "Decision Tree": (dt, dt_metrics[0]),
    "Random Forest": (rf, rf_metrics[0]),
}

for name, (model, acc) in models.items():
    print(f"Accuracy score of {name} is {acc * 100:.2f}%\n")

# max() returns the first entry holding the highest accuracy, the same winner
# the previous `>` loop picked. Unlike that loop it always binds chosen_model,
# so the pickling cell cannot fail with NameError when no accuracy exceeds 0.
Best_model, (chosen_model, best_accuracy) = max(
    models.items(), key=lambda entry: entry[1][1] * 100
)
Best_acc = best_accuracy * 100

# Printing the best model and its score
print(f"Best Model is {Best_model} with {Best_acc:.2f}% accuracy")


In [63]:
with open("customer_churn_model.pkl", 'wb') as model_file:
    pickle.dump(chosen_model, model_file)

# **Deployment of Prediction**

In [64]:
class CustomerChurnClassifier:
    """Load a pickled churn model and its label encoders; classify one customer.

    ``model_path`` must point to a pickled object exposing ``predict``.
    ``encoder_path`` must point to a pickled dict that maps each categorical
    column name (e.g. ``'gender'``, ``'Contract'``) to an object with a
    ``transform`` method.
    """

    def __init__(self, model_path, encoder_path):
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file. Only pass paths to artefacts produced by a trusted source.
        with open(model_path, 'rb') as file:
            self.model = pickle.load(file)

        # Load the label encoders (dict: column name -> fitted encoder)
        with open(encoder_path, 'rb') as file:
            self.encoders = pickle.load(file)

    @staticmethod
    def _check_type(name, value, expected):
        """Raise TypeError unless ``value`` is acceptable for ``expected``.

        An int is accepted where a float is expected (e.g. ``monthly_charges=70``);
        bool is always rejected even though it subclasses int.
        """
        accepted = (int, float) if expected is float else (expected,)
        if isinstance(value, bool) or not isinstance(value, accepted):
            raise TypeError(f"Error: Given {name} ({type(value).__name__}) is not of the expected type ({expected.__name__}).")

    def predict(self, tenure: int, phone_service: str, multiple_lines: str, internet_service: str, online_security: str, online_backup: str, device_protection: str, tech_support: str, streaming_tv: str, streaming_movies: str, contract: str, paperless_billing: str, payment_method: str, monthly_charges: float, total_charges: float, gender: str, senior_citizen: int, partner: str, dependents: str):
        """Return ``'Will Churn'`` or ``"Won't Churn"`` for a single customer.

        Raises
        ------
        TypeError
            If an argument has the wrong type (checked in signature order).
        KeyError
            If the model was fitted on a column this method does not supply.

        Errors raised by an encoder's ``transform`` (for example on a label it
        was not fitted on) propagate unchanged.
        """
        # Checking input datatypes, in signature order
        checked_arguments = [
            ('tenure', tenure, int),
            ('phone_service', phone_service, str),
            ('multiple_lines', multiple_lines, str),
            ('internet_service', internet_service, str),
            ('online_security', online_security, str),
            ('online_backup', online_backup, str),
            ('device_protection', device_protection, str),
            ('tech_support', tech_support, str),
            ('streaming_tv', streaming_tv, str),
            ('streaming_movies', streaming_movies, str),
            ('contract', contract, str),
            ('paperless_billing', paperless_billing, str),
            ('payment_method', payment_method, str),
            ('monthly_charges', monthly_charges, float),
            ('total_charges', total_charges, float),
            ('gender', gender, str),
            ('senior_citizen', senior_citizen, int),
            ('partner', partner, str),
            ('dependents', dependents, str),
        ]
        for name, value, expected in checked_arguments:
            self._check_type(name, value, expected)

        # Feature row keyed by training column name: numeric inputs first,
        # then the label-encoded categorical inputs.
        row = {
            'SeniorCitizen': senior_citizen,
            'tenure': tenure,
            'MonthlyCharges': monthly_charges,
            'TotalCharges': total_charges,
        }
        categorical_inputs = {
            'gender': gender,
            'Partner': partner,
            'Dependents': dependents,
            'PhoneService': phone_service,
            'MultipleLines': multiple_lines,
            'InternetService': internet_service,
            'OnlineSecurity': online_security,
            'OnlineBackup': online_backup,
            'DeviceProtection': device_protection,
            'TechSupport': tech_support,
            'StreamingTV': streaming_tv,
            'StreamingMovies': streaming_movies,
            'Contract': contract,
            'PaperlessBilling': paperless_billing,
            'PaymentMethod': payment_method,
        }
        for column, label in categorical_inputs.items():
            row[column] = self.encoders[column].transform([label])[0]

        # A scikit-learn estimator fitted on a DataFrame records the training
        # column order in feature_names_in_. Use it to lay the features out
        # exactly as during training; a hand-written positional order feeds
        # values into the wrong columns whenever it differs from that order.
        feature_names = getattr(self.model, 'feature_names_in_', None)
        if feature_names is None:
            # No recorded names: fall back to the positional layout above.
            to_predict = np.array(list(row.values())).reshape((1, -1))
        else:
            to_predict = pd.DataFrame([row])[list(feature_names)]

        # Make prediction
        prediction = self.model.predict(to_predict)[0]

        if prediction > 0.5:
            return 'Will Churn'
        else:
            return "Won't Churn"

In [65]:
customer_churn = CustomerChurnClassifier(
    model_path = 'customer_churn_model.pkl', 
    encoder_path = 'encoder.pkl'
)

In [66]:
customer_churn.predict(
    tenure=18,
    phone_service='No',
    multiple_lines='No phone service',
    internet_service='DSL',
    online_security='No',
    online_backup='No',
    device_protection='Yes',
    tech_support='No',
    streaming_tv='Yes',
    streaming_movies='No',
    contract='Month-to-month',
    paperless_billing='No',
    payment_method='Electronic check',
    monthly_charges=70.50,
    total_charges=1250.00,
    gender='Female',
    senior_citizen=0,
    partner='No',
    dependents='Yes'
)

"Won't Churn"