 <div style="text-align:center; color:red; font-size:40px;">Customer Churn Prediction</div>

# What Is Customer Churn?

**Customer churn**, also known as customer attrition, refers to the rate at which customers stop doing business with a company over a given period.
 It is a key metric for businesses, especially in subscription-based models, telecommunications, banking, e-commerce, and SaaS (Software as a Service) industries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below —
# consider narrowing the filter.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'fivethirtyeight' style sheet to all figures.
plt.style.use('fivethirtyeight')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the churn CSV from the working directory; `data` keeps the frame exactly as loaded.
data=pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# All cleaning below is done on a copy so `data` is never altered.
df0=data.copy()
df0.head()

In [None]:

# Report the (rows, columns) shape of the working frame.
print(f'the Shape of Data: {df0.shape}')
   

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
df0.describe().T

In [None]:
# List the distinct values of every column, followed by a divider line.
for col in df0.columns:
    distinct_values = df0[col].unique().tolist()
    print(f'the  values of {col}:{distinct_values}')
    print('******************************************************************')

In [None]:
# Column dtypes and non-null counts of the frame as loaded.
df0.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df0.duplicated().sum()


In [None]:
# Remove the customer identifier column.
# Re-binding with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second execution no longer fails with KeyError once the column is already gone.
df0=df0.drop(columns=['customerID'],errors='ignore')
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
df0['TotalCharges']=pd.to_numeric(df0['TotalCharges'],errors='coerce')



In [None]:
# Column dtypes after the TotalCharges conversion above.
df0.dtypes

In [None]:
# Frequency table of every column, followed by a divider line.
for col in df0.columns:
    frequencies = df0[col].value_counts()
    print(f'{col}: {frequencies}')
    print('*********************************************************')

In [None]:
# Missing values per column (the numeric coercion of TotalCharges can introduce NaN).
df0.isna().sum()

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
df0.info()

# Exploratory Data Analysis (EDA)

In [None]:
# Separate copy used for the exploratory analysis, leaving df0 as the cleaned baseline.
df1=df0.copy()

In [None]:
# Continuous features: every float/int column except SeniorCitizen,
# which this notebook analyses together with the categorical columns.
numeric_frame = df1.select_dtypes(include=[float,int])
numcol = [name for name in numeric_frame.columns if name != 'SeniorCitizen']

# Report the skewness of each continuous feature.
for col in numcol:
    skewness = df1[col].skew()
    print(f'The skewness of {col}:{skewness}')
    print('******************************************************')

In [None]:
# Univariate analysis of the categorical features (object columns plus SeniorCitizen).
catcol=df1.select_dtypes(include=object).columns.tolist() + ['SeniorCitizen']
sns.set_palette('pastel')
for col in catcol:
    fig, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: category counts, split by churn outcome.
    sns.countplot(data=df1, x=df1[col], hue='Churn', ax=count_ax)
    count_ax.set_title(f'Distribution of {col}', fontsize=12)

    # Right panel: share of each category.
    df1[col].value_counts().plot.pie(autopct='%1.1f%%', ax=pie_ax)
    pie_ax.set_title(f'{col} Distribution', fontsize=12)

    fig.tight_layout()
    plt.show()

In [None]:
# Distribution of each continuous feature: histogram with KDE next to a boxplot.
for col in numcol:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12,5))

    # Left panel: histogram with a kernel-density overlay.
    sns.histplot(data=df1, x=df1[col], kde=True, color='blue', edgecolor='black', ax=hist_ax)
    hist_ax.set_title(f'{col} Distribution', fontsize=14)
    hist_ax.set_ylabel('Frequency', fontsize=12)

    # Right panel: boxplot of the same column.
    sns.boxplot(y=df1[col], color='orange', ax=box_ax)
    box_ax.set_title(f"Boxplot of {col}", fontsize=14)
    box_ax.set_ylabel(col, fontsize=12)

    fig.tight_layout()
    plt.show()

# Building Models

In [None]:
# Modelling works on its own copy; df1 stays as the un-encoded, un-scaled reference.
df2=df1.copy()
df2.head()

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV,KFold 
from sklearn.preprocessing import StandardScaler,LabelEncoder
from imblearn.over_sampling import SMOTE

In [None]:
# Encode the target: 'Yes' -> 1, 'No' -> 0.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# df2['Churn'] selection is a chained in-place update, which is deprecated and leaves df2
# unchanged under pandas Copy-on-Write.
# astype(int) pins the dtype explicitly, and re-running the cell is a harmless no-op.
df2['Churn']=df2['Churn'].replace({'Yes':1,'No':0}).astype(int)
df2['Churn']

In [None]:
# Confirm the dtypes of the modelling frame after the target has been encoded.
df2.info()

In [None]:
numerical_col=['tenure', 'MonthlyCharges','TotalCharges']
# The categorical feature list is taken from df1 (never encoded) with the target excluded.
# Reading it from df2 made the cell non-repeatable: on a second run df2 holds no object
# columns any more, so `encoders` came back empty and was then pickled as such.
categorical_col=[col for col in df1.select_dtypes(include=object).columns if col!='Churn']

# Raw (unscaled, unencoded) values for exactly the rows currently present in df2.
raw=df1.loc[df2.index]

# NOTE(review): the scalers and encoders are fitted on the whole data set, before the
# train/test split performed further down — consider fitting on the training part only.

# Standardise the numerical features, keeping one fitted StandardScaler per column.
# Fitting and transforming always start from the raw df1 values, so running the cell
# again reproduces the same result instead of scaling already-scaled data.
standard={}
for col in numerical_col:
    stsc=StandardScaler()
    stsc.fit(df1[[col]])
    df2[col]=stsc.transform(raw[[col]])
    standard[col]=stsc


# Label-encode the categorical features, keeping one fitted LabelEncoder per column.
encoders={}
for col in categorical_col:
    labelencoder=LabelEncoder()
    labelencoder.fit(df1[col])
    df2[col]=labelencoder.transform(raw[col])
    encoders[col]=labelencoder




In [None]:
# Display the fitted scalers, keyed by column name.
standard

In [None]:
# Display the fitted label encoders, keyed by column name.
encoders

In [None]:
import pickle

# Persist the fitted encoders and scalers so the same transformations can be reused later.
for file_name, fitted_objects in (('encoder.pkl', encoders), ('scaler.pkl', standard)):
    with open(file_name,'wb') as f:
        pickle.dump(fitted_objects,f)

In [None]:
# Discard every row that still contains a missing value, then confirm none remain.
df2=df2.dropna()
df2.isna().sum()

In [None]:
# Define the output and the inputs.
# Features are selected by name (everything except the target) rather than by position,
# so the split stays correct even if 'Churn' is not the last column.
X=df2.drop(columns=['Churn'])
y=df2['Churn']

# Hold out 20% of the rows for testing. stratify=y keeps the churn / no-churn ratio
# the same in both parts, which matters because the target is imbalanced.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
X.isna().sum()


In [None]:
# Imbalance ratio of the training target: largest class count / smallest class count.
class_counts=y_train.value_counts()
max_count=class_counts.max()
min_count=class_counts.min()
IR=max_count/min_count
IR
# the output is imbalanced

In [None]:
# Resampling the output
from imblearn.over_sampling import SMOTE
# Only the training split is resampled; X_test / y_test are not passed to SMOTE.
smote=SMOTE(random_state=42)
X_train_smote,y_train_smote=smote.fit_resample(X_train,y_train)

# Class counts of the training target after resampling.
y_train_smote.value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers, keyed by display name.
models={
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
}

# Grid-search space for each classifier, keyed by the same names as `models`.
Hyperparmeter={
    'Random Forest': {
        'n_estimators': [50,100,200],
        'max_depth': [3,5,7],
    },
    'XGBoost': {
        'learning_rate': [0.01,0.1,0.2],
        'max_depth': [3,5,7],
    },
}

In [None]:
# Oversampling has to happen inside each cross-validation split. Tuning on data that was
# resampled beforehand lets synthetic neighbours of the validation rows end up in the
# fitting folds and inflates the scores. The imblearn Pipeline applies SMOTE to the
# fitting folds only and leaves every validation fold untouched.
from imblearn.pipeline import Pipeline as ImbPipeline

cv1=KFold(n_splits=5,shuffle=True,random_state=42)
best_models={}
for model_name,model in models.items():
    print(f"Tunning {model_name}..............")
    pipeline=ImbPipeline([('smote',SMOTE(random_state=42)),('model',model)])
    # Route every hyper-parameter to the 'model' step of the pipeline.
    param_grid={f'model__{name}':values for name,values in Hyperparmeter[model_name].items()}
    grid=GridSearchCV(estimator=pipeline,param_grid=param_grid,cv=cv1,scoring='accuracy')
    grid.fit(X_train,y_train)
    # Store the bare classifier (refitted by GridSearchCV on the SMOTE-resampled training
    # split) so that best_models[...] remains a plain estimator for the cells that follow.
    best_models[model_name]=grid.best_estimator_.named_steps['model']
    # Strip the 'model__' prefix so the printed parameters use the classifier's own names.
    best_params={name.split('__',1)[1]:value for name,value in grid.best_params_.items()}
    print(f"Best parameters for {model_name}:{best_params}")
    print(f"The best Accuracy for {model_name}: {grid.best_score_: .2f}\n")









In [None]:
from sklearn.metrics import confusion_matrix,roc_auc_score, classification_report,accuracy_score
# Class predictions of the tuned Random Forest on the resampled training set
# and on the held-out test set.
y_train_pred= best_models['Random Forest'].predict(X_train_smote)
y_test_pred=best_models['Random Forest'].predict(X_test)

In [None]:
# scikit-learn metrics expect (y_true, y_pred) in that order. Passing them the other way
# round transposes the confusion matrix and swaps precision with recall in the report.
# ROC-AUC is a ranking metric, so it is computed from the predicted probability of the
# positive class rather than from hard 0/1 predictions.
rf_model=best_models['Random Forest']
train_scores=rf_model.predict_proba(X_train_smote)[:,1]
test_scores=rf_model.predict_proba(X_test)[:,1]

print(f" The Accuracy of Train data: {accuracy_score(y_train_smote,y_train_pred)}\n The Accuracy of Test data: {accuracy_score(y_test,y_test_pred)}")
print('***************************************************************************************************************************************')
print(f"The confusion matrix of train data:\n {confusion_matrix(y_train_smote,y_train_pred)}\n The confusion matrix of test data\n {confusion_matrix(y_test,y_test_pred)}")
print('***************************************************************************************************************************************')
# The report is computed on the test split, so it is labelled as such.
print(f"the classification report of Test data:\n {classification_report(y_test,y_test_pred)}")
print('***************************************************************************************************************************************')
print(f"The ROC-AUC of train data:\n {roc_auc_score(y_train_smote,train_scores)}\n The ROC-AUC of test data:\n {roc_auc_score(y_test,test_scores)}")

In [None]:
# Visualize the confusion matrices.
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the predictions on
# the columns, which is what the axis labels below state. With the arguments reversed the
# plotted matrix would be the transpose of what the labels claim.
cm_train=confusion_matrix(y_train_smote,y_train_pred)
cm_test=confusion_matrix(y_test,y_test_pred)


sns.set_palette('pastel')
fig, axes = plt.subplots(1, 2, figsize=(6,5))
panels = (
    (axes[0], cm_train, "Confusion Matrix - Train Data"),
    (axes[1], cm_test, "Confusion Matrix - Test Data"),
)
for ax, matrix, title in panels:
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["0", "1"], yticklabels=["0", "1"], ax=ax)
    ax.set_xlabel("Predicted Label", fontsize=10)
    ax.set_ylabel("True Label", fontsize=10)
    ax.set_title(title, fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# The encoders and scalers are already pickled in the cell that follows their fitting,
# so only the tuned Random Forest is written here.
best_rf = best_models['Random Forest']
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(best_rf, model_file)



In [None]:
# Read the persisted model, encoders and scalers back from disk.
# Only unpickle files you created yourself: pickle can run arbitrary code while loading.
with open("best_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
with open("encoder.pkl", "rb") as encoder_file:
    encoders = pickle.load(encoder_file)
with open("scaler.pkl", "rb") as scaler_file:
    scaler_data =pickle.load(scaler_file)

In [None]:
# One raw customer record keyed by feature name: category labels are given as text and
# numeric features as plain numbers, i.e. before any encoding or scaling.
example_input = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}


In [None]:
# Feature names, in order, that the loaded model recorded when it was fitted.
print(loaded_model.feature_names_in_)


In [None]:
# Load models, encoders, and scalers
# (repeats the load from the earlier cell so the helper functions below can be used
# after running this cell alone).
# Only unpickle files you created yourself: pickle can run arbitrary code while loading.
with open("best_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)
with open("encoder.pkl", "rb") as f:
    encoders = pickle.load(f)
with open("scaler.pkl", "rb") as f:
    scaler_data = pickle.load(f)

# Function to preprocess new input data
def preprocess_input(input_data):
    """Turn one raw customer record into a single-row, model-ready DataFrame.

    Parameters
    ----------
    input_data : dict
        Maps feature name to its raw value (category label or number).

    Returns
    -------
    pandas.DataFrame
        One row. Columns that have an entry in ``encoders`` are label-encoded and
        columns that have an entry in ``scaler_data`` are scaled; every other column
        is passed through unchanged.

    Raises
    ------
    ValueError
        If an encoder rejects a value (for example a category it was not fitted on).
        The message names the offending column.
    """
    input_df = pd.DataFrame([input_data])

    # Apply encoding to categorical features
    for col, encoder in encoders.items():
        if col not in input_df.columns:  # columns absent from the record are skipped
            continue
        try:
            input_df[col] = encoder.transform(input_df[col])
        except ValueError as exc:
            # Re-raise with the column name so the caller can tell which field was rejected.
            raise ValueError(f"Cannot encode column '{col}': {exc}") from exc

    # Apply scaling to numerical features
    for col, scaler in scaler_data.items():
        if col in input_df.columns:  # columns absent from the record are skipped
            input_df[col] = scaler.transform(input_df[[col]])

    return input_df

# Function to make prediction
def make_prediction(input_data):
    """Predict churn for one raw customer record.

    Parameters
    ----------
    input_data : dict
        Maps feature name to its raw value; passed to ``preprocess_input``.

    Returns
    -------
    tuple
        ``("Churn" or "No Churn", probability)`` where ``probability`` is the value in
        column 1 of ``loaded_model.predict_proba`` for this record.

    Raises
    ------
    ValueError
        If the model recorded its feature names and the record lacks one of them.
    """
    input_df = preprocess_input(input_data)

    # Put the columns in the order the model recorded at fit time, so the result does not
    # depend on the key order of the input dict; keys the model does not know are dropped.
    expected = getattr(loaded_model, "feature_names_in_", None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in input_df.columns]
        if missing:
            raise ValueError(f"Missing input feature(s): {missing}")
        input_df = input_df[expected]

    # Predicting churn and probability using the loaded model
    prediction = loaded_model.predict(input_df)[0]
    probability = loaded_model.predict_proba(input_df)[0, 1]

    return "Churn" if prediction == 1 else "No Churn", probability



In [None]:
# Example input data (replace with actual new input data)
# Values are raw: category labels as text, numeric features unscaled.
example_input = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}

# Get prediction for the example input
# make_prediction returns the label ("Churn" / "No Churn") and the positive-class probability.
prediction, probability = make_prediction(example_input)
print(f"Prediction: {prediction}, Probability: {probability:.2f}")
