# Problem_Statement :-- 
##### Churn prediction is one of the most popular big data use cases in business. It consists of detecting customers who are likely to cancel a subscription to a service.
##### Churn is a problem for telecom companies because it is more expensive to acquire a new customer than to keep an existing one from leaving.

# OBJECTIVE OF THE PROJECT :-
#### >>To predict customer churn
#### >>Highlighting the main variables/factors influencing customer churn
#### >>Use various ML algorithms to build prediction models, and evaluate the accuracy and performance of these models.
#### >>Finding out the best model for our business case & providing executive summary

## Importing libraries and Loading the DATASET

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the churn dataset from a CSV file located in the working directory.
df = pd.read_csv("telecommunications_churn.csv")

In [None]:
# Preview the first rows to sanity-check that the file was parsed as expected.
df.head()

## Data Preprocessing 

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## Exploratory Data Analysis[EDA]

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline    
# AutoViz is handed the CSV path, so it reads the file from disk itself
# rather than reusing the already-loaded `df`.
av = AutoViz_Class()
df_av = av.AutoViz("telecommunications_churn.csv")

## Feature Engineering

In [None]:
#### Feature Engineering method-1
# Correlation of every column with the target column `churn`.
df.corr()['churn']

In [None]:
# Feature matrix and target for RFE.
# Features are selected by name rather than by position so the target can
# never end up inside X if the column order or count of the CSV changes.
X = df.drop(columns='churn')
Y = df['churn']

In [None]:
# Display the feature matrix that is passed to RFE.
X

In [None]:
#### Feature Engineering method-2
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Base estimator handed to RFE in the next cell.
# NOTE(review): no random_state is set, so the resulting ranking may vary between runs.
model = DecisionTreeClassifier()

In [None]:
# Recursive feature elimination: keep the 6 features the tree finds most useful.
# `fit` holds the value returned by `rfe.fit`, whose `ranking_` is read later.
rfe = RFE(estimator=model, n_features_to_select=6)
fit = rfe.fit(X, Y)

In [None]:
# RFE rank per feature (rank 1 = selected by RFE).
pd.DataFrame({'Columns':X.columns,"Ranking":fit.ranking_})

In [None]:
# Correlation with `churn` again, to compare side by side with the RFE ranking.
df.corr()['churn']

###### We compare the correlation method with the RFE method. Some columns have low correlation with the target variable, hence we drop those columns (evening_calls, night_calls).

In [None]:
# Remove the two call-count columns judged uninformative; `df` is mutated in place.
df.drop(columns=['evening_calls', 'night_calls'], inplace=True)

In [None]:
# Confirm the two columns are gone.
df.head()

In [None]:
# Column count should be two lower than before the drop.
df.shape

### Feature Selection

In [None]:
# Model inputs: every remaining column except the target, selected by name.
# A positional slice silently depends on `churn` being the last column and on
# the exact number of columns; dropping by name matches how the pipeline's
# training matrix is built later in this notebook.
x = df.drop(columns='churn')
y = df['churn']

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; `x` becomes the scaled array.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split below, so test-set statistics influence the scaling.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# 65% of the rows go to training, the rest to the hold-out test set.
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.65,
    random_state=42,
)

# MODEL BUILDING

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [None]:
# Candidate classifiers as (short label, estimator with default settings) pairs.
model = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier()),
    ('XGBM', XGBClassifier()),
    ('SVM', SVC()),
    ('DT', DecisionTreeClassifier()),
]

In [None]:
# Per-model fold accuracies and the matching labels, in the same order.
results = []
names = []

# A single 10-fold splitter serves every model; it is not shuffled, so the
# folds are identical for each candidate.
kfold = KFold(n_splits=10)

for name, models in model:
    cv_results = cross_val_score(models, x, y, cv=kfold, scoring='accuracy')
    names.append(name)
    results.append(cv_results)
    # Report "label: mean accuracy (std across folds)".
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# XGBOOST and RANDOM FOREST have high accuracy compared to the other models

### Hyper Parameter Optimization

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier

# 10-fold splitter used to score every hyper-parameter combination.
kfold = KFold(n_splits=10)
f_model = RandomForestClassifier()

# Search space: 4 forest sizes x 2 split criteria = 8 candidates.
param = {"n_estimators":[50,100,150,200],"criterion":["gini","entropy"]}

# Pass the configured splitter. Previously a fresh `KFold()` (default number
# of folds) was passed here and the 10-fold `kfold` above was never used.
grid = GridSearchCV(estimator=f_model, param_grid=param, cv=kfold)

In [None]:
# Run the search on the training split only; the test split stays untouched.
grid_result = grid.fit(x_train, y_train)

In [None]:
# Best mean cross-validated score and the parameter combination that achieved it.
print('Best : {}, using {}'.format(grid_result.best_score_,grid_result.best_params_))

In [None]:
# Build the final model from the hyper-parameters the grid search selected.
# Reading them from `grid_result` instead of hard-coding them keeps this cell
# in sync with the search, whose winner can change between runs because the
# forests are randomised.
model_final = RandomForestClassifier(**grid_result.best_params_)
model_final.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split with the final model.
pred=model_final.predict(x_test)

In [None]:
# Accuracy on the test data (true labels first, predictions second).
from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred)

In [None]:
# Confusion matrix. scikit-learn's signature is (y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the arguments the
# other way round yields the transposed matrix and swaps FP with FN.
confusion_matrix(y_test, pred)

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import set_config
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Two-stage pipeline: standardise the features, then classify with a random forest.
# The first element of each tuple is the name under which the step is registered.
steps=[("Standard Scaler",StandardScaler()),
      ("Model",RandomForestClassifier(criterion= 'gini', n_estimators= 150))]

In [None]:
# Chain the steps so that fit/predict run them in order.
pipe=Pipeline(steps)

In [None]:
# Switch estimator display to the diagram view, then show the pipeline.
set_config(display="diagram")
pipe

In [None]:
# Features and target taken straight from `df`; scaling is done by the
# pipeline's own StandardScaler step.
X = df.drop("churn",axis=1)
y = df["churn"]

In [None]:
# Display the pipeline's input features.
X

In [None]:
# NOTE(review): DataFrameMapper is imported but not used in the visible cells.
from sklearn_pandas import DataFrameMapper
# Column names the pipeline is trained on.
X.columns

In [None]:
# Fit scaler + forest on the whole dataset (no hold-out); this is the object that gets pickled.
pipe.fit(X,y)

## Model Saving

In [None]:
import pickle

# Serialise the fitted pipeline to disk. The context manager guarantees the
# file handle is closed even if pickling raises, which the former
# open()/close() pair split across two cells did not.
with open("churn_final_data.pkl", mode="wb") as f1:
    pickle.dump(pipe, f1)