In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for data visualization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
# Note: `dirname` and `filename` stay bound at notebook scope after the loop finishes,
# holding whichever directory/file was visited last.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Standard Lifecycle for any data science project to be followed

1. Domain Exploration
    * Understand the business process, how business functions
    * Identify a few problems and the belief-based solutions
    * Identify opportunities where business relies on expertise of an SME.
2. Data Collection and Data Exploration
    * understand the data structure
    * Ask for data dictionary to the business
    * explore data to identify issues with data quality and patterns in general with in data
3. Data cleaning
    * handle unwanted columns
    * handle missing values
    * handle duplicate entries
    * handle outliers and un natural values
4. Feature Engineering
    * Feature Extraction
        * use data engineering and statistics to create new useful features from existing data
    * Feature Selection - select best features which are relevant to predict the label
        * EDA ( Exploratory Data Analytics ) using Data Visualization
        * EDA using statistics
        * Wrapper Methods
        * Embedded Methods
        * Filter methods
5. Preprocessing of data
    * encoding of data
    * scaling of features
    * splitting data into train and test sets
6. Apply ML to build a predictive model
    * Use a ML algorithm and train using train set
7. Performance Analysis - How good the model is?
    * check performance of model using train data and test data
8. Optimization & Tuning
    * improve the performance of the ML algorithm
9. Deploy the model to production
    * Export the model as a portable file
    * deploy it as REST service
10. Monitoring the performance of a model in production

In [None]:
# loading data
# Locate the dataset explicitly instead of relying on loop variables left over
# from an earlier cell: those hold whichever file os.walk happened to visit last,
# which is not guaranteed to be the CSV (or even to be defined on a fresh kernel).
csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
# With several CSVs present the alphabetically first path is used, so the choice is deterministic.
df = pd.read_csv(csv_paths[0])
df.shape

# 2. Data Exploration

In [None]:
# Preview the first rows to get a feel for the columns and value formats.
df.head()

Observation - 
* Voice mail plan and number vmail messages seem to carry similar information, since customers with a voice mail plan will have a number vmail messages value higher than 0
* total day charge and total day minutes should be correlated, as the charge is calculated by applying a per-minute rate to the minutes; the same applies to the total eve, total night and total intl minutes and charges
* Phone number, state and area code seem to be identifiers, so they may not carry any quantitative or qualitative information

In [None]:
# Column dtypes and non-null counts for every column.
df.info()

In [None]:
# List the distinct values found in the two plan columns
for plan_col in ('international plan', 'voice mail plan'):
    print(df[plan_col].unique())

In [None]:
# Summary statistics for every column, numeric and non-numeric alike (include='all').
df.describe(include='all')

# 3. Data Cleaning

In [None]:

# Count the missing entries in each column
df.isna().sum()

#### No Missing values observed

In [None]:

# check for duplicated rows
# duplicated() flags every row that repeats an earlier row; the sum is the number of such repeats
df.duplicated().sum()

#### No duplicated rows observed

In [None]:
# check for outliers
# Skewness is only defined for numeric data. Restrict the computation to the
# numeric/bool columns explicitly: recent pandas versions raise on the string
# columns (state, phone number, plan flags) instead of silently dropping them.
df.skew(numeric_only=True)

* Number vmail messages - we can take action while performing feature extraction
* Total intl calls - we will take action during correlation analysis
* Customer service calls - the skew is almost 1, thus we can go ahead without an action

# 4. Feature Engineering
## Feature Extraction

In [None]:

# Distribution summary of the voicemail message counts across all customers.
df['number vmail messages'].describe()

In [None]:
# Same summary, restricted to customers with at least one voicemail message
has_vmail = df['number vmail messages'] > 0
df.loc[has_vmail, 'number vmail messages'].describe()

In [None]:
# New categorical feature (bin edges are right-inclusive, matching the pd.cut call in the next cell) = 
                # if numofvmailmessage <=1 = No VM plan
                # if numofvmailmessage >1 and <=38 = Normal users
                # if numofvmailmessage >38 and <=52 = High Frequency users

In [None]:
# Bucket the voicemail message count into three usage bands.
# Intervals are right-closed; include_lowest=True makes the first band include 0.
vm_bin_edges = [0, 1, 38, 52]
vm_bin_labels = ['No VM plan', 'Normal Users', 'High Frequency users']
df['vmail_messages'] = pd.cut(
    df['number vmail messages'],
    bins=vm_bin_edges,
    labels=vm_bin_labels,
    include_lowest=True,
)
df.head(20)

## Feature Selection

### Correlation Analysis

In [None]:
# Pearson correlation is only defined for numeric data. The frame also holds
# string columns and the categorical `vmail_messages` column, which recent pandas
# versions refuse to coerce inside corr(); select the numeric/bool columns first.
cor = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15,10))
sns.heatmap(cor.round(3),annot=True,cmap='coolwarm')
plt.show()

Observation
* Total day minutes & total day charge - correlation 1 >> we can drop one of these
* Total eve minutes & total eve charge - correlation 1 >> we can drop one of these
* Total night minutes & total night charge - correlation 1 >> we can drop one of these
* Total intl minutes & total intl charge - correlation 1 >> we can drop one of these

### Feature selection using ANOVA


#### ANOVA is used for comparing the distribution of a numeric variable in two or more groups
* Ho = Null Hypothesis = the mean of the variable is the same in all groups
* Ha = Alternate Hypothesis = the mean of the variable is different in at least one group
    
    we analyse the pvalue, let's say for a confidence level of 95%, i.e. significance level = 5%

`if pvalue > 0.05 = fail to reject the null hypothesis and the feature is NOT important`
`if pvalue < 0.05 = reject the null hypothesis and the feature is important`

In [None]:
# Show all column names (the frame now also includes the derived `vmail_messages` column).
df.columns

In [None]:
from sklearn.feature_selection import f_classif

# Numeric candidate features to score against the churn label
numerics = [
    'account length', 'number vmail messages',
    'total day minutes', 'total day calls', 'total day charge',
    'total eve minutes', 'total eve calls', 'total eve charge',
    'total night minutes', 'total night calls', 'total night charge',
    'total intl minutes', 'total intl calls', 'total intl charge',
    'customer service calls',
]
xnum = df[numerics]
y = df['churn']

# f_classif yields one F statistic and one p-value per column of xnum
fval, pval = f_classif(xnum, y)
for feature_name, p_value in zip(numerics, pval):
    print(feature_name, p_value)

### Feature selection using Chi Square Test

- Used to compare the distribution of categories of a categorical feature in two or more groups
- in nutshell to compare whether a categorical attribute has some relationship with the other categorical attribute

* H0 = Null Hypothesis = the categorical attribute has the same distribution in all groups (it is independent of the grouping variable)
* Ha = Alternate hypothesis = the categorical attribute has a different distribution in at least one group

We always analyse the pvalue; consider a 95% confidence level, i.e. significance level = 5% (0.05)

`if pvalue > 0.05 = fail to reject the Null hypothesis - feature is not important`
`if pvalue < 0.05 = reject the Null hypothesis - feature is important`

In [None]:

from scipy.stats import chi2_contingency

categories = ['state','area code','phone number', 'international plan',
       'voice mail plan','vmail_messages']

y = df['churn']

# Test each categorical feature for independence from churn using its
# contingency table. sklearn's `chi2` treats the feature values themselves as
# frequencies, so feeding it label-encoded category codes (arbitrary integers)
# yields p-values that depend on the code assignment, not on the association.
for col in categories:
    # astype(str) gives every column plain string categories, so only the
    # categories that actually occur end up as rows of the table
    contingency = pd.crosstab(df[col].astype(str), y)
    # chi2_contingency applies Yates' continuity correction to 2x2 tables by default
    cval, pval, dof, expected = chi2_contingency(contingency)
    print(col, pval)

In [None]:
# Keep the features retained by the correlation / ANOVA / chi-square screening.
# Column order matters: the preprocessing steps address columns by position.
selected_features = [
    'international plan', 'vmail_messages',
    'total day minutes', 'total eve minutes',
    'total night minutes', 'total intl minutes',
    'customer service calls',
]
x = df[selected_features]
y = df['churn']

# 5. Preprocessing

In [None]:
# Preview the selected feature columns before encoding/scaling.
x.head()

#### encoding categorical features

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler

# Each step addresses columns of `x` by position:
#   1    -> one-hot encoded
#   0    -> ordinal encoded
#   2..6 -> standardised
encoding_steps = [
    ('ohe', OneHotEncoder(), [1]),
    ('ode', OrdinalEncoder(), [0]),
    ('sc', StandardScaler(), list(range(2, 7))),
]
preprocessor = ColumnTransformer(encoding_steps, remainder='passthrough')

In [None]:
# Fit the preprocessor on all rows of `x` and transform them in one go.
# The result is a plain array; wrap it in a DataFrame only to display the first rows.
x_new = preprocessor.fit_transform(x)
pd.DataFrame(x_new).head()

In [None]:
# train test split
from sklearn.model_selection import train_test_split
# Split the raw features first, then fit the preprocessor on the training rows
# only. Fitting the StandardScaler on all rows before splitting lets the test
# rows influence the scaling statistics (data leakage) and makes the test
# metrics optimistic. The same random_state keeps the row assignment unchanged.
xtrain_raw,xtest_raw,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=5)
xtrain = preprocessor.fit_transform(xtrain_raw)
xtest = preprocessor.transform(xtest_raw)
print(x.shape)
print(xtrain.shape)
print(xtest.shape)
print(y.shape)
print(ytrain.shape)
print(ytest.shape)

# 6. Apply Machine Learning algorithm - Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# class_weight='balanced' weights each class inversely to its frequency in ytrain
model = LogisticRegression(class_weight='balanced')
model.fit(xtrain,ytrain)


# 7. Performance Analysis

In [None]:
# Score the logistic-regression model on the held-out test split
from sklearn import metrics
ypred = model.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred))

In [None]:
# Score the logistic-regression model on the rows it was trained on
ypred2 = model.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))

### Observation - Recall is not satisfactory
So we will now try other algorithms

# 8. Optimization and Tuning


1. At the data level
    * Extract more features
    * collect more data - features / samples
    * perform better preprocessing
    * improve feature selection process - drop irrelevant features
2. At the modelling level
    * Tune the hyperparameters of the algorithm to improve its performance
    * Change the ML algorithm used for modelling
    * combine multiple algorithms to make predictions

## Decision Tree Classifier

In [None]:
# preprocessing pipeline
# Unlike the earlier preprocessor there is no StandardScaler step here: the
# numeric columns reach the output unchanged through remainder="passthrough".
# Output column order: one-hot columns of x[:, 1], then the ordinal-encoded
# x[:, 0], then the passthrough columns in their original order.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
preprocessor = ColumnTransformer([('ohe',OneHotEncoder(),[1]),
                                ('ode',OrdinalEncoder(),[0])],
                                remainder="passthrough")
x_new = preprocessor.fit_transform(x)
pd.DataFrame(x_new)

In [None]:
# Hold out 20% of the encoded rows for testing (fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(
    x_new, y, test_size=0.2, random_state=5
)
# Report the shapes: features first, then labels
for part in (x, xtrain, xtest, y, ytrain, ytest):
    print(part.shape)

In [None]:
# decision tree
# No depth or leaf-size limits are set, so the tree is free to grow until its leaves are pure.
# NOTE(review): both classes get the same weight (0.5), which does not favour
# either class - confirm whether a different weighting was intended.
from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier(random_state=5,class_weight={0:0.5,1:0.5})
model2.fit(xtrain,ytrain)

### Visualizing the tree model

In [None]:
import graphviz
from sklearn import tree

# Feature names must follow the column order produced by the ColumnTransformer:
#   1. the one-hot columns of `vmail_messages`, whose categories OneHotEncoder
#      sorts alphabetically: 'High Frequency users', 'No VM plan', 'Normal Users'
#   2. the ordinal-encoded `international plan`
#   3. the passthrough numeric columns in their original order
# Listing 'International plan' first (as before) shifted every label of the
# first four columns onto the wrong feature.
fname = ['vmail_HF', 'vmail_NO_Plan', 'vmail_Normal', 'International plan',
       'Total day minutes', 'Total eve minutes', 'Total night minutes',
       'Total intl minutes', 'Customer service calls']
# Class order follows the sorted label values: False (stays) then True (churns)
cname = ['Not Leaving','Leaving']
graphdata = tree.export_graphviz(model2,feature_names=fname,class_names=cname,
                                filled=True,rounded=True)
graph = graphviz.Source(graphdata)
graph


In [None]:
# Score the unconstrained decision tree on the held-out test split
ypred2 = model2.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:

# Score the unconstrained decision tree on its own training rows
ypred2 = model2.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))

## Overfitting

`performance of model on test data = low`

`performance of model on train data = high`

**Reasons for overfitting**
- Noisy features or noisy data
- lack of data - lack of number of observations/rows
- features are having very complex/nonlinear relation with label
- the algorithm used is very complex/nonlinear


**Ways to handle overfitting situation**
- drop noisy/irrelevant features
- keep the features simple, may be convert the numeric to categorical
- collect more data - more rows, NO BENEFIT from collecting more features
- Try a less complex algorithm
- In case of decision tree, decrease the value of max_depth, increase min_samples_leaf and min_samples_split


## Underfitting
`performance of model on test data = low`

`performance of model on train data = low`


**Reasons for underfitting**
- lack of informative features
- lack of a powerful algorithm, as the existing features may have a slightly complex/nonlinear relation with the target which the current algorithm is not able to learn
- presence of noisy observations


**Ways to handle underfitting situation**
- collect/create more features, perform feature extraction
- collect more columns, NO BENEFIT from collecting rows
- Try a more powerful/complex predictive algorithm
- In case of decision tree, increase the value of max_depth, decrease the value of min_samples_leaf and min_samples_split
- perform better data cleaning, handling outliers etc.

## Best fitting

`performance of model on test data = high`

`performance of model on train data = high`

## Hyperparameter Tuning for decision tree using Gridsearch

In [None]:
# Candidate values for the three tree-size hyperparameters
param_grid = {"max_depth":np.arange(3,25,2),
              "min_samples_leaf":np.arange(3,50,2),
              "min_samples_split":np.arange(10,120,5)}
from sklearn.model_selection import GridSearchCV
# Exhaustive search with 5-fold cross-validation, selecting on recall
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=5),
                          param_grid=param_grid,n_jobs=-1,
                          scoring='recall',verbose=True,cv=5)
# Tune on the training split only. Searching over the full data set lets the
# held-out test rows influence the chosen hyperparameters, so the test metrics
# reported afterwards would no longer be an unbiased estimate.
grid_search.fit(xtrain,ytrain)

In [None]:
# Mean cross-validated score (recall, per the `scoring` setting) of the best
# parameter combination, followed by that combination itself.
print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# Controlling overfitting
# Refit the tree with limits on depth and on node/leaf sizes.
# NOTE(review): the values are hard-coded rather than read from
# grid_search.best_params_ - confirm they match the search result.
model2 = DecisionTreeClassifier(criterion='gini',random_state=5,
                               max_depth=8,min_samples_leaf=5,min_samples_split=20)
model2.fit(xtrain,ytrain)


In [None]:
# Score the size-constrained decision tree on the held-out test split
ypred2 = model2.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:
# Score the size-constrained decision tree on its own training rows
ypred2 = model2.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))

### Feature importances

In [None]:
# Print each feature label next to the importance the fitted tree assigned to it
importances = model2.feature_importances_
for position, label in enumerate(fname):
    print(label, importances[position])


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 100 trees limited to depth 8; oob_score=True additionally records an
# out-of-bag score (read from `oob_score_` after fitting).
model4 = RandomForestClassifier(n_estimators=100,random_state=5,
                               max_depth=8,oob_score=True)
#train the model
model4.fit(xtrain,ytrain)

In [None]:
# Score the random forest on the held-out test split
ypred2 = model4.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:
# Score the random forest on its own training rows
ypred2 = model4.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))

In [None]:
#check OOB (out of bag) score
# Available because the forest was built with oob_score=True: each tree is
# scored on the training rows left out of its bootstrap sample.
model4.oob_score_

## Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Up to 120 boosting rounds; learning_rate=0.2 shrinks each round's contribution.
# The base estimator is left at the library default.
model5 = AdaBoostClassifier(n_estimators=120,random_state=5,learning_rate=0.2)
model5.fit(xtrain,ytrain)


In [None]:
# Score the AdaBoost model on the held-out test split
ypred2 = model5.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:

# Score the AdaBoost model on its own training rows
ypred2 = model5.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))

## Gradient Boosting Trees

In [None]:

# Gradient Boosting
# 150 boosting stages with a shrinkage (learning rate) of 0.1; tree size is left at the library defaults.
from sklearn.ensemble import GradientBoostingClassifier
model6 = GradientBoostingClassifier(learning_rate=0.1,n_estimators=150,random_state=5)
model6.fit(xtrain,ytrain)


In [None]:

# Score the gradient-boosting model on the held-out test split
ypred2 = model6.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:
# Score the gradient-boosting model on its own training rows
ypred2 = model6.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))


## XGBoost

In [None]:
from xgboost import XGBClassifier
# 120 trees of depth up to 8, combined with a very small learning rate (0.005).
# NOTE(review): unlike the other models in this notebook, no random_state is
# passed here - confirm whether a fixed seed is wanted for reproducibility.
model7 = XGBClassifier(learning_rate=0.005,n_estimators=120,max_depth=8)
model7.fit(xtrain,ytrain)

In [None]:
# Score the XGBoost model on the held-out test split
ypred2 = model7.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:
# Score the XGBoost model on its own training rows
ypred2 = model7.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))

## Stacking
Using Decision Tree, Random Forest, Gradient Boosting as base learners, logistic regression as meta learner

In [None]:
# Stacked ensemble built with mlxtend (not sklearn's StackingClassifier, whose constructor differs).
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

# Meta learner that combines the base learners' outputs
log_model = LogisticRegression()
# Base learners: single tree, random forest and gradient boosting, all seeded with 5
dt_model = DecisionTreeClassifier(random_state=5,max_depth=10)
rf_model = RandomForestClassifier(n_estimators=100,random_state=5,max_depth=10)
gb_model = GradientBoostingClassifier(learning_rate=0.01,n_estimators=120,random_state=5)


# NOTE(review): presumably the meta classifier is trained on the base learners'
# predictions for the same training rows - verify against the mlxtend docs.
model8 = StackingClassifier(classifiers=[dt_model,rf_model,gb_model],
                           meta_classifier=log_model)
model8.fit(xtrain,ytrain)

In [None]:
# Score the stacked ensemble on the held-out test split
ypred2 = model8.predict(xtest)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytest, ypred2))

In [None]:
# Score the stacked ensemble on its own training rows
ypred2 = model8.predict(xtrain)
for caption, score_fn in (("Accuracy : ", metrics.accuracy_score),
                          ("Recall : ", metrics.recall_score),
                          ("F1 score : ", metrics.f1_score),
                          ("Precision : ", metrics.precision_score)):
    print(caption, score_fn(ytrain, ypred2))