In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the churn dataset. The default is the original path; set the
# CHURN_CSV environment variable to run the notebook on another machine
# without editing this cell.
import os
CHURN_CSV = os.environ.get('CHURN_CSV', 'e:/data/churn.csv')
df = pd.read_csv(CHURN_CSV)

In [None]:
# Assuming we already have done detailed analysis on this data

In [None]:
# Class distribution of the target variable

In [None]:
# Number of rows in each class of the target column 'Exited'.
df['Exited'].value_counts()

In [None]:
# Candidate feature columns, selected by position (columns 2 through 11).
# NOTE(review): positional slicing silently changes meaning if the CSV column
# order changes — confirm these are the intended predictors.
x = df.iloc[:,2:12]

In [None]:
# Target vector: the 'Exited' column.
y = df['Exited']

In [None]:
dummies = pd.get_dummies(data=df, columns=['Geography','Gender'], drop_first=True)

In [None]:
dummies[:2]

In [None]:
dummies.columns

In [None]:
# Build the feature matrix from the positional feature slice of `df` plus the
# encoded indicator columns. Starting from `df` (rather than from the current
# value of `x`) keeps this cell idempotent: re-running it can no longer append
# a second copy of the dummy columns.
x = pd.concat(
    [
        df.iloc[:, 2:12],
        dummies[['Geography_Germany', 'Geography_Spain', 'Gender_Male']],
    ],
    axis=1,
)

In [None]:
x.drop(['Geography','Gender'], axis=1, inplace=True)

In [None]:
x.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
ytrain.value_counts()

In [None]:
6346/8000

In [None]:
1654/2000

In [None]:
# Generalised Linear models

In [None]:
# Logistic regression --> Logit function (sigmoid function) --> odds

In [None]:
# Odds of an event with probability p:  odds = p / (1 - p).
# Shown for a concrete probability so the cell runs on a fresh kernel
# (no variable named `p` exists at this point in the notebook).
p_example = 0.75
p_example / (1 - p_example)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
glm = LogisticRegression(max_iter=10000,solver="liblinear")

In [None]:
lrmodel = glm.fit(xtrain,ytrain)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# ---- Score the test set and trace the ROC curve ----
# Second column of predict_proba: probability of the second class in lrmodel.classes_.
y_prob = lrmodel.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_prob)
roc_auc = auc(fpr, tpr)
print("AUC Score:", roc_auc)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
# Diagonal reference line labelled as random guessing.
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guess')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
lrpred= lrmodel.predict(xtest)

In [None]:
pd.crosstab(ytest,lrpred)

In [None]:
(1591+13)/2000

In [None]:
13/(13+26)

In [None]:
13/(13+370)

In [None]:
lrmodel.predict(xtest[:1])

In [None]:
lrmodel.predict_proba(xtest[:1])

In [None]:
ytest[:1]

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
dtree = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=5, max_leaf_nodes=8, random_state=8)

In [None]:
dtreemodel = dtree.fit(xtrain,ytrain)

In [None]:
# Graphviz
# use dtreeviz, plotly, dtreeplt for advanced visualisation

In [None]:
from dtreeplt import dtreeplt

In [None]:
x.columns

In [None]:
plt.figure(figsize=(20, 10))
plot_tree(
    dtreemodel,
    filled=True,
    feature_names=x.columns,
    class_names=['pos','neg'],
    rounded=True,
    fontsize=10
)
plt.show()

In [None]:
# Hard class predictions of the fitted tree on the test set.
dtree_pred = dtreemodel.predict(xtest)

In [None]:
# Confusion table: actual classes as rows, predicted classes as columns.
pd.crosstab(ytest,dtree_pred)

In [None]:
# NOTE(review): hand-typed ratio. 92 and 24 do not appear among the counts used
# by the labelled tree metrics that follow (146, 63, 237, 1554), so this looks
# like a leftover from a different run — confirm or remove.
92/(92+24)

In [None]:
# Precision --> tp/(tp+fp)
146/(146+63)

In [None]:
# Recall --> tp/(tp+fn)
146/(237+146)

In [None]:
# Accuracy
(146+1554)/2000

In [None]:
# Pruning

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [None]:
from sklearn import tree

In [None]:
# Hyper-parameter grid explored by the cross-validated search.
params = {'max_depth': [4,6,8,10],
         'min_samples_split': [2,5,8],
         'min_samples_leaf': [4,10]}

# Search over a fresh tree. Using the already-configured `dtreemodel` as the
# base estimator would carry its max_leaf_nodes=8 limit into every candidate,
# capping tree size no matter which max_depth is being compared.
# DecisionTreeClassifier is referenced directly so this cell does not depend
# on the name `tree` still pointing at the sklearn.tree module.
clf = DecisionTreeClassifier(random_state=8)
gcv = GridSearchCV(estimator=clf,param_grid=params)
gcv.fit(xtrain,ytrain)

In [None]:
# Per-candidate cross-validation results of the grid search.
gcv.cv_results_

In [None]:
# Parameter combination selected by the search.
gcv.best_params_

In [None]:
# Cross-validated score of the selected combination.
gcv.best_score_

In [None]:
path = dtree.cost_complexity_pruning_path(xtrain,ytrain)

In [None]:
alpha = path['ccp_alphas']

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report

In [None]:
# (removed) `ytrain.pred` raised AttributeError: the training target has no
# `pred` attribute. Training-set predictions are produced as `ytrain_pred`
# inside the pruning loop further down.

In [None]:
alpha

In [None]:
acc_train, acc_test = [],[]

for i in alpha:
    tree = DecisionTreeClassifier(ccp_alpha=i)
    tree.fit(xtrain,ytrain)
    ytrain_pred = tree.predict(xtrain)
    ytest_pred = tree.predict(xtest)
    
    acc_train.append(accuracy_score(ytrain,ytrain_pred))
    acc_test.append(accuracy_score(ytest,ytest_pred))
    acc_train.append(precision_score(ytrain,ytrain_pred))
    acc_test.append(precision_score(ytest,ytest_pred))

In [None]:
# NOTE(review): hand-typed ratio with no visible source for the counts 92 and
# 291 — presumably copied from a confusion table of an earlier run; confirm.
(92)/(92+291)

In [None]:
# Scores collected for the training split by the pruning loop.
acc_train

In [None]:
# Scores collected for the test split by the pruning loop.
acc_test

In [None]:
m1 = DecisionTreeClassifier(ccp_alpha=0.00522287, max_depth=4)
m1.fit(xtrain,ytrain)

In [None]:
p=m1.predict(xtest)

In [None]:
pd.crosstab(ytest,p)

In [None]:
print(classification_report(ytest,p))

In [None]:
precision_score(ytest,p)

In [None]:
# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()

In [None]:
rfmodel = rf.fit(xtrain,ytrain)

In [None]:
rfpred = rfmodel.predict(xtest)

In [None]:
# Importance assigned by the fitted forest to each feature.
rfmodel.feature_importances_

In [None]:
# Feature names, shown to read the importances against.
x.columns

In [None]:
print(classification_report(ytest, rfpred))

In [None]:
# Confusion table: actual classes as rows, predicted classes as columns.
pd.crosstab(ytest,rfpred)

In [None]:
# NOTE(review): hand-typed ratio, presumably built from counts in the
# confusion table; which metric it represents is not stated — confirm and label it.
1556/(1556+216)

In [None]:
# Data Preparation Techniques

# Data Pre-processing : Sampling, Data Transformation --> Standardisation, Normalisation
# Cross-validation Techniques : K-fold, Stratified K-fold, Repeated K-fold
# Feature Selection Techniques : P-value, Step function, K-best, RFE, AUC/ROC
# Handling Class Imbalances : Under Sampling, Over Sampling, SMOTE
# Feature Extraction Techniques/Dimensionality reduction : Curse of Dimensionality : PCA, LDA

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# First five values of the EstimatedSalary column.
df['EstimatedSalary'][:5]

In [None]:
# Manual z-score of a single salary value: (value - mean) / standard deviation.
# NOTE(review): 101348.88 is presumably one of the values displayed by the
# previous cell — confirm. pandas .std() defaults to the sample standard
# deviation (ddof=1) while StandardScaler uses the population one (ddof=0), so
# this will differ very slightly from the StandardScaler output.
(101348.88-df['EstimatedSalary'].mean())/df['EstimatedSalary'].std()

In [None]:
# First row of the feature matrix, before any scaling.
x[:1]

In [None]:
scaler = StandardScaler().fit(x.values)
rescaled = scaler.transform(x.values)

In [None]:
rescaled[:1]

In [None]:
norm = MinMaxScaler().fit_transform(x.values)

In [None]:
norm[:1]

In [None]:
# Cross Validation Techniques

In [None]:
# K-fold cv - 3 fold cv

In [None]:
# 10,000

# 1 fold --> 3500
# 2 fold --> 3250
# 3 fold --> 3250

# 1st model : 1st & 2nd chunk (training) and predicting on the 3rd chunk      --> 85%

# 2nd model : 1st & 3rd chunk (training) and predicting on the 2nd chunk      --> 75%

# 3rd model : 2nd & 3rd chunk (training) and predicting on the 1st chunk      --> 80%

# The cross-validation score is the average of the three models' accuracies: (85 + 75 + 80) / 3 = 80%

In [None]:
# Feature Selection Techniques

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE

In [None]:
lr = LogisticRegression(max_iter=10000,solver="liblinear")

In [None]:
rfe_lr = RFE(estimator=lr, n_features_to_select=5)
result = rfe_lr.fit(xtrain,ytrain)

In [None]:
result.ranking_

In [None]:
xtrain.columns

In [None]:
result.n_features_in_

In [None]:
result.support_

In [None]:
# Chi2 with Kbest

In [None]:
model = SelectKBest(score_func=chi2,k=5)
result_kbest = model.fit(xtrain,ytrain)

In [None]:
result_kbest.scores_

In [None]:
for i in result_kbest.scores_:
    print(round(i,2))

In [None]:
result_kbest.get_support()

In [None]:
# SMOTE: oversampling of the training split to handle class imbalance.
# (Written as a comment: the bare name was evaluated before its import and
# raised NameError on a fresh kernel.)

In [None]:
# First six rows where the target column 'Exited' equals 1.
df.loc[df['Exited'] == 1].head(6)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
s = SMOTE()

In [None]:
x_train_smote,y_train_smote = s.fit_resample(xtrain,ytrain)

In [None]:
from collections import Counter

In [None]:
print("before SMOTE:", Counter(ytrain))
print("After SMOTE:", Counter(y_train_smote))

In [None]:
lr.fit(x_train_smote,y_train_smote)

In [None]:
pred_glm = lr.predict(xtest)

In [None]:
pd.crosstab(ytest,pred_glm)

In [None]:
232/(510+510)

In [None]:
232/(232+151)

In [None]:
print(classification_report(ytest,pred_glm))

In [None]:
# Evaluation of Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [None]:
models = []

In [None]:
models.append(("LR", LogisticRegression()))
models.append(("Tree", DecisionTreeClassifier(max_depth=4, min_samples_leaf=2, min_samples_split=2, ccp_alpha=0.00522287)))
models.append(("SVM", SVC(kernel='sigmoid')))
models.append(("RF", RandomForestClassifier()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("NB", GaussianNB()))
models.append(("XGB", XGBClassifier()))

In [None]:
models

In [None]:
# Accumulators filled by the model-comparison loop: one array of
# cross-validation scores per model, and the matching model labels.
results = []
names = []

In [None]:
from sklearn.model_selection import KFold , cross_val_score, GridSearchCV

In [None]:
import warnings
# Suppresses every warning from this point on.
# NOTE(review): this also hides any warnings raised while the models are
# cross-validated — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Start from empty accumulators so re-running this cell does not append a
# second copy of every model's scores (which would also leave more boxes than
# labels in the box plot).
results, names = [], []

# The splitter does not depend on the model: build it once, outside the loop.
# NOTE(review): KFold without shuffling does not preserve class proportions in
# each fold; consider StratifiedKFold for this classification target.
kfold = KFold(n_splits=3)

for name, model in models:
    # Cross-validated precision of this model on the training split.
    cv_results = cross_val_score(model, xtrain, ytrain, cv=kfold, scoring="precision")
    results.append(cv_results)
    names.append(name)

    # Mean and standard deviation across folds, both as percentages.
    outcome = "%s: %f (%f)" % (name, cv_results.mean()*100, cv_results.std()*100)
    print(outcome)

In [None]:
results

In [None]:
fig = plt.figure()
axis = fig.add_subplot(111)
plt.boxplot(results)
axis.set_xticklabels(names)
plt.show()

In [None]:
# Principal Component Analysis (PCA)

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def plot_decision_regions(model, X_set, y_set, title,
                          colors=('red', 'green', 'blue'), step=0.01):
    """Plot a fitted classifier's decision regions over two features.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` for two-column input.
    X_set : 2-D array; column 0 is drawn on the x axis, column 1 on the y axis.
    y_set : 1-D array of class labels aligned with the rows of ``X_set``.
    title : str, figure title.
    colors : colour per class, in the order returned by ``np.unique(y_set)``.
    step : grid resolution used to evaluate the classifier.
    """
    cmap = ListedColormap(colors)
    # Grid covering the data range with a margin of 1 on every side.
    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=step),
        np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=step),
    )
    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    plt.contourf(X1, X2, model.predict(grid_points).reshape(X1.shape),
                 alpha=0.75, cmap=cmap)
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        # `color=` (not `c=`): a single RGBA tuple passed as `c` is ambiguous
        # with a sequence of values to be colour-mapped.
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                    color=cmap(i), label=j)
    plt.title(title)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend()
    plt.show()


# Importing the dataset: every column but the last is a feature, the last is the label.
# NOTE: this rebinds the notebook-level name `y`; re-run the earlier target
# cell before re-running any earlier cell that uses `y`.
dataset = pd.read_csv('Wine.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
# Fit the scaler on the training rows only, then apply it to the test rows, so
# that test-set statistics do not leak into the model.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Applying PCA (fitted on the training rows, applied to both splits)
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

# Training the Logistic Regression model on the Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualising the Training set results
plot_decision_regions(classifier, X_train, y_train, 'Logistic Regression (Training set)')

# Visualising the Test set results
plot_decision_regions(classifier, X_test, y_test, 'Logistic Regression (Test set)')