# Standard Imports of data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading base libraries

In [None]:
import pandas as pd
import numpy as np
import pylab as py
import scipy.optimize as opt
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [None]:
# Load the heart-failure clinical records CSV into a pandas DataFrame.
df=pd.read_csv("../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

In [None]:
# We use several commands to check the loaded data set.
# Show the first 5 rows to get a first look at the data.
df.head()

In [None]:
# Show the column names of the data set.
df.columns

In [None]:
# Show, for every column, its name, its data type and the number of
# non-null (not missing) values.
df.info()

In [None]:
# For every column print: column name, number of unique values, total number
# of rows. A column whose unique-value count is close to the row count carries
# little grouping information and would be a candidate for dropping.
# This data set is of good quality and contains many binary (dummy) variables,
# so it is left unchanged for now.
# (The summary from df.info() is already shown by the previous cell, so it is
# not repeated here.)
for col in df.columns:
    print(col, df[col].nunique(), len(df))

In [None]:
# Look for data-entry mistakes by listing the distinct values of every column
# next to the total row count.
# On inspection, all values look plausible and no mistakes were found.
n_rows = len(df)
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values, n_rows)

In [None]:
# Next, collect descriptive statistics for every column
# (include='all' asks describe() to cover all columns, not only numeric ones).
statistic = df.describe(include='all')
print(statistic)
# The printed table shows that the variables live on very different scales.
# Based on this, the data should later be normalized or standardized
# (optional) so that the analysis is more accurate.

# Pearson correlation

In [None]:
# Next, we compute the Pearson correlation between every pair of variables and
# display the result as a correlation matrix. Correlation analysis helps us
# determine whether variables are strongly related to each other. Strongly
# correlated independent variables (multicollinearity) would distort the
# analysis and make its results unreliable. As the matrix below shows,
# however, none of the variables is strongly correlated with another.

In [None]:
import seaborn as sns
# Pearson correlation of all columns, drawn on a fixed [-1, 1] colour scale
# centred at 0 so positive and negative correlations get opposite hues.
corr = df.corr(method='pearson')
diverging_cmap = sns.diverging_palette(10, 200, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=diverging_cmap, square=True)
# Tilt the x tick labels so the long column names do not overlap.
x_labels = ax.get_xticklabels()
ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right');

# Data visualization before analysis

Plot each binary independent variable against the dependent variable (DEATH_EVENT).

In [None]:
# One count plot per binary risk factor, each split by outcome (DEATH_EVENT),
# arranged on a 2x2 grid. The column/palette pairs are listed once and drawn
# in a loop instead of repeating the same call four times.
fig, ax = plt.subplots(2, 2, figsize=(32, 32))
binary_panels = [
    ('anaemia', 'viridis'),
    ('diabetes', 'Set1_r'),
    ('high_blood_pressure', 'gist_ncar_r'),
    ('smoking', 'autumn_r'),
]
for panel_ax, (column, palette_name) in zip(ax.flatten(), binary_panels):
    sns.countplot(data=df, x=column, hue='DEATH_EVENT', palette=palette_name, ax=panel_ax)

Plot the other independent variables against the dependent variable (DEATH_EVENT).

In [None]:
# Number of patients at each distinct age, split by outcome (DEATH_EVENT).
age_fig, age_ax = plt.subplots(figsize=(32, 16))
sns.countplot(data=df, x='age', hue='DEATH_EVENT', palette='gist_rainbow', ax=age_ax)

In [None]:
# Number of patients at each distinct serum_creatinine value, split by outcome.
creatinine_fig, creatinine_ax = plt.subplots(figsize=(32, 16))
sns.countplot(data=df, x='serum_creatinine', hue='DEATH_EVENT', palette='YlGnBu', ax=creatinine_ax)

In [None]:
# Number of patients at each distinct serum_sodium value, split by outcome.
sodium_fig, sodium_ax = plt.subplots(figsize=(32, 16))
sns.countplot(data=df, x='serum_sodium', hue='DEATH_EVENT', palette='Oranges_r', ax=sodium_ax)

In [None]:
# Number of patients per value of the 'sex' column, split by outcome.
sex_fig, sex_ax = plt.subplots(figsize=(16, 4))
sns.countplot(data=df, x='sex', hue='DEATH_EVENT', palette='Set2_r', ax=sex_ax)

Make a visualisation of 'age' variable in the same plot with the dependent variable.

In [None]:
# Four views of 'age': its overall distribution, then box / point / violin
# plots of age grouped by outcome (DEATH_EVENT).
fig, ax = plt.subplots(2, 2, figsize=(16, 16))
hist_ax, box_ax, point_ax, violin_ax = ax.flatten()
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm
# the installed version before upgrading (histplot/displot are the successors).
sns.distplot(df['age'], bins=20, color='r', ax=hist_ax)
age_by_outcome = dict(y='age', x='DEATH_EVENT', data=df)
sns.boxplot(ax=box_ax, **age_by_outcome)
sns.pointplot(ax=point_ax, **age_by_outcome)
sns.violinplot(ax=violin_ax, **age_by_outcome)

In [None]:
# Split the patients by the 'sex' column (this notebook treats 1 as male and
# 0 as female).
male = df[df["sex"]==1]
female = df[df["sex"]==0]

# Split each group by outcome. The boolean mask is built from the subset being
# filtered (not from the full `df`), so mask and frame share the same index and
# pandas does not have to reindex the mask (which raises a UserWarning).
male_survi = male[male["DEATH_EVENT"]==0]
male_not = male[male["DEATH_EVENT"]==1]
female_survi = female[female["DEATH_EVENT"]==0]
female_not = female[female["DEATH_EVENT"]==1]

labels = ['Male - Survived','Male - Not Survived', "Female -  Survived", "Female - Not Survived"]
# Reuse the subsets computed above instead of filtering a second time.
values = [len(male_survi), len(male_not), len(female_survi), len(female_not)]
# Donut chart (hole=.3) of the four group sizes.
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(
    title_text="Analysis on Survival - Gender")
fig.show()

# Split data to test and train

In [None]:
# We need to predict the chance of DEATH_EVENT => for that goal we have to use
# logistic regression. Types of Logistic Regression:
# 1. Binary Logistic Regression: The target variable has 
# only two possible outcomes.
# 2. Multinomial Logistic Regression: The target variable has three or more 
# nominal categories.
# 3. Ordinal Logistic Regression: the target variable has three or 
# more ordinal categories. 
# Hence our regression method is obviously Binary Logistic Regression.

In [None]:
# Separate the columns into the independent variables (features) and the
# dependent variable we want to predict (target).
feature_cols = [
    'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
    'ejection_fraction', 'high_blood_pressure', 'platelets',
    'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
]
X = df.loc[:, feature_cols]  # Features
y = df['DEATH_EVENT']        # Target variable

In [None]:
# Understand model performance, dividing the dataset into
# a training set and a test set
# split X and y into training and testing sets.
# Here, the Dataset is broken into two parts in a ratio of 80:20.
# It means 80% data will be used for model training 
# and 20% for model testing.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=0)

Making a correlation between X train (independent) variables.

In [None]:
# Pairwise correlation between the training features, annotated with values.
cor = X_train.corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cor, cmap='Set1', annot=True, ax=corr_ax)

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the per-feature mean and standard deviation from the training set only.
X_train_st = scaler.fit_transform(X_train)
# Apply those same training statistics to the test set. Calling fit_transform
# here would refit the scaler on the test data, leaking test-set information
# and putting train and test on different scales.
X_test_st = scaler.transform(X_test)

In [None]:
# Display the standardized training matrix.
X_train_st

In [None]:
# Display the standardized test matrix.
X_test_st

# Logistic Regression

In [None]:
# First, import the Logistic Regression module and 
# create a Logistic Regression classifier object using
# LogisticRegression() function.
# Then, fit our model on the train set using fit() and
# perform prediction on the test set using predict().

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with default settings, trained on the
# standardized training set; then predict the class of every test row.
logreg = LogisticRegression(random_state=0).fit(X_train_st, y_train)
y_pred = logreg.predict(X_test_st)
print(y_pred)

In [None]:
# Display the fitted coefficients of the baseline logistic regression.
logreg.coef_

In [None]:
# A confusion matrix is a table for evaluating a classification model: it
# counts, class by class, how many predictions were correct and how many
# were wrong. It can also be visualized (see the heatmap cell that follows).
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Draw the 2x2 confusion matrix as an annotated heatmap.
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
# Label rows and columns with the class names through the DataFrame itself:
# seaborn places its own ticks at the cell centres, so tick positions set with
# plt.xticks/plt.yticks before the heatmap call would simply be overwritten.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');

In [None]:
# Confusion-matrix based evaluation metrics for the baseline logistic
# regression: accuracy, precision and recall on the test set.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# ROC curve:
# The Receiver Operating Characteristic (ROC) curve plots the true positive
# rate against the false positive rate, showing the trade-off between
# sensitivity and specificity.
# Column 1 of predict_proba is the predicted probability of class 1.
y_pred_proba = logreg.predict_proba(X_test_st)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc_logreg = metrics.roc_auc_score(y_test, y_pred_proba)
curve_label = "data 1, auc=" + str(auc_logreg)
plt.plot(fpr, tpr, label=curve_label)
plt.legend(loc=4)
plt.show()

In [None]:
# ROC Curve: 0.857
# AUC score for the case is 0.857. AUC score 1 represents perfect classifier, 
# and 0.5 represents a worthless classifier.

# Random Forest Classifier

In [None]:
# Print the built-in documentation of RandomForestClassifier.
# NOTE(review): exploratory leftover that floods the output; consider removing.
help(RandomForestClassifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with depth-limited trees (max_depth=10) and max_features=0.5,
# trained on the standardized training set; then predict the test classes.
forest_params = dict(max_features=0.5, max_depth=10, random_state=0)
r_clf = RandomForestClassifier(**forest_params).fit(X_train_st, y_train)
r_pred = r_clf.predict(X_test_st)

In [None]:
# Show the max_depth setting the forest was built with.
print(r_clf.max_depth)

In [None]:
# Show the max_features setting the forest was built with.
print(r_clf.max_features)

In [None]:
# Accuracy, precision and recall of the random forest on the test set.
rf_scores = {
    "Accuracy:": metrics.accuracy_score(y_test, r_pred),
    "Precision:": metrics.precision_score(y_test, r_pred),
    "Recall:": metrics.recall_score(y_test, r_pred),
}
for score_name, score_value in rf_scores.items():
    print(score_name, score_value)

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
# Confusion matrix of the random-forest predictions on the test set.
cnf_matrix = metrics.confusion_matrix(y_test, r_pred)
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
# Label rows and columns with the class names through the DataFrame itself:
# seaborn places its own ticks at the cell centres, so tick positions set with
# plt.xticks/plt.yticks before the heatmap call would simply be overwritten.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');

In [None]:
# ROC curve and AUC of the random forest, scored with the predicted
# probability of class 1 (column 1 of predict_proba).
y_pred_proba_r_clf = r_clf.predict_proba(X_test_st)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_r_clf)
auc_r_clf = metrics.roc_auc_score(y_test, y_pred_proba_r_clf)
rf_curve_label = "data 1, auc=" + str(auc_r_clf)
plt.plot(fpr, tpr, label=rf_curve_label)
plt.legend(loc=4)
plt.show()

# Decision Tree Classifier

In [None]:
# Print the built-in documentation of DecisionTreeClassifier.
# NOTE(review): exploratory leftover that floods the output; consider removing.
help(DecisionTreeClassifier)

In [None]:
# Entropy-based decision tree limited to depth 10, trained on the
# standardized training set and evaluated on the test set.
decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
decision_tree.fit(X_train_st, y_train)
d_pred = decision_tree.predict(X_test_st)
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, d_pred))

In [None]:
# Depth actually reached by the fitted tree (read from the fitted tree_ object).
print(decision_tree.tree_.max_depth)

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
# Confusion matrix of the decision-tree predictions on the test set.
cnf_matrix = metrics.confusion_matrix(y_test, d_pred)
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
# Label rows and columns with the class names through the DataFrame itself:
# seaborn places its own ticks at the cell centres, so tick positions set with
# plt.xticks/plt.yticks before the heatmap call would simply be overwritten.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');

In [None]:
# ROC curve and AUC of the decision tree, scored with the predicted
# probability of class 1 (column 1 of predict_proba).
y_pred_proba_decision_tree = decision_tree.predict_proba(X_test_st)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_decision_tree)
auc_decision_tree = metrics.roc_auc_score(y_test, y_pred_proba_decision_tree)
tree_curve_label = "data 1, auc=" + str(auc_decision_tree)
plt.plot(fpr, tpr, label=tree_curve_label)
plt.legend(loc=4)
plt.show()

# Visualizing the Decision Tree

In [None]:
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw
import graphviz  
from sklearn.tree import export_graphviz

# Export our trained model as a .dot file.
# export_graphviz writes into the open handle; its return value is not
# assigned, so the file handle is not rebound inside the with-block.
with open("tree1.dot", 'w') as dot_file:
    export_graphviz(decision_tree, out_file=dot_file, max_depth = 10,
                    impurity = True, feature_names = list(X_train.columns),
                    rounded = True, filled= True )
# Convert .dot to .png to allow display in the web notebook
# (this shells out to the Graphviz `dot` executable).
check_call(['dot','-Tpng','tree1.dot','-o','tree.png'])
# Re-save the rendered image under the file name displayed below; the context
# manager closes the source image once it has been written.
with Image.open("tree.png") as img:
    img.save('sample-out.png')
PImage("sample-out.png")

# Validation

In [None]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold
# Tune the C parameter of a class-weight-balanced logistic regression with
# 10-fold cross-validation on the training set, scored by ROC AUC.
logreg5 = LogisticRegression(class_weight='balanced')
# Each candidate is listed once (the grid previously contained 3 twice, which
# only made the search fit the same configuration a second time).
param = {'C':[0.001,0.003,0.005,0.01,0.03,0.05,0.08, 0.1, 0.3,0.5,1,2,3,4,5,10,20]}
clf = GridSearchCV(logreg5,param,scoring='roc_auc',refit=True,cv=10)
clf.fit(X_train_st,y_train)
print('Best roc_auc: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_))

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression refitted with C=0.1.
# NOTE(review): C is hard-coded here, presumably copied from the grid-search
# output, and class_weight is left at its default unlike the searched
# estimator — confirm this is intended.
logreg2 = LogisticRegression(C=0.1).fit(X_train_st, y_train)
y_pred2 = logreg2.predict(X_test_st)
print(y_pred2)

In [None]:
from sklearn import metrics
# Confusion matrix of the C=0.1 logistic regression on the test set.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred2)
cnf_matrix

In [None]:
import matplotlib.pyplot as plt
# Draw the current confusion matrix as an annotated 2x2 heatmap.
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
# Label rows and columns with the class names through the DataFrame itself:
# seaborn places its own ticks at the cell centres, so tick positions set with
# plt.xticks/plt.yticks before the heatmap call would simply be overwritten.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');

In [None]:
# Accuracy, precision and recall of the C=0.1 logistic regression on the test set.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred2))

In [None]:
# ROC curve and AUC of the C=0.1 logistic regression, scored with the
# predicted probability of class 1 (column 1 of predict_proba).
y_pred_proba_logreg2 = logreg2.predict_proba(X_test_st)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_logreg2)
auc_logreg2 = metrics.roc_auc_score(y_test, y_pred_proba_logreg2)
logreg2_curve_label = "data 1, auc=" + str(auc_logreg2)
plt.plot(fpr, tpr, label=logreg2_curve_label)
plt.legend(loc=4)
plt.show()

# L1 Regularization/Lasso regression

In [None]:
#Since it provides sparse solutions, it is generally the model of choice (or some variant of this concept) for modelling cases where the 
#features are in millions or more. In such a case, getting a sparse solution is of great computational advantage as the features 
#with zero coefficients can simply be ignored.
#It arbitrarily selects any one feature among the highly correlated ones and reduced the coefficients of the rest to zero. 
#Also, the chosen variable changes randomly with change in model parameters. This generally doesn’t work that well as compared to ridge regression.

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L1 penalty (fitted with the 'saga' solver),
# trained on the standardized training set; then predict the test classes.
l1_params = dict(random_state=0, penalty='l1', solver='saga')
log_l1 = LogisticRegression(**l1_params).fit(X_train_st, y_train)
y_pred_l1 = log_l1.predict(X_test_st)
print(y_pred_l1)

In [None]:
from sklearn import metrics
# Confusion matrix of the L1-penalised logistic regression on the test set.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_l1)
cnf_matrix

In [None]:
# Draw the current confusion matrix as an annotated 2x2 heatmap.
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
# Label rows and columns with the class names through the DataFrame itself:
# seaborn places its own ticks at the cell centres, so tick positions set with
# plt.xticks/plt.yticks before the heatmap call would simply be overwritten.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');

In [None]:
# Accuracy, precision and recall of the L1-penalised model on the test set.
l1_scores = {
    "Accuracy:": metrics.accuracy_score(y_test, y_pred_l1),
    "Precision:": metrics.precision_score(y_test, y_pred_l1),
    "Recall:": metrics.recall_score(y_test, y_pred_l1),
}
for score_name, score_value in l1_scores.items():
    print(score_name, score_value)

In [None]:
# ROC curve and AUC of the L1-penalised logistic regression, scored with the
# predicted probability of class 1 (column 1 of predict_proba).
y_pred_proba_log_l1 = log_l1.predict_proba(X_test_st)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_log_l1)
auc_log_l1 = metrics.roc_auc_score(y_test, y_pred_proba_log_l1)
l1_curve_label = "data 1, auc=" + str(auc_log_l1)
plt.plot(fpr, tpr, label=l1_curve_label)
plt.legend(loc=4)
plt.show()

In [None]:
# Display the fitted coefficients of the L1-penalised logistic regression.
log_l1.coef_

# L2 Regularization/Ridge regression

In [None]:
#Ridge: It is majorly used to prevent overfitting. 
#Since it includes all the features, it is not very useful when the number of features is exorbitantly high, say in the millions, as it will pose computational challenges.
#It generally works well even in presence of highly correlated features as it will include all of them in the model
#but the coefficients will be distributed among them depending on the correlation.

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an L2 penalty (fitted with the 'saga' solver),
# trained on the standardized training set; then predict the test classes.
l2_params = dict(random_state=0, penalty='l2', solver='saga')
log_l2 = LogisticRegression(**l2_params).fit(X_train_st, y_train)
y_pred_l2 = log_l2.predict(X_test_st)
print(y_pred_l2)

In [None]:
from sklearn import metrics
# Confusion matrix of the L2-penalised logistic regression on the test set.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_l2)
cnf_matrix

In [None]:
# Draw the current confusion matrix as an annotated 2x2 heatmap.
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
# Label rows and columns with the class names through the DataFrame itself:
# seaborn places its own ticks at the cell centres, so tick positions set with
# plt.xticks/plt.yticks before the heatmap call would simply be overwritten.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label');

In [None]:
# Accuracy, precision and recall of the L2-penalised model on the test set.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred_l2))

In [None]:
# ROC curve and AUC of the L2-penalised logistic regression, scored with the
# predicted probability of class 1 (column 1 of predict_proba).
y_pred_proba_log_l2 = log_l2.predict_proba(X_test_st)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_log_l2)
auc_log_l2 = metrics.roc_auc_score(y_test, y_pred_proba_log_l2)
l2_curve_label = "data 1, auc=" + str(auc_log_l2)
plt.plot(fpr, tpr, label=l2_curve_label)
plt.legend(loc=4)
plt.show()

In [None]:
# Display the fitted coefficients of the L2-penalised logistic regression.
log_l2.coef_

Lasso regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn import linear_model

# Fit Lasso at three penalty strengths and compare the surviving coefficients
# with those of a plain logistic regression.
# NOTE: Lasso is a regressor, so its .score() is R^2 on the 0/1 target, while
# LogisticRegression.score() is classification accuracy; the printed numbers
# are therefore not directly comparable across the two model types.

# Lasso with the default penalty strength (alpha = 1).
lasso = Lasso()
lasso.fit(X_train_st,y_train)
train_score=lasso.score(X_train_st,y_train)
test_score=lasso.score(X_test_st,y_test)
coeff_used = np.sum(lasso.coef_!=0)  # number of non-zero coefficients

print ("Training score:", train_score) 
print ("Test score:", test_score)
print ("Number of features used:", coeff_used)
print ("____________________________________________________________________________")
print()


# max_iter is passed as an int: scikit-learn validates it as an integer and
# recent releases reject the float 10e5.
lasso001 = Lasso(alpha=0.01, max_iter=1000000)
lasso001.fit(X_train_st,y_train)
train_score001=lasso001.score(X_train_st,y_train)
test_score001=lasso001.score(X_test_st,y_test)
coeff_used001 = np.sum(lasso001.coef_!=0)

print ("Training score for alpha = 0.01:", train_score001)
print ("Test score for alpha = 0.01:", test_score001)
print ("Number of features used: for alpha = 0.01:", coeff_used001)
print ("____________________________________________________________________________")
print()


lasso00001 = Lasso(alpha=0.0001, max_iter=1000000)
lasso00001.fit(X_train_st,y_train)
train_score00001=lasso00001.score(X_train_st,y_train)
test_score00001=lasso00001.score(X_test_st,y_test)
coeff_used00001 = np.sum(lasso00001.coef_!=0)

print ("Training score for alpha = 0.0001:", train_score00001)
print ("Test score for alpha = 0.0001:", test_score00001)
print ("Number of features used: for alpha = 0.0001:", coeff_used00001)
print ("____________________________________________________________________________")
print()


# Reference model: logistic regression, fitted once and reused for both the
# printed scores and the coefficient plot.
lr = LogisticRegression(random_state=0)
lr.fit(X_train_st,y_train)
lr_train_score=lr.score(X_train_st,y_train)
lr_test_score=lr.score(X_test_st,y_test)
print ("Logistic Regression training score:", lr_train_score)
print ("Logistic Regression test score:", lr_test_score)
print ("____________________________________________________________________________")
print()

# plot
# Left panel: the two stronger penalties only.
plt.subplot(1,2,1)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=6,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=7,color='blue',label=r'Lasso; $\alpha = 0.01$')

plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
# Right panel: all three Lasso fits plus the logistic-regression coefficients.
plt.subplot(1,2,2)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=6,color='red',label=r'Lasso; $\alpha = 1$',zorder=7)
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=7,color='blue',label=r'Lasso; $\alpha = 0.01$')
plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=7,color='black',label=r'Lasso; $\alpha = 0.0001$')
# lr.coef_ is 2-D (one row per fitted class boundary); flatten it so each
# coefficient is drawn at its own index instead of all of them at x = 0.
plt.plot(lr.coef_.ravel(),alpha=0.7,linestyle='none',marker='o',markersize=6,color='green',label='Logistic Regression',zorder=2)
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=6,loc=4)
plt.tight_layout()
plt.show()

# Comparison of different Classifier Models

In [None]:
# Container for one score per model; despite the name it is filled with the
# ROC AUC values (the auc_* variables) in the next cell.
accuracy_list = []

In [None]:
# Test-set ROC AUC of every model, in the same order as the model names used
# for the bar plot. The list is rebuilt in a single assignment (instead of
# appended to) so re-running this cell cannot grow it past six entries and
# break the plot.
accuracy_list = [
    auc_logreg,
    auc_r_clf,
    auc_decision_tree,
    auc_logreg2,
    auc_log_l1,
    auc_log_l2,
]

In [None]:
# Display names of the six models, in the order their scores are stored in accuracy_list.
model_list=['Logistic Regression','Random Forest Classifier','Decision Tree Classifier','LR validation','LR L1','LR L2']

In [None]:
# Bar chart comparing the test-set ROC AUC of all models.
plt.rcParams['figure.figsize']=20,8
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=accuracy_list, palette = "husl", saturation =2)
plt.xlabel('Classifier Models', fontsize = 20 )
# The plotted values are areas under the ROC curve, so label them as such.
plt.ylabel('ROC AUC', fontsize = 20)
plt.title('ROC AUC of different Classifier Models', fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 12)
# Write each bar's value just above it. The loop uses its own variable names
# so it does not overwrite the notebook-level target series `y`.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{round(bar_height,4)}', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

In [None]:
# Container for the test-set accuracy of each model (filled in the next cell).
accuracy_list2 = []

In [None]:
# Test-set accuracy of every model, in the same order as the model names used
# for the bar plot. Built in one assignment so re-running this cell cannot
# append duplicates and grow the list past six entries.
accuracy_list2 = [
    metrics.accuracy_score(y_test, model_pred)
    for model_pred in (y_pred, r_pred, d_pred, y_pred2, y_pred_l1, y_pred_l2)
]

In [None]:
# Display names of the six models, in the order their accuracies are stored in accuracy_list2.
model_list2=['Logistic Regression','Random Forest Classifier','Decision Tree Classifier','LR validation','LR L1','LR L2']

In [None]:
# Bar chart comparing the test-set accuracy of all models.
plt.rcParams['figure.figsize']=20,8
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list2, y=accuracy_list2, palette = "husl", saturation = 2)
plt.xlabel('Classifier Models', fontsize = 20 )
plt.ylabel('Value of accuracy', fontsize = 20)
plt.title('Accuracy value of different Classifier Models', fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 12)
# Write each bar's value just above it. The loop uses its own variable names
# so it does not overwrite the notebook-level target series `y`.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{round(bar_height,4)}', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

In [None]:
# Container for the coefficient arrays of the logistic-regression variants.
my_list_coef = []

In [None]:
# Coefficient arrays of the four logistic-regression variants. Built in one
# assignment so re-running this cell cannot append the same arrays again.
my_list_coef = [logreg.coef_, logreg2.coef_, log_l1.coef_, log_l2.coef_]

In [None]:
# Put a descriptive title in front of each coefficient array:
# [title, coef, title, coef, ...].
coef_titles = [
    'Coef for Logistic Regression',
    'Coef for Logistic Regression with validation',
    'Coef for Logistic Regression with L1 penalty',
    'Coef for Logistic Regression with L2 penalty',
]
# Drop any titles left over from an earlier run of this cell before
# interleaving again, so executing the cell twice does not stack duplicates.
coef_arrays = [item for item in my_list_coef if not isinstance(item, str)]
my_list_coef[:] = [entry for pair in zip(coef_titles, coef_arrays) for entry in pair]

In [None]:
# Display the titles and coefficient arrays collected above.
my_list_coef

In [None]:
import itertools

# Print "<title> = [<coefficients>]" for each logistic-regression variant,
# ordered alphabetically by title.
column_names = [
    'Coef for Logistic Regression',
    'Coef for Logistic Regression with validation',
    'Coef for Logistic Regression with L1 penalty',
    'Coef for Logistic Regression with L2 penalty',
]
values = [logreg.coef_, logreg2.coef_, log_l1.coef_, log_l2.coef_]

# Every title is unique, so each title pairs with exactly one array.
titled_coefs = sorted(zip(column_names, values), key=lambda pair: pair[0])
for title, coef in titled_coefs:
    print("{} = {}".format(title, [coef]))