In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Absolute location of the heart.csv dataset (a folder under /content/drive/MyDrive).
# NOTE(review): hard-coded absolute path — presumably requires Google Drive to be
# mounted in Colab before this cell runs; confirm, or make the folder configurable.
path = "/content/drive/MyDrive/Data Science/Project-41 Heart Attack Risk Prediction Using Eval ML (Auto ML)/heart.csv"
# Load the CSV into the DataFrame that the later cells operate on.
df = pd.read_csv(path)

In [None]:
# First rows of the frame.
df.head()

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Column dtypes and non-null counts (checks datatypes and missing values).
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set_style('whitegrid')

In [None]:
# Annotated correlation heatmap across the columns of df, drawn on an explicit axes.
heat_fig, heat_ax = plt.subplots(figsize=(14, 8), dpi=150)
sns.heatmap(df.corr(), annot=True, cmap='viridis', linewidths=0.5, ax=heat_ax)
plt.show()

In [None]:
# Number of rows for each distinct value of 'age'.
age_fig, age_ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='age', ax=age_ax)
plt.show()

In [None]:
# Counts of each 'sex' value within each 'output' class.
df.groupby('output')['sex'].value_counts()

In [None]:
# The same breakdown as bars: one group per 'sex' value, coloured by 'output'.
sns.countplot(data=df, x='sex', hue='output')
plt.show()

In [None]:
# Counts of each 'exng' value within each 'output' class.
df.groupby('output')['exng'].value_counts()

In [None]:
# The same breakdown as bars: one group per 'exng' value, coloured by 'output'.
sns.countplot(data=df, x='exng', hue='output')
plt.show()

In [None]:
# Counts of each 'cp' value within each 'output' class.
df.groupby('output')['cp'].value_counts()

In [None]:
# Bars per 'cp' value, coloured by 'output' (columns referenced by name via data=).
sns.countplot(data=df, x='cp', hue='output')
plt.show()

In [None]:
# Counts of each 'restecg' value within each 'output' class.
df.groupby('output')['restecg'].value_counts()

In [None]:
# Bars per 'restecg' value, coloured by 'output' (columns referenced by name via data=).
sns.countplot(data=df, x='restecg', hue='output')
plt.show()

In [None]:
# sns.pairplot(df, hue='output')

## Distributions of the continuous variables

In [None]:
# Side-by-side histograms (with KDE overlay) of 'trtbps' and 'thalachh'.
hist_fig, (bp_ax, hr_ax) = plt.subplots(1, 2, figsize=(14, 6))

sns.histplot(df['trtbps'], kde=True, color='magenta', ax=bp_ax)
bp_ax.set_xlabel('Resting blood pressure(mmgh)')

sns.histplot(df['thalachh'], kde=True, color='teal', ax=hr_ax)
hr_ax.set_xlabel('Maximun heart Rate Achived (bpm)')

plt.show()

In [None]:
# Histogram (with KDE overlay) of the 'chol' column.
chol_fig, chol_ax = plt.subplots(figsize=(14, 6))
sns.histplot(df['chol'], kde=True, color='magenta', ax=chol_ax)
chol_ax.set_xlabel('blood cholestrol')
plt.show()

In [None]:
# Show the first rows again before building the model inputs.
df.head()

## Splitting the data & standardisation

In [None]:
# Separate the independent variables from the dependent variable 'output'.
x = df.drop(columns=['output'])   # every column except the label
y = df['output'].values           # label values as an array

In [None]:
# standardisation data
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [None]:
# splitting in training and testing datasets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, 
                                                    random_state=44)

## Building the models and evaluation

In [None]:
# importing models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Classifier classes (not instances) that are fitted and compared below.
all_models = [
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    SVC,
]

In [None]:
def getModel_Score(x, **model_kwargs):
  """Fit one model on the training split and print its train/test scores.

  Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from the notebook's
  global namespace, so the split cell must have been run first.

  Parameters
  ----------
  x : callable
      Estimator class (or any factory) to instantiate, e.g.
      ``LogisticRegression``. Inside this function the name shadows the
      notebook-level feature matrix ``x``; it is kept for existing callers.
  **model_kwargs
      Optional keyword arguments forwarded to ``x(...)``, for example
      ``random_state=0`` to make a stochastic model reproducible. With no
      keyword arguments the model is built with its defaults, as before.

  Returns
  -------
  The fitted model instance.
  """
  model = x(**model_kwargs).fit(x_train, y_train)
  print(type(model).__name__)
  # score() is the model's own metric, evaluated on each split in turn.
  print('The Training Score is', model.score(x_train, y_train))
  print('The Testing Score is', model.score(x_test, y_test))
  return model

In [None]:
# Fitted models keyed by their class name, filled in by the loop below.
model_dict = {}

for i in all_models:
  # Fit this class on the training split (scores are printed by getModel_Score).
  model = getModel_Score(i)
  model_dict[type(model).__name__] = model
  print('-----'*20)  # separator line between the models' printed scores

In [None]:
# k-near
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Misclassification rate on the test split for k = 1 .. 99 neighbours.
# NOTE(review): k is compared on the same test split that later reports the
# final score, so that score is presumably optimistic — consider choosing k
# with cross-validation on the training split instead.
error_rate = []
for i in range(1, 100):
  knn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
  pred = knn.predict(x_test)
  error_rate.append(np.mean(pred != y_test))  # fraction of wrong predictions

In [None]:
# Error rate against the number of neighbours, using the values in error_rate.
k_fig, k_ax = plt.subplots(figsize=(12, 6))
k_ax.plot(
    range(1, 100), error_rate,
    color='blue', linestyle='dashed',
    marker='o', markerfacecolor='red', markersize=10,
)
k_ax.set_xlabel('k near')
k_ax.set_ylabel('Error rate')
k_ax.set_title('To check the correct values of k')
plt.show()

In [None]:
# k-nearest neighbours with k fixed at 10.
# NOTE(review): the 10 is hard-coded — presumably read off the error-rate plot; confirm.

knn = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
pred = knn.predict(x_test)  # test-split predictions, reused by the cells below

In [None]:
# Score of the fitted knn model on the training split, and accuracy of `pred` on the test split.
print('Training Score', knn.score(x_train, y_train))
print('Testing Score', metrics.accuracy_score(y_test, pred))

In [None]:
# Text classification report for the test predictions.
print(metrics.classification_report(y_test, pred))

In [None]:
# Confusion matrix of the true test labels against the predictions.
metrics.confusion_matrix(y_test, pred)

## AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with 200 estimators, the 'SAMME' algorithm and a fixed seed.
# NOTE(review): learning_rate=0.001 looks very small for 200 estimators — confirm it is intended.
# NOTE(review): the `algorithm` argument appears to be deprecated in recent
# scikit-learn releases — verify against the installed version.
adab = AdaBoostClassifier(n_estimators=200, algorithm='SAMME',
                          learning_rate=0.001, random_state=0)

In [None]:
# Fit on the training split.
adab.fit(x_train, y_train)

In [None]:
# Score on the training split.
adab.score(x_train, y_train)

In [None]:
# Predictions for the test split.
y_pred = adab.predict(x_test)

In [None]:
# Accuracy of those predictions against the test labels.
print('Testing Score', metrics.accuracy_score(y_test, y_pred))

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for LogisticRegression, written as a list of sub-grids so that
# only penalty/solver pairs the estimator accepts are tried. A single grid that
# crosses every penalty with every solver contains many candidates that can
# only fail to fit ('l1' or 'elasticnet' with lbfgs/sag/newton-cg, and
# 'elasticnet' without an l1_ratio), wasting fits and filling the results with NaN.
_c_values = [100, 10, 1.0, 0.01]
_max_iters = [100, 200, 300, 500]

params = [
  # L2 is supported by all four solvers.
  {'penalty': ['l2'],
   'solver': ['lbfgs', 'sag', 'saga', 'newton-cg'],
   'C': _c_values,
   'max_iter': _max_iters},
  # Of these solvers, only 'saga' handles L1.
  {'penalty': ['l1'],
   'solver': ['saga'],
   'C': _c_values,
   'max_iter': _max_iters},
  # Elastic-net also needs 'saga', plus an explicit l1_ratio (0.5 = even mix).
  {'penalty': ['elasticnet'],
   'solver': ['saga'],
   'l1_ratio': [0.5],
   'C': _c_values,
   'max_iter': _max_iters},
  # No regularisation is spelled None; the string 'none' is rejected by current
  # scikit-learn. C has no effect without a penalty, so it is not varied here.
  {'penalty': [None],
   'solver': ['lbfgs', 'sag', 'saga', 'newton-cg'],
   'max_iter': _max_iters},
]

In [None]:
# Grid search over `params` for LogisticRegression, with GridSearchCV's default settings.
grid1 = GridSearchCV(LogisticRegression(), params)

In [None]:
# Run the search on the training split.
grid1.fit(x_train, y_train)

In [None]:
# Best parameter combination found by the search.
grid1.best_params_

In [None]:
# Score achieved by that best combination during the search.
grid1.best_score_

## Final Verdict

**K-nearest neighbours gives the best result**

In [None]:
# Confusion matrix of the test labels against `pred`.
# NOTE(review): the `lr_` prefix is misleading — with the cells run in order,
# `pred` holds the k-nearest-neighbours predictions, not logistic regression's.
lr_metric = metrics.confusion_matrix(y_test, pred)

In [None]:
# Draw the confusion matrix in lr_metric as an annotated image.

# Tick labels for matrix index 0 and index 1, in that order.
# NOTE(review): confusion_matrix orders classes by sorted label value, so index 0
# is output == 0. Presumably output == 1 is the disease class, which would make
# these two labels swapped — confirm against the dataset description.
options = ['Disease', 'No Disease']

fig, ax = plt.subplots()
im = ax.imshow(lr_metric, cmap='Set3', interpolation='nearest')

ax.set_xticks(np.arange(len(options)))
ax.set_yticks(np.arange(len(options)))

ax.set_xticklabels(options)
ax.set_yticklabels(options)

plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

# Write each count into its cell: row i of the matrix is on the y axis,
# column j on the x axis.
for i in range(len(options)):
  for j in range(len(options)):
    ax.text(j, i, lr_metric[i, j],
            ha='center', va='center', color="black")

# Title, axis labels and layout are applied once, after the annotation loop
# (they used to sit inside it and were re-run for every row). tight_layout
# runs last so that it accounts for the labels.
ax.set_title('Confusion Metrics of k_near')
ax.set_xlabel('model Prediction')
ax.set_ylabel('Actual Prediction')
plt.tight_layout()
# plt.show() matches the other plotting cells; fig.show() is meant for GUI
# backends and only emits a warning under the inline notebook backend.
plt.show()