In [None]:
import os

import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.svm import SVC

# Walk the '/kaggle/input' tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Read heart.csv into a DataFrame and preview its first rows.
csv_path = '../input/heart-disease-uci/heart.csv'
df = pd.read_csv(csv_path)
df.head()

# EDA using pandas_profiling

In [None]:
from pandas_profiling import ProfileReport

# Build an exploratory profiling report over the whole DataFrame; `profile` is
# left as the cell's last expression so the notebook displays it.
profile = ProfileReport(
    df,
    explorative=True,
    title="Pandas Profiling Report",
)
profile

In [None]:
# Show the dtype of each column in df.
df.dtypes

In [None]:
# Show pandas' summary statistics for df; used to compare the value ranges of the columns.
df.describe()

**trestbps**, **chol** and **thalach** have much larger value ranges than the other features, so we need to apply a feature scaling technique that brings all features onto a comparable scale (for example 0–1 or -1 to 1, or zero mean and unit variance).

In [None]:
# Feature matrix: an independent copy of df with the 'target' label column removed.
X = df.drop(columns=['target']).copy()
X.head()

In [None]:
# Label vector: an independent copy of the 'target' column.
y = df.loc[:, 'target'].copy()

## **Feature Scaling**

In [None]:
# Split into train and test sets (train_test_split's default proportions),
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the per-feature mean and standard deviation from the TRAINING data only,
# then apply that same transform to both sets. Scaling the test set with its own
# statistics (as scale(X_test) would) puts train and test on different scales
# and uses information from the held-out data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Number of samples (rows) in the scaled test set.
X_test_scaled.shape[0]

## **Modeling**

In [None]:
# Baseline model: an SVC with default hyperparameters trained on the scaled
# training data. fit() returns the estimator itself, so the assignment can be
# chained; the bare name on the last line displays the fitted estimator.
clf_svm = SVC(random_state=42).fit(X_train_scaled, y_train)
clf_svm

In [None]:
# Confusion matrix of the current classifier on the test set.
# NOTE(review): plot_confusion_matrix is deprecated/removed in newer scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator is the replacement) — confirm
# against the installed version.
plot_confusion_matrix(
    clf_svm,
    X_test_scaled,
    y_test,
    display_labels=["Does not have Heart Disease", "Has HD"],
)

# Optimizing Parameters with Cross Validation

In [None]:
# Hyperparameter candidates: every combination of C, gamma and kernel below is
# evaluated by GridSearchCV using its default cross-validation and scoring.
param_grid = [
    {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'poly', 'sigmoid'],
    },
]

# verbose=2 makes the search log each fit as it runs.
optimal_params = GridSearchCV(SVC(), param_grid, verbose=2)
optimal_params.fit(X_train_scaled, y_train)

# Display the winning combination.
optimal_params.best_params_

Best parameters selected by the grid search:

C: 100

gamma: 0.001

kernel: sigmoid


In [None]:
# Build the tuned classifier from whatever the grid search actually selected,
# rather than from values copied by hand out of an earlier run; hard-coded
# values silently go stale if the data, the grid or the library version changes.
best_params = optimal_params.best_params_
clf_svm = SVC(**best_params)
clf_svm.fit(X_train_scaled, y_train)

# Confusion matrix of the tuned classifier on the test set.
plot_confusion_matrix(
    clf_svm,
    X_test_scaled,
    y_test,
    display_labels=["Does not have Heart Disease", "Has HD"],
)


# **Visualization**

We use PCA (Principal Component Analysis) to combine the 14 features into 2 orthogonal meta-features that can be used as the axes of a graph.


In [None]:
# Fit PCA on the scaled training data and keep the projected coordinates.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)

# Share of the total variance explained by each component, as a percentage
# rounded to one decimal place.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)

# One bar per principal component, labelled PC1, PC2, ...
component_ids = range(1, len(per_var) + 1)
labels = [f'PC{idx}' for idx in component_ids]
plt.bar(x=component_ids, height=per_var, tick_label=labels)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

The scree plot shows that the first principal component, PC1, accounts for a relatively large amount of the variation in the scaled training data, which makes it a good candidate for the x-axis of the 2-dimensional graph. Since PC2 accounts for the next largest amount of variance, it can be used for the y-axis.

In [None]:
# pc1 / pc2 hold the training samples' coordinates along the first two
# principal components; they become the x- and y-axes of the plot.
pc1 = X_train_pca[:, 0]
pc2 = X_train_pca[:, 1]
X_train_2d = np.column_stack((pc1, pc2))

# Train a SEPARATE classifier on the 2-D projection, reusing clf_svm's
# hyperparameters. Refitting clf_svm itself would overwrite the model trained on
# the full scaled feature set, so it could no longer predict on X_test_scaled
# and the earlier evaluation cells would break when re-run.
# NOTE(review): these hyperparameters were chosen for the full feature space,
# not for the 2-D projection — the surface is illustrative only.
clf_svm_pca = SVC(**clf_svm.get_params())
clf_svm_pca.fit(X_train_2d, y_train)

# Build a grid of points (step 0.1) covering the data with a margin of 1 on
# every side; it is used to draw the decision regions.
x_min = pc1.min() - 1
x_max = pc1.max() + 1

y_min = pc2.min() - 1
y_max = pc2.max() + 1

xx, yy = np.meshgrid(
    np.arange(start=x_min, stop=x_max, step=0.1),
    np.arange(start=y_min, stop=y_max, step=0.1),
)

# Classify every grid point, then reshape the flat predictions back to the
# grid's shape so each predicted class lines up with its (x, y) location.
Z = clf_svm_pca.predict(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)

fig, ax = plt.subplots(figsize=(10, 10))

# Lightly shaded background showing the predicted class of each region.
ax.contourf(xx, yy, Z, alpha=0.1)

# Custom colors for the actual data points, coloured by their true label.
cmap = colors.ListedColormap(['#e41a1c', '#4daf4a'])
scatter = ax.scatter(pc1, pc2, c=y_train, cmap=cmap, s=100, edgecolors='k', alpha=0.7)

# One legend entry per class; legend_elements() is evaluated once and its
# default labels are replaced with readable names.
# Assumes y_train contains exactly two classes, ordered "no HD" then "HD" — TODO confirm.
handles, _ = scatter.legend_elements()
legend = ax.legend(handles, ["No HD", "Yes HD"], loc="upper right")

ax.set_ylabel('PC2')
ax.set_xlabel('PC1')
ax.set_title('Decision surface using the PCA transformed/projected features')
plt.show()