In [None]:
#importing basic libraries for Data Preparation and EDA
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Show the directory that the relative path './heart.csv' is resolved against.
# os.getcwd() replaces the bare `pwd`, which is not Python and only runs
# through IPython's automagic.
os.getcwd()

### **Attribute Information**
<pre>
1. Age: age of the patient [years]
2. Sex: sex of the patient [M: Male, F: Female]
3. ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
4. RestingBP: resting blood pressure [mm Hg]
5. Cholesterol: serum cholesterol [mg/dl]
6. FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
7. RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
8. MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
9. ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
10. Oldpeak: oldpeak = ST depression induced by exercise relative to rest [Numeric value]
11. ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
12. HeartDisease: output class [1: heart disease, 0: Normal]
</pre>

### Data Preparation and Cleaning

In [None]:
# Load the dataset from heart.csv in the current working directory into `data`.
data = pd.read_csv('./heart.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

In [None]:
# Per-age head counts: for every distinct age, the number of patients and the
# number of those with HeartDisease == 1.
# NOTE(review): nothing is stored or displayed here -- only the counts for the
# last age survive the loop.
al = data['Age'].unique().tolist()
a1 = data['HeartDisease'] == 1        # loop-invariant mask, built once
for i in al:
    a01 = data['Age'] == i
    ac01 = int(a01.sum())             # patients of this age
    ac1 = int((a01 & a1).sum())       # ...of which have heart disease

In [None]:
# List the distinct values that occur in the FastingBS column.
data['FastingBS'].unique()

In [None]:
# Percentage of observations whose FastingBS flag equals 1.
la = len(data)
l1 = len(data[data['FastingBS'] == 1])
l1 / la * 100

Roughly 1/4th of the observations have FastingBS = 1, which is a considerable share, so there is no need to remove the FastingBS column.

In [None]:
# (number of rows, number of columns) of the dataset.
data.shape

In [None]:
# Data type of every column.
data.dtypes

In [None]:
# Per column: True if the column contains at least one missing value.
data.isnull().any()

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
data.info()

### This shows there are no null values present in the dataset

 ## --------- Performing Exploratory Data Analysis---------

#### Separating the numerical and the categorical columns

In [None]:
# Partition the column names: object-dtype columns go to `cat`,
# every other column goes to `num`.
cat = []
num = []
for column in data.columns:
    if data[column].dtype == 'object':
        cat.append(column)
    else:
        num.append(column)
print(cat)
print(num)

In [None]:
# Target column on its own, and the remaining columns as the feature frame.
y = data['HeartDisease']
x = data.drop(columns="HeartDisease")
y

In [None]:
# Percentage of patients with heart disease within each sex.
# HeartDisease is a 0/1 flag, so its mean within a group is the diseased
# fraction of that group. sort=False keeps the groups in order of first
# appearance, matching the order data['Sex'].unique() would give.
for sex_rate in data.groupby('Sex', sort=False)['HeartDisease'].mean() * 100:
    print(sex_rate)
    

## With categorical value types

In [None]:

for feature in cat:
    # Stacked histogram of the feature's categories, coloured by the target.
    sns.histplot(data=data, x=feature, hue='HeartDisease', multiple='stack',
                 palette=['y', 'c'], alpha=0.7, shrink=.7)
    plt.show()
    # For each category: share of its rows that have HeartDisease == 1.
    has_disease = data['HeartDisease'] == 1
    for level in data[feature].unique():
        in_level = data[feature] == level
        n_level = in_level.sum()
        n_diseased = (in_level & has_disease).sum()
        print('percentage of heart failure among feature', level, '=', n_diseased / n_level * 100)
        

<h5>
    <pre>
Sex:- Males have a higher chance of heart disease than females.
ChestPainType:- People with ASY-type chest pain have a high chance of heart disease, while people with ATA-type chest pain have the lowest chance.
ExerciseAngina:- People who have exercise-induced angina have a high chance of heart disease.
ST_Slope:- People with an Up ST_Slope have the lowest chance of heart disease.
    </pre>
</h5>

## With Numerical value types

In [None]:
# Stacked density estimate of every numeric column except the target itself,
# coloured by the target class.
for feature in [name for name in num if name != 'HeartDisease']:
    sns.kdeplot(data=data, x=feature, hue='HeartDisease', multiple="stack", palette=['g', 'c'])
    plt.show()

<h5>
Let us look at the ages of the people who are and are not suffering from the disease.
</h5>

In [None]:
# For every distinct age collect, in parallel lists:
#   agenum    - number of patients of that age
#   agepcount - number of those with HeartDisease == 1
#   ageper    - agepcount as a percentage of agenum
al = data['Age'].unique().tolist()
has_disease = data['HeartDisease'] == 1
agenum, agepcount, ageper = [], [], []
for age in al:
    at_age = data['Age'] == age
    n_total = int(at_age.sum())
    n_diseased = int((at_age & has_disease).sum())
    agenum.append(n_total)
    agepcount.append(n_diseased)
    ageper.append(n_diseased / n_total * 100)

In [None]:
# Assemble the per-age lists into one summary frame, one row per age.
df1 = pd.DataFrame({
    'age': al,
    'totalcount': agenum,
    'percentageof1': ageper,
    'agepcount': agepcount,
})

In [None]:
# Order the per-age summary by age and display it.
df1=df1.sort_values(['age'])
df1

In [None]:
# Grouped bars per age: patients with the disease (left bar) next to all
# observations of that age (right bar).
bar_width = 0.4
ages = df1['age'].values
plt.figure(figsize=(20,8))
plt.bar(ages - bar_width/2, df1['agepcount'].values, bar_width, color='m', label='count of people suffering from the disease.')
plt.bar(ages + bar_width/2, df1['totalcount'].values, bar_width, color='c', label='total count of observations')
plt.grid(True)
plt.xlabel('Age')
plt.ylabel('Number of patients')
plt.xticks(ages)
plt.title('Plot of total count of observations and count of people suffering from the disease')
plt.legend()
# Render the figure here so the Legend object's repr is not echoed as cell output.
plt.show()

We see that the largest number of people suffering from the disease are aged 58, followed by 57.
Mostly, people in the 50+ age group are likely to suffer from the disease.

# Correlation

In [None]:
# Copy of the dataset ordered by Age; `data` itself is left untouched.
data1 = data.sort_values(['Age'])
data1.head()

In [None]:
# Pairwise correlations between the numeric columns.
# The frame still holds string columns at this point; pandas 2.0+ raises on
# corr() for such a frame instead of silently skipping them, so the numeric
# columns are selected explicitly.
corr = data1.select_dtypes(include='number').corr()
corr

In [None]:
# Correlation of every input column with the target. The target's own entry
# (always 1.0) is removed by label rather than by a hard-coded slice length.
corrheart = corr['HeartDisease'].drop('HeartDisease')
corrheart

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Non-numeric columns are excluded explicitly because pandas 2.0+ raises on
# corr() when string columns are present.
corr = data.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap="Blues", annot=True)

In [None]:
# One bar per input column, its height being the correlation with HeartDisease.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(corrheart.index, corrheart.values, width=0.2, color='c')
ax.set_title('Correlations between input columns and target column HeartDisease')

<h3>
<pre>
MaxHR and Oldpeak have a notable correlation with HeartDisease,
with
MaxHR having a correlation coefficient of -0.400421
and
Oldpeak having a correlation coefficient of 0.403951.
</pre>
    </h3>


<h1 style='background-color: powderblue;'>
    Machine learning
</h1>

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.metrics import accuracy_score, f1_score

# Display the current state of the dataset as this cell's output.
data

In [None]:
# Preview the dataset before preprocessing; head() keeps the output short
# instead of rendering the whole frame.
data.head()

## Data Preprocessing 

In [None]:
# Encode Sex as an integer flag: "M" -> 1, "F" -> 0.
# A single dict-based replace handles both codes, and astype(int) pins an
# integer dtype explicitly instead of relying on replace() to infer it.
# Values that are already 0/1 pass through unchanged, so the cell can be re-run.
data['Sex'] = data['Sex'].replace({"M": 1, "F": 0}).astype(int)
data.head()

In [None]:
cat1= cat[1:]
cat1

In [None]:
# Replace each remaining categorical column by integer codes: the values are
# cast to str and passed through LabelEncoder, which is refitted per column.
number = preprocessing.LabelEncoder()
for column in cat1:
    data[column] = number.fit_transform(data[column].astype(str))
data.head(10)

## Splitting the Data into Train and Test

In [None]:
# Feature matrix: every column except the last one.
# NOTE(review): assumes the target (HeartDisease) is the last column of `data` -- confirm.
X = data.iloc[:,:-1]
X.head()

In [None]:
# Target vector: the last column of `data`, taken as a 1-D Series.
# A one-column DataFrame (iloc[:, -1:]) is a column vector; scikit-learn
# estimators that expect a 1-D y, such as KNeighborsClassifier, emit a
# DataConversionWarning on fit when given one.
Y = data.iloc[:, -1]
Y.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=10,
)

# Training the Model

### DecisionTreeClassifier

In [None]:
# Fit a decision tree on the training split (seeded for reproducibility)
# and display the fitted estimator.
model_dt = DecisionTreeClassifier(random_state=50)
model_dt.fit(X_train, Y_train)
model_dt

In [None]:
# Score the decision tree on the held-out test split and print the result.
# NOTE(review): this rebinds `model_dt` from the fitted estimator to its score
# (a number). Later cells read the score from this name, and the cell cannot be
# run a second time without refitting, because a number has no .score().
model_dt=model_dt.score(X_test, Y_test)
print(model_dt)

## k-nearest neighbors

In [None]:
# Baseline k-nearest-neighbours classifier with k = 5, fitted on the training split.
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X=X_train, y=Y_train)

In [None]:
# Score of the k = 5 model on the held-out test split.
model_knn.score(X_test, Y_test)

In [None]:
# Fit one KNN model for each k in 1..20 and record its test-split score.
# klist[j] is the score for k = r20[j]; `model_knn` ends up as the k = 20 model.
r20 = list(range(1, 21))
klist = []
for k in r20:
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(X_train, Y_train)
    klist.append(model_knn.score(X_test, Y_test))

In [None]:
# Tabulate the sweep: one row per k with the score it achieved.
dfplotknn = pd.DataFrame({'accuracy': klist, 'k-value': r20})

In [None]:
# Preview the first rows of the k-sweep results.
dfplotknn.head()

### Finding value of K

In [None]:
# Accuracy as a function of k, with the best-scoring k highlighted.
# The highlighted band is derived from the results (first k with the maximum
# accuracy) instead of being hard-coded, so it stays correct if the data,
# split or seed change.
best_k = dfplotknn.loc[dfplotknn['accuracy'].idxmax(), 'k-value']
plt.plot(dfplotknn['k-value'].values,dfplotknn['accuracy'].values,'-yo')
plt.axvspan(best_k - 0.5, best_k + 0.5, color='c', alpha=0.3)
plt.xlabel('k (number of neighbours)')
plt.ylabel('Accuracy on the test split')
plt.axis([0,20,0.5,.8])

In [None]:
# Row of the sweep with the highest accuracy (the first one if several tie),
# looked up from the results rather than through a hard-coded row position.
maxknn = dfplotknn.loc[dfplotknn['accuracy'].idxmax()]
maxknn

##### value of k = 9

In [None]:
# Display the decision-tree test score (`model_dt` holds the score at this point).
model_dt

## Comparing KNN and Decision Tree

In [None]:
# Side-by-side test accuracies (in percent) of the best KNN model and the
# decision tree.
# - `maxknn` is read by label: integer keys such as maxknn[0] on a
#   label-indexed Series are deprecated positional access in pandas.
# - A bar chart replaces the pie chart: a pie's autopct prints each slice's
#   share of the *sum* of the two accuracies, not the accuracies themselves.
parr = np.array([maxknn['accuracy']*100, model_dt*100])
label = ['KNN', 'Decision Tree']
plt.bar(label, parr, color=['c', 'y'])
for position, accuracy in enumerate(parr):
    # Categorical bars sit at x = 0, 1, ...; write the value on top of each bar.
    plt.text(position, accuracy, '%1.2f%%' % accuracy, ha='center', va='bottom')
plt.ylabel('Test accuracy (%)')
plt.ylim(0, 100)
plt.show()

# -------------------------------- THE END ---------------------------------------