In [None]:
#Heart Disease Prediction using Logistic Regression
#The classification goal is to predict whether a patient has a 10-year risk of future coronary heart disease (CHD)

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
import matplotlib.mlab as mlab
%matplotlib inline

In [None]:
#IT19028774 - Athapaththu P.N.P.

# Read framingham.csv, discard its 'education' column, and preview the first rows
chd_df = pd.read_csv("framingham.csv").drop(columns=['education'])
chd_df.head()

In [None]:
# Dataset dimensions as (rows, columns). Printed explicitly: a notebook cell
# only displays its final expression, so a bare `chd_df.shape` placed before
# another expression would never be shown.
print(chd_df.shape)

# Data type of every column
chd_df.dtypes

In [None]:
# Rename the 'male' column to 'sex_male' (modifies chd_df in place)
chd_df.rename({'male': 'sex_male'}, axis='columns', inplace=True)

In [None]:
# Collect the rows flagged by DataFrame.duplicated(), i.e. rows that repeat an
# earlier row exactly (the check is on rows, not on columns)
duplicate_df = chd_df.loc[chd_df.duplicated()]
duplicate_df

In [None]:
# Number of missing values in each column
chd_df.isna().sum()

In [None]:
# Count the rows that contain at least one missing value, then report that
# count and its share of the whole dataset (rounded to a whole percent)
count = int(chd_df.isnull().any(axis=1).sum())
print('Total number of rows with missing values =', count)
print('Percentage of rows with missing values in the dataset =',round((count/len(chd_df.index))*100),'%')
print('Therefore, the missing values are eliminated.')

In [None]:
# Remove, in place, every row that has a missing value in any column
chd_df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of chd_df
plt.figure(figsize=(10, 8))
sn.heatmap(
    data=chd_df.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=2,
)

In [None]:
#Exploratory analysis: one histogram per CHD feature
def draw_chd_histograms(dataframe, features, rows, cols):
    """Draw a 20-bin histogram for each feature on a single 20x20 figure.

    Parameters
    ----------
    dataframe : object indexed by feature name; each selected column must
        provide a ``.hist(bins=..., ax=..., facecolor=...)`` method.
    features : iterable of column names; each is concatenated with
        " Visualization" to form the subplot title.
    rows, cols : subplot grid dimensions passed to ``add_subplot``.

    Returns
    -------
    None. The figure is laid out with ``tight_layout`` and shown via
    ``plt.show()``.
    """
    figure = plt.figure(figsize=(20, 20))
    # Subplot positions are 1-based, so start the enumeration at 1.
    for position, column_name in enumerate(features, start=1):
        axes = figure.add_subplot(rows, cols, position)
        dataframe[column_name].hist(ax=axes, bins=20, facecolor='maroon')
        axes.set_title(column_name + " Visualization", color='navy')

    figure.tight_layout()
    plt.show()
# Draw a histogram for every column of chd_df on a 6 x 3 grid
draw_chd_histograms(dataframe=chd_df, features=chd_df.columns, rows=6, cols=3)

In [None]:
# How many rows fall into each TenYearCHD class
chd_df['TenYearCHD'].value_counts()

In [None]:
# Bar chart of the number of rows per TenYearCHD class
sn.countplot(data=chd_df, x='TenYearCHD')

In [None]:
# Report the class balance of the target column (message typo "patents" corrected)
print('Therefore, there are',(chd_df.TenYearCHD == 1).sum(),'patients with risk of heart disease and',(chd_df.TenYearCHD == 0).sum(),'patients with no heart disease.')

In [None]:
# Pairwise relationship plots across the columns of chd_df
sn.pairplot(chd_df)

In [None]:
#Summary statistics for the columns of the dataframe
#count - number of non-missing values
#mean - average (mean) value
#std - standard deviation
#min - minimum value
#25% - 25th percentile
#50% - 50th percentile (median)
#75% - 75th percentile
#max - maximum value
chd_df.describe()

In [None]:
#IT19140476 - Gamitha Manawadu
# Distribution plot of the 'age' column with a styled histogram and KDE curve
sn.distplot(
    chd_df['age'],
    color='Yellow',
    hist_kws={'alpha': 1, "linewidth": 2},
    kde_kws={"color": "k", "lw": 3, "label": "KDE"},
)
# Author's observation: most people are aged 40-50

In [None]:
#IT19140476 - Gamitha Manawadu
# Three side-by-side count plots (age, total cholesterol, heart rate), each
# split by TenYearCHD. The three axes are created together and handed to
# seaborn through `ax=`; layering plt.subplot() over the single full-size axes
# returned by plt.subplots() leaves that axes visible behind the panels on
# Matplotlib releases that do not auto-remove overlapping axes.
fig, axes = plt.subplots(1, 3, figsize=(24, 6))

# NOTE: pd.cut yields NaN for values outside the bin edges, so the helper
# 'bin_*' columns added to chd_df here may contain missing values.
age_bins = [20,30,40,50,60,70,80]
chd_df['bin_age'] = pd.cut(chd_df['age'], bins=age_bins)
g1 = sn.countplot(x='bin_age', data=chd_df, hue='TenYearCHD', palette='plasma', linewidth=3, ax=axes[0])
g1.set_title("Age vs Heart Disease")
# Author's observation: the number of people with heart disease is highest around ages 41-55.
# Author's observation: many people aged 55-65 are in the no-heart-disease group.

chol_bins = [100,150,200,250,300,350,400,450]
chd_df['bin_chol'] = pd.cut(chd_df['totChol'], bins=chol_bins)
g2 = sn.countplot(x='bin_chol', data=chd_df, hue='TenYearCHD', palette='plasma', linewidth=3, ax=axes[1])
g2.set_title("Cholesterol vs Heart Disease")
# Author's observation: most heart-disease cases fall in the 200-250 cholesterol bin;
# above 250 the number of cases falls.

hr_bins = [40,60,80,100,120,140]
chd_df['bin_hr'] = pd.cut(chd_df['heartRate'], bins=hr_bins)
g3 = sn.countplot(x='bin_hr', data=chd_df, hue='TenYearCHD', palette='plasma', linewidth=3, ax=axes[2])
g3.set_title("Heart Rate vs Heart Disease")
# NOTE(review): the earlier remark here referred to "thalach between 140-180",
# which matches neither the heartRate column nor the 40-140 bins plotted;
# re-derive the observation from this panel.

In [None]:
#IT19123578 - Sanduni Jayamali Gamage
# Three side-by-side count plots (diastolic BP, systolic BP, cigarettes per
# day), each split by TenYearCHD. The axes are created together and passed to
# seaborn via `ax=` so that no unused full-size axes is left behind the panels.
fig, axes = plt.subplots(1, 3, figsize=(24, 6))

# NOTE: pd.cut yields NaN for values outside the bin edges, so the helper
# 'bin_*' columns added to chd_df here may contain missing values.
dbp_bins = [40,60,80,100,120,140,160]
chd_df['bin_dbp'] = pd.cut(chd_df['diaBP'], bins=dbp_bins)
g1 = sn.countplot(x='bin_dbp', data=chd_df, hue='TenYearCHD', palette='plasma', linewidth=3, ax=axes[0])
g1.set_title("Diastolic Blood Pressure vs Heart Disease")
# Author's observation: most heart-disease cases have diastolic blood pressure in the 60-100 range.

sbp_bins = [80,100,120,140,160,180,200,220,240,260]
chd_df['bin_sbp'] = pd.cut(chd_df['sysBP'], bins=sbp_bins)
g2 = sn.countplot(x='bin_sbp', data=chd_df, hue='TenYearCHD', palette='plasma', linewidth=3, ax=axes[1])
g2.set_title("Systolic Blood Pressure vs Heart Disease")
# Author's observation: most heart-disease cases have systolic blood pressure in the 100-140 range.

cpd_bins = [0,10,20,30,40,50,60,70]
# include_lowest=True closes the first interval on the left so that a value of
# exactly 0 cigarettes per day lands in the first bin instead of becoming NaN.
chd_df['bin_cpd'] = pd.cut(chd_df['cigsPerDay'], bins=cpd_bins, include_lowest=True)
g3 = sn.countplot(x='bin_cpd', data=chd_df, hue='TenYearCHD', palette='plasma', linewidth=3, ax=axes[2])
g3.set_title("Average Number of Cigarettes Smoked Per Day vs Heart Disease")
# Author's observation: most heart-disease cases smoke 0-20 cigarettes per day.

In [None]:
#IT19140476 - Gamitha Manawadu
# Count plot of binned BMI split by TenYearCHD. A single axes is created and
# passed to seaborn; the earlier layout reserved a 1x2 grid on a 24x6 figure
# but only ever filled the left panel, so the figure is sized to that panel.
fig, ax = plt.subplots(figsize=(12, 6))

# NOTE: pd.cut yields NaN for values outside the bin edges, so the helper
# 'bmi' column added to chd_df here may contain missing values.
bmi_bins = [0,10,20,30,40,50]
chd_df['bmi'] = pd.cut(chd_df['BMI'], bins=bmi_bins)
x1 = sn.countplot(x='bmi', data=chd_df, hue='TenYearCHD', palette='spring', linewidth=3, ax=ax)
x1.set_title('BMI vs Heart Disease')
# Author's observation: the 20-30 BMI bins contain the most heart-disease cases.

In [None]:
#IT19123578 - Sanduni Jayamali Gamage
# Two side-by-side views of age by sex, split by TenYearCHD. Both axes are
# created together and passed to seaborn via `ax=` so that no unused full-size
# axes is left behind the panels.
fig, (ax_boxen, ax_point) = plt.subplots(1, 2, figsize=(16, 6))

s1 = sn.boxenplot(x='sex_male', y='age', hue='TenYearCHD', data=chd_df, palette='YlGn', linewidth=3, ax=ax_boxen)
s1.set_title("Figure 1")
# Author's observation (Figure 1): most females with heart disease are aged 40-50 and most males 40-60.

s2 = sn.pointplot(x='sex_male', y='age', hue='TenYearCHD', data=chd_df, palette='autumn', capsize=.2, ax=ax_point)
s2.set_title("Figure 2")
# Author's observation (Figure 2): mean age with heart disease is about 54 for females and about 51 for males.

In [None]:
# Point plot of totChol against age, one series per TenYearCHD class
fig, ax = plt.subplots(figsize=(16, 6))
sn.pointplot(
    data=chd_df, x='age', y='totChol', hue='TenYearCHD',
    color='Lime', linestyles=["-", "--"], ax=ax,
)
ax.set_title('Age vs  Total Cholesterol Level ')
# Author's observation: people with high cholesterol tend to have heart disease

In [None]:
# Point plot of cigsPerDay against age, one series per TenYearCHD class
fig, ax = plt.subplots(figsize=(16, 6))
sn.pointplot(
    data=chd_df, x='age', y='cigsPerDay', hue='TenYearCHD',
    color='Lime', linestyles=["-", "--"], ax=ax,
)
ax.set_title('Age vs  Average Number of Cigarettes Smoked Per Day ')
# Author's observation: heart disease tends to vary with the number of cigarettes smoked per day

In [None]:

# Point plot of diaBP against age, one series per TenYearCHD class
fig, ax = plt.subplots(figsize=(16, 6))
sn.pointplot(
    data=chd_df, x='age', y='diaBP', hue='TenYearCHD',
    color='Lime', linestyles=["-", "--"], ax=ax,
)
ax.set_title('Age vs Diastolic Blood Pressure')
# Author's observation: people with heart disease tend to have higher diastolic blood pressure

In [None]:
# IT19138732 - Logistic Regression

In [None]:
#Danuka - IT19138732
# Point plot of sysBP against age, one series per TenYearCHD class
fig, ax = plt.subplots(figsize=(16, 6))
sn.pointplot(
    data=chd_df, x='age', y='sysBP', hue='TenYearCHD',
    color='Lime', linestyles=["-", "--"], ax=ax,
)
ax.set_title('Age vs Systolic Blood Pressure')
# Author's observation: people with heart disease tend to have higher values
# (presumably systolic blood pressure - the original remark was left unfinished)

In [None]:
#Danuka - IT19138732
# Line plot of heartRate against age, one marked line per TenYearCHD class,
# with error bars drawn using ci=68
fig, ax = plt.subplots(figsize=(16, 6))
sn.lineplot(
    data=chd_df,
    x='age',
    y='heartRate',
    hue="TenYearCHD",
    style='TenYearCHD',
    palette='magma',
    markers=True,
    dashes=False,
    err_style="bars",
    ci=68,
    ax=ax,
)
ax.set_title('Age vs Continuous Heart Rate')

In [None]:
# Drop rows with a missing value in any *numeric* column only. The categorical
# helper columns created with pd.cut for the plots are deliberately ignored:
# pd.cut leaves NaN for every value outside its bin edges, so an unrestricted
# dropna() would silently discard those rows from the modelling data.
chd_df.dropna(axis=0, subset=chd_df.select_dtypes(include='number').columns.tolist(), inplace=True)

In [None]:
from statsmodels.tools import add_constant

# Build the design matrix from the numeric columns only, with a constant
# (intercept) column added. The categorical bin columns that the plotting
# cells attach to chd_df are left out: they are not valid regressors, and
# keeping them would also move TenYearCHD away from the last column position.
heart_details_constant = add_constant(chd_df.select_dtypes(include='number'))
heart_details_constant.head()

In [None]:
pip install scikit-learn

In [None]:
# Provide scipy.stats.chisqprob in terms of the chi-square survival function.
# NOTE(review): presumably a compatibility shim for statsmodels releases that
# still call st.chisqprob - confirm whether it is still required.
st.chisqprob = lambda chisq, df: st.chi2.sf(chisq, df)

# Predictors = every numeric column of the design matrix except the target.
# Selecting by name and dtype (rather than "all but the last column") keeps
# TenYearCHD and any categorical helper columns out of the regressors no
# matter where they sit in the frame.
col = heart_details_constant.select_dtypes(include='number').columns.drop('TenYearCHD')
hd_model = sm.Logit(chd_df.TenYearCHD, heart_details_constant[col])
lr_result = hd_model.fit()
lr_result.summary()

In [None]:
#feature Selection
def backward_elemination(hd_frame, dependent_var, column_list, threshold=0.05):
    """Backward feature elimination for a logit model driven by p-values.

    Repeatedly fits ``sm.Logit(dependent_var, hd_frame[columns])`` and removes
    the column with the largest p-value (rounded to 3 decimals) until that
    largest rounded p-value is below ``threshold``.

    Parameters
    ----------
    hd_frame : DataFrame holding the candidate predictor columns.
    dependent_var : the response passed to ``sm.Logit``.
    column_list : column labels of ``hd_frame`` to start from; any sequence of
        labels or a ``pd.Index`` is accepted.
    threshold : significance cut-off, 0.05 by default.

    Returns
    -------
    The fitted result of the first model whose largest rounded p-value is
    below ``threshold``, or ``None`` if every column was eliminated first.
    """
    # Normalise to an Index so plain lists work and the caller's object is
    # never modified (Index.drop returns a new Index).
    remaining = pd.Index(column_list)
    while len(remaining) > 0:
        fitted = sm.Logit(dependent_var, hd_frame[remaining]).fit(disp=0)
        rounded_pvalues = fitted.pvalues.round(3)
        # Look up the worst column by label; positional `series[0]` access on
        # a label-indexed Series is deprecated in recent pandas.
        worst_column = rounded_pvalues.idxmax()
        if rounded_pvalues.max() < threshold:
            return fitted
        remaining = remaining.drop(worst_column)
    # No column survived the elimination.
    return None

# Run the backward elimination starting from the columns in `col`
result = backward_elemination(
    hd_frame=heart_details_constant,
    dependent_var=chd_df.TenYearCHD,
    column_list=col,
)

In [None]:
# Summary table of the model returned by backward_elemination
result.summary()

In [None]:
# Exponentiate the fitted coefficients and their confidence bounds, attach the
# p-values rounded to 3 decimals, and print the combined table
hd_params = np.exp(result.params)
pvalue = result.pvalues.round(3)
conf_intervals = np.exp(result.conf_int()).assign(OR=hd_params, pvalue=pvalue)
conf_intervals.columns = ['CI 95%(2.5%)', 'CI 95%(97.5%)', 'Odds Ratio', 'pvalue']
print(conf_intervals)

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# Columns used by the classifier, with the target TenYearCHD as the last one.
# The sex indicator is selected as 'sex_male': the original 'male' column was
# renamed earlier in the notebook, so asking for 'male' raises a KeyError.
new_hd_features = chd_df[['age','sex_male','cigsPerDay','totChol','sysBP','glucose','TenYearCHD']]
x = new_hd_features.iloc[:, :-1]  # every column except the last -> predictors
y = new_hd_features.iloc[:, -1]   # last column -> target

# Hold out 20% of the rows for testing; random_state fixes the split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=5)

# Report the split sizes instead of dumping the full frames into the output
print('Train shapes:', x_train.shape, y_train.shape)
print('Test shapes:', x_test.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a LogisticRegression with its default settings on the training split,
# then predict labels for the test split and print them
logistic_reg = LogisticRegression().fit(x_train, y_train)
y_prediction = logistic_reg.predict(x_test)
print(y_prediction)

In [None]:
# Scatter of the actual test labels (x axis) against the predicted labels (y axis)
plt.scatter(x=y_test, y=y_prediction)

In [None]:
# Histogram of the element-wise difference between actual and predicted labels
plt.hist(x=y_test - y_prediction)

In [None]:
#sanduni-IT19123578
# Accuracy score of the predictions on the test split
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_prediction)

In [None]:
 #sanduni-IT19123578
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test predictions, labelled (rows = actual class,
# columns = predicted class) and drawn as an annotated heatmap
cm = confusion_matrix(y_true=y_test, y_pred=y_prediction)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sn.heatmap(data=conf_matrix, annot=True, fmt='d', cmap="YlGnBu")