In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
# Load the raw dataset. The path is relative, so it is resolved against the
# notebook's current working directory.
csv_path = "Data_cardiovascular_risk.csv"
data = pd.read_csv(csv_path)

In [3]:
# Final column layout: predictors first, target `TenYearCHD` last (later cells
# pick the target with `.iloc[:, -1]`, so the order matters).
ordered_columns = [
    'age', 'male', 'is_smoking', 'cigsPerDay', 'prevalentStroke',
    'prevalentHyp', 'diabetes', 'totChol', 'sysBP', 'BPMeds', 'diaBP',
    'BMI', 'glucose', 'heartRate', 'TenYearCHD',
]

# One-hot encode `sex` and `is_smoking`, keep a single indicator per variable
# (sex_M -> male, is_smoking_YES -> is_smoking), discard `education`, and
# project onto the column layout above.
data = (
    pd.get_dummies(data, columns=['sex', 'is_smoking'])
    .drop(columns=['sex_F', 'is_smoking_NO', 'education'])
    .rename(columns={'sex_M': 'male', 'is_smoking_YES': 'is_smoking'})
    [ordered_columns]
)

In [4]:
# Replace every missing value with the median of its own column. The medians
# are computed over the whole frame, target column included.
column_medians = data.median()
data.fillna(column_medians, inplace=True)

# Predictors used by the statistical model and the classifier below.
# The order of this list defines the feature order everywhere downstream.
top_features = [
    'age',
    'totChol',
    'sysBP',
    'diaBP',
    'BMI',
    'glucose',
    'heartRate',
]

In [5]:
import statsmodels.api as sm

# Fit a logistic regression of the 10-year CHD outcome on the selected
# predictors and tabulate odds ratios with their confidence bounds.
# NOTE(review): no constant column is added to X_top, so the model is fitted
# without an intercept — confirm that is intended (sm.add_constant adds one).
X_top = data[top_features]
y = data['TenYearCHD']
res = sm.Logit(y, X_top).fit()

# Raw fitted coefficients; these are on the log-odds scale.
params = res.params

# conf_int() uses alpha=0.05 by default, i.e. a 95% interval whose bounds sit
# at the 2.5% and 97.5% quantiles. Coefficients and bounds are exponentiated
# so that the table really contains odds ratios rather than log-odds.
conf = np.exp(res.conf_int())
conf['Odds Ratio'] = np.exp(params)
conf.columns = ['2.5%', '97.5%', 'Odds Ratio']

Optimization terminated successfully.
         Current function value: 0.412827
         Iterations 6


In [6]:
# Feature matrix: the selected predictors, in `top_features` order.
X = data.loc[:, top_features]
# Target vector: whatever column sits last in `data`.
y = data.iloc[:, -1]

In [7]:
!pip install imblearn



In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

In [9]:
# Rebalance the classes in two steps: SMOTE synthesises minority-class rows
# until the minority/majority ratio is 0.9, then the majority class is
# randomly under-sampled towards the same 0.9 ratio.
# Both samplers are stochastic, so they are seeded to make the resampled
# dataset (and everything trained on it) reproducible across runs.
# NOTE(review): resampling is applied before the train/test split, so
# synthetic rows derived from eventual test rows can land in the training
# set — consider resampling the training portion only.
RESAMPLE_SEED = 42
over = SMOTE(sampling_strategy=.9, random_state=RESAMPLE_SEED)
under = RandomUnderSampler(sampling_strategy=.9, random_state=RESAMPLE_SEED)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_smote, y_smote = pipeline.fit_resample(X, y)

In [10]:
# Re-assemble the resampled predictors and target into a single frame.
new_data = pd.concat([pd.DataFrame(X_smote), pd.DataFrame(y_smote)], axis=1)
# X_smote was built from data[top_features], so its columns are in
# `top_features` order. Labelling them from that same list keeps every name
# attached to its own values; a hand-typed list with 'glucose' and
# 'heartRate' in the other order would silently swap those two features.
new_data.columns = [*top_features, 'TenYearCHD']

In [11]:
# Predictors of the resampled frame, in `top_features` order.
X_new = new_data.loc[:, top_features]
# Target of the resampled frame: its last column.
y_new = new_data.iloc[:, -1]

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for evaluation; the seed fixes the split.
split_parts = train_test_split(
    X_new,
    y_new,
    test_size=.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the scaled values as DataFrames. No index or column names are
# passed, so both frames get default integer labels.
X_train = pd.DataFrame(X_train_scaled)
X_test = pd.DataFrame(X_test_scaled)

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score,precision_score,classification_report,roc_auc_score,roc_curve

In [15]:
from sklearn.svm import SVC

In [16]:
# Candidate hyper-parameters for the RBF-kernel SVM: every (C, gamma) pair
# from these two lists is evaluated.
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = dict(C=Cs, gamma=gammas)

# Exhaustive 10-fold cross-validated search; probability=True makes the
# fitted SVM expose predict_proba.
base_svm = SVC(kernel='rbf', probability=True)
svm_clf = GridSearchCV(estimator=base_svm, param_grid=param_grid, cv=10)

In [17]:
# Run the grid search on the scaled training data.
svm_clf.fit(X=X_train, y=y_train)

In [18]:
# Save the fitted grid-search object to model.pkl.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
# NOTE(review): the fitted `scaler` is not saved here, yet the model was
# trained on scaled features — confirm it is persisted wherever inference runs.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(svm_clf, model_file)

In [19]:
# Reload the persisted model from model.pkl; the context manager closes the
# file handle as soon as unpickling finishes.
# pickle.load can execute arbitrary code, so only load files from a trusted
# source (here: the file written by this notebook).
with open('model.pkl', 'rb') as model_file:
    Heart_disease_detector_model = pickle.load(model_file)