In [39]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,fbeta_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.svm import SVC
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn

#  **Reading data from CSV file**

### As a first step, we load the dataset:

In [40]:
# Load the heart-disease health-indicators CSV; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="heart_disease_health_indicators_BRFSS2015.csv")
# Bare expression so the notebook renders the loaded frame.
df

# Analysis & preprocessing

### Let's analyze the dataset:

In [41]:
# List the column labels of the loaded frame.
df.columns

In [42]:
# Per-column summary statistics of the loaded frame.
df.describe()

In [43]:
# Print column dtypes, non-null counts and memory usage.
df.info()

In [44]:
# For each column, True if it contains at least one missing value.
df.isna().any()

In [45]:
# Multiply the Age column by 4, overwriting it in place.
# NOTE(review): this cell is not idempotent - every re-run multiplies Age by 4
# again, so run it exactly once after loading the data.
# NOTE(review): the factor 4 is unexplained; presumably Age is a coded age
# group rather than years - confirm against the dataset documentation.
df['Age'] = df['Age'].mul(4)
df

# Visualization

# **Age**

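### As we can see, patients in the 25-45 range appear to be much more at risk of a heart attack. Note that `Age` here is the original column multiplied by 4, and that this histogram shows only the distribution of `Age`; confirming the risk claim requires plotting `Age` against `HeartDiseaseorAttack`.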

In [46]:
# Histogram of the Age column.
# sns.distplot is deprecated; sns.histplot is its replacement for a plain
# histogram (no KDE curve). bins=50 mirrors the upper bound distplot applied
# to its automatically chosen bin count.
sns.histplot(df['Age'], bins=50, color='red')

## Sex

### Respondents with Sex = 0 appear to be much more at risk

In [47]:
# Row counts per Sex value, with one bar per Smoker value.
sns.countplot(data=df, x='Sex', hue='Smoker')

## **Smoking / HDA**

### Non-smokers are in danger!

In [48]:
# Row counts per Smoker value, with one bar per HeartDiseaseorAttack value.
sns.countplot(data=df, x='Smoker', hue='HeartDiseaseorAttack')

## **Smoking / Diabetes**

In [49]:
# Row counts per Smoker value, with one bar per Diabetes value.
sns.countplot(data=df, x='Smoker', hue='Diabetes')

## **Smoking / Stroke**

In [50]:
# Row counts per Smoker value, with one bar per Stroke value.
sns.countplot(data=df, x='Smoker', hue='Stroke')

# **Diabetes / Sex**

In [51]:
# Row counts per Diabetes value, with one bar per Sex value.
sns.countplot(data=df, x='Diabetes', hue='Sex')

## **Physical Activity**
#### Here we will see how important physical activity is for staying healthy and avoiding conditions such as diabetes and heart disease or heart attack.

In [52]:
# Row counts per PhysActivity value, with one bar per Sex value.
sns.countplot(data=df, x='PhysActivity', hue='Sex')

# **Physical / HDA**

In [53]:
# Row counts per PhysActivity value, with one bar per HeartDiseaseorAttack value.
sns.countplot(data=df, x='PhysActivity', hue='HeartDiseaseorAttack')

# **Physical / Diabetes**

In [54]:
# Row counts per PhysActivity value, with one bar per Diabetes value.
sns.countplot(data=df, x='PhysActivity', hue='Diabetes')

## **Education Level**

In [55]:
# Histogram of the Education column.
# sns.distplot is deprecated; sns.histplot is its replacement for a plain
# histogram (no KDE curve). bins=50 mirrors the upper bound distplot applied
# to its automatically chosen bin count.
sns.histplot(df['Education'], bins=50, color='Blue')

## **Income**

In [56]:
# Histogram of the Income column.
# sns.distplot is deprecated; sns.histplot is its replacement for a plain
# histogram (no KDE curve). bins=50 mirrors the upper bound distplot applied
# to its automatically chosen bin count.
sns.histplot(df['Income'], bins=50, color='Green')

## **Fruits and Veggies** 

#### This shows how many people eat at least one fruit or vegetable per day, and whether this appears to protect them from disease.

In [57]:
# Row counts per Fruits value, with one bar per Diabetes value.
sns.countplot(data=df, x='Fruits', hue='Diabetes')

In [58]:
# Row counts per Fruits value, with one bar per HeartDiseaseorAttack value.
sns.countplot(data=df, x='Fruits', hue='HeartDiseaseorAttack')

In [59]:
# Row counts per Veggies value, with one bar per Sex value.
sns.countplot(data=df, x='Veggies', hue='Sex')

In [60]:
# Row counts per Veggies value, with one bar per Diabetes value.
sns.countplot(data=df, x='Veggies', hue='Diabetes')

## **BMI**

In [61]:
# Histogram of the BMI column.
# sns.distplot is deprecated; sns.histplot is its replacement for a plain
# histogram (no KDE curve). bins=50 mirrors the upper bound distplot applied
# to its automatically chosen bin count.
sns.histplot(df['BMI'], bins=50, color='Yellow')


### Here we look at the relationship between body mass index (BMI) and heart disease, and whether BMI affects a person's likelihood of having heart disease or other diseases.

In [62]:
# Histogram of the BMI column.
# NOTE(review): this draws the same BMI histogram as the cell directly under
# the BMI heading; one of the two is probably redundant.
# sns.distplot is deprecated; sns.histplot is its replacement for a plain
# histogram (no KDE curve). bins=50 mirrors the upper bound distplot applied
# to its automatically chosen bin count.
sns.histplot(df['BMI'], bins=50, color='Yellow')

In [63]:
# Row counts per distinct BMI value, with one bar per HeartDiseaseorAttack value.
# NOTE(review): countplot draws one group per distinct x value, so the axis
# gets crowded if BMI takes many different values.
sns.countplot(data=df, x='BMI', hue='HeartDiseaseorAttack')

In [64]:
# Row counts per distinct BMI value, with one bar per Diabetes value.
# NOTE(review): countplot draws one group per distinct x value, so the axis
# gets crowded if BMI takes many different values.
sns.countplot(data=df, x='BMI', hue='Diabetes')

### Here we see how important healthcare coverage is and how it relates to a person's health, compared with people who do not have it.

In [65]:
# Row counts per AnyHealthcare value, with one bar per HeartDiseaseorAttack value.
sns.countplot(data=df, x='AnyHealthcare', hue='HeartDiseaseorAttack')

In [66]:
# Row counts per AnyHealthcare value, with one bar per Diabetes value.
sns.countplot(data=df, x='AnyHealthcare', hue='Diabetes')

In [67]:
# Row counts per AnyHealthcare value, with one bar per Stroke value.
sns.countplot(data=df, x='AnyHealthcare', hue='Stroke')

# Splitting data

In [68]:
# Target: the HeartDiseaseorAttack column.
y = df['HeartDiseaseorAttack']
# Features: every remaining column except Education and Income, which are
# dropped together with the target.
x = df.drop(columns=['HeartDiseaseorAttack', 'Education', 'Income'])

In [69]:
# Display the feature frame.
x

In [70]:
# Display the target series.
y

In [71]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split (and therefore the same metrics); stratify=y keeps the class
# proportions of the target identical in the train and test parts, which
# matters because the target is treated as imbalanced later in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)
# Class counts in the training part, before any resampling.
y_train.value_counts()

### **We are going to use SMOTE for oversampling. To learn more about SMOTE, you may find this link useful:**
- https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/


In [72]:
# Oversample the training data only, so the test set keeps its original
# class distribution.
# random_state makes the synthetic samples reproducible across runs.
sampler = SMOTE(random_state=42)
x_train, y_train = sampler.fit_resample(x_train, y_train)
# Class counts after resampling.
y_train.value_counts()

# Scaling & predicting

In [73]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, then apply the
# same fitted transform to both splits so no test-set information leaks in.
# NOTE(review): x_train is overwritten here, so re-running this cell alone
# rescales already-scaled data.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_train

In [74]:
# Classifiers to compare, keyed by the short label printed in the report.
# random_state is fixed on the estimators that are randomized (decision tree
# and random forest) so repeated runs give the same metrics.
models = {
    'LR': LogisticRegression(),
    # KNN is left disabled, as in the original notebook.
    #'KNN':KNeighborsClassifier(),
    'DT': DecisionTreeClassifier(random_state=42),
    # 'RT' is the label used for the random forest.
    'RT': RandomForestClassifier(random_state=42),
    'NB': GaussianNB(),
}

In [75]:
# Fit every classifier in `models` on the training data and print a report:
# train/test accuracy, confusion matrix, recall, precision, F1, F-beta
# (beta=0.5) and the full classification report, all scored on the test set
# except the training accuracy.
for name, model in models.items():
    print(f'using {name}: ')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Accuracy on the data the model was fitted on, next to held-out accuracy.
    train_accuracy = accuracy_score(y_train, model.predict(x_train))
    print(f'Training Accuracy :{train_accuracy}')
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Testing Accuracy :{test_accuracy}')

    test_confusion = confusion_matrix(y_test, y_pred)
    print(f'Confusion matrix:\n {test_confusion}')

    test_recall = recall_score(y_test, y_pred)
    print(f'Recall: {test_recall}')
    test_precision = precision_score(y_test, y_pred)
    print(f'precision: {test_precision}')
    test_f1 = f1_score(y_test, y_pred)
    print(f'F1-score: {test_f1}')
    # beta=0.5 weights precision more heavily than recall.
    test_fbeta = fbeta_score(y_test, y_pred, beta=0.5)
    print(f'Fbeta-score: {test_fbeta}')

    print(classification_report(y_test, y_pred))
    print('-' * 33)