## Task 1: Import Modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

import warnings
# Silence every warning category for the rest of the session so library
# warnings do not clutter the cell output.
# NOTE(review): this also hides deprecation/runtime warnings that may matter
# while debugging -- consider narrowing the filter.
warnings.filterwarnings('ignore')

## Task 2: Load the Dataset

In [None]:
# Read the CSV file into the DataFrame `df`, decoding it as ISO-8859-1.
csv_path = '/usercode/CVD_cleaned.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Keep the preview as the final expression so the notebook renders the first rows.
df.head()

## Task 3: Create the Pairplot

In [None]:
# Build a grid of pairwise axes over df: histograms on the diagonal and
# scatter plots in every off-diagonal cell.
g = sns.PairGrid(df)
# The map_* methods return the grid itself, so the two calls can be chained;
# the chained call stays the cell's final expression, as before.
g.map_diag(sns.histplot).map_offdiag(sns.scatterplot)

## Task 4: Plot the Distribution of Categorical Features

In [None]:
# Draw one histogram per object-dtype (categorical) column of df on a shared figure.
categorical_features = df.columns[df.dtypes =='object']

# Derive the row count from the number of columns instead of assuming it:
# plt.subplot raises ValueError once the index exceeds rows * cols. The floor
# of 3 rows keeps the layout identical to the fixed 3x4 grid for up to 12 columns.
n_cols = 4
n_rows = max(3, -(-len(categorical_features) // n_cols))  # ceiling division

plt.figure(figsize=(20,10))
for plot_num, col in enumerate(categorical_features, start=1):
    ax = plt.subplot(n_rows, n_cols, plot_num)
    # NOTE(review): kde=True on string categories smooths over category
    # positions rather than a numeric scale -- confirm the curve is wanted.
    sns.histplot(df[col], kde=True, ax=ax)

plt.tight_layout()

## Task 5: Plot the Distribution of Numerical Features

In [None]:
# Draw one histogram (with a KDE curve) per non-object column of df on a shared figure.
numerical_features = df.columns[df.dtypes !='object']

# Derive the row count from the number of columns instead of assuming it:
# plt.subplot raises ValueError once the index exceeds rows * cols. The floor
# of 3 rows keeps the layout identical to the fixed 3x3 grid for up to 9 columns.
n_cols = 3
n_rows = max(3, -(-len(numerical_features) // n_cols))  # ceiling division

plt.figure(figsize=(20,10))
for plot_num, col in enumerate(numerical_features, start=1):
    ax = plt.subplot(n_rows, n_cols, plot_num)
    sns.histplot(df[col], kde=True, ax=ax)

plt.tight_layout()

## Task 6: Plot the Relation of Factors with Diseases

In [None]:
# Count plots of selected factors, with bars split by the Heart_Disease column.
disease = 'Heart_Disease'
selected_variables = [
    'General_Health',
    'Exercise',
    'Sex',
    'Age_Category',
    'Smoking_History',
    'Depression',
]

plt.figure(figsize=(20,10))
# Subplot positions are 1-based, so start the enumeration at 1.
for position, variable in enumerate(selected_variables, start=1):
    ax = plt.subplot(3, 3, position)
    sns.countplot(data=df, x=variable, hue=disease)
plt.tight_layout()

## Task 7: Transform the Categorical Columns

In [None]:
# Integer-encode every object-dtype column on a copy, leaving df untouched.
data = df.copy()
categorical_features = df.columns[df.dtypes =='object']
le = LabelEncoder()

# `le` is refit on each column, so after the loop its classes_ only describe
# the last column. Record each column's {original label: integer code}
# mapping as we go so encoded values can still be decoded afterwards.
label_mappings = {}
for i in categorical_features:
    data[i] = le.fit_transform(data[i])
    label_mappings[i] = {label: code for code, label in enumerate(le.classes_)}

# Keep the preview as the final expression so the notebook renders it.
data.head()

## Task 8: Split the Training and Testing Dataset

In [None]:
# Separate the target column from the feature columns, then hold out 30% of
# the rows for testing using a fixed seed.
target_column = 'Heart_Disease'
y = data[target_column]
X = data.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

## Task 9: Build the Classifier

In [None]:
# Random forest with 500 trees.
# random_state pins the bootstrap/feature sampling so repeated runs give the
# same model (the train/test split in this notebook is seeded with 5 as well);
# n_jobs=-1 builds the trees on all available cores without changing results.
classifier = RandomForestClassifier(n_estimators=500, random_state=5, n_jobs=-1)

## Task 10: Train the Classifier

In [None]:
# Fit the classifier on the training split; the call remains the cell's final
# expression so the notebook shows the fitted estimator.
classifier.fit(X=X_train, y=y_train)

## Task 11: Get Predictions

In [None]:
# Per-class probabilities for the test rows (one column per class).
prediction_probability = classifier.predict_proba(X_test)
# Predicted class labels for the same rows.
predictions = classifier.predict(X_test)

## Task 12: Print Confusion Matrix and Accuracy

In [None]:
# Evaluate the test-set predictions.
# The confusion matrix and accuracy are what this task's heading asks for;
# the classification report and ROC AUC are printed as before.
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))
print('\n')
print("Accuracy: ", accuracy_score(y_test, predictions))
print('\n')
print("Report: \n", classification_report(y_test, predictions))
print('\n')
# ROC AUC is scored on the second probability column.
# NOTE(review): assumes a binary target whose positive class is column 1 -- confirm.
print("Roc Score: ", roc_auc_score(y_test, prediction_probability[:,1]))