In [None]:
import sqlite3
import pandas as pd
import os

# Create a directory for data
db_directory = "heart_data"

# Make sure the "heart_data" directory exists (create it if it does not)
os.makedirs(db_directory, exist_ok=True)

# Path to SQLite DB file
db_path = os.path.join(db_directory, 'heart_disease.db')

# Read CSV file; the data will not be parsed correctly without the ";" delimiter
#csv_file_path = r'C:\Users\Phillip\Desktop\ITDAA4-12 Project DV38MC178\heart.csv' (laptop location)
csv_file_path = r'C:\Users\User\Desktop\ITDAA4-12 Project DV38MC178\heart.csv'

df = pd.read_csv(csv_file_path, delimiter=';')

# Show the first few rows and the dtypes to verify the file was read correctly
print("DataFrame structure after reading CSV:")
print(df.head())
print("DataFrame columns and data types after reading CSV:")
print(df.dtypes)

# Create and connect to the DB. The try/finally guarantees the connection is
# closed even when writing or reading back the table raises.
conn = sqlite3.connect(db_path)
try:
    # Write the data frame to the 'heart_disease' table, replacing any existing table
    df.to_sql('heart_disease', conn, if_exists='replace', index=False)

    # Read the table back so the stored data can be inspected
    df_sql = pd.read_sql('SELECT * FROM heart_disease', conn)
finally:
    conn.close()

# Check data in the database is formatted properly
print("Data read from the database:")
print(df_sql.head())


In [None]:
import sqlite3
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Connect to the SQLite database and read the table into a DataFrame.
# The connection is only needed for the read, so it is closed right away
# (and also when the read fails) instead of staying open for the whole cell.
conn = sqlite3.connect('heart_data/heart_disease.db')
try:
    df = pd.read_sql('SELECT * FROM heart_disease', conn)
finally:
    conn.close()

# Show original values to compare
print("Original values:")
print(df[['age', 'trestbps', 'chol', 'thalach', 'oldpeak']].head())

# Find potential missing values in each column
print("Checking for missing values in each column:")
print(df.isnull().sum())

# Fill missing values with the mean of each column (if there are missing values).
# numeric_only=True keeps this working if the table ever contains a non-numeric
# column, for which a mean cannot be computed.
print("Filling missing values with the mean (if any).")
df = df.fillna(df.mean(numeric_only=True))

# Encode categorical variables using one-hot encoding
print("Applying one-hot encoding to categorical variables.")
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
print(df[categorical_vars].head())
df = pd.get_dummies(df, columns=categorical_vars)

# Normalize numeric variables in range of 0 to 1
print("Normalizing numeric variables:")
scaler = MinMaxScaler()
numeric_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df[numeric_vars] = scaler.fit_transform(df[numeric_vars])

# Show the first few rows of the cleaned and preprocessed data frame
print("Displaying the first few rows of the preprocessed data frame:")
print(df.head())


In [None]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt 

# Connect to the DB and read the data into the data frame.
# try/finally releases the connection even if the query fails, and the
# connection is not kept open while the charts are drawn.
conn = sqlite3.connect("heart_data/heart_disease.db")
try:
    df = pd.read_sql('SELECT * FROM heart_disease', conn)
finally:
    conn.close()

# List of the categorical variables
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Create one bar chart per categorical variable showing how often each value occurs
for var in categorical_vars:
    counts = df[var].value_counts()
    plt.figure(figsize=(10,5))
    plt.bar(counts.index, counts.values, color='green')
    # Put a tick on every observed category value; otherwise matplotlib picks
    # automatic tick positions on the numeric axis, which need not match the codes
    plt.xticks(counts.index)
    plt.xlabel(var)
    plt.ylabel('Frequency')
    plt.title(f'Frequency of {var}')
    plt.show()

In [None]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

# Connect to the SQLite database and read the data into the DataFrame.
# The connection is closed in a finally clause so it is released even when
# the read fails, and it is not held open during plotting.
conn = sqlite3.connect('heart_data/heart_disease.db')
try:
    df = pd.read_sql('SELECT * FROM heart_disease', conn)
finally:
    conn.close()

# List of categorical variables
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Plot the distribution of classes for each categorical variable based on the target variable
for var in categorical_vars:
    # Rows: values of `var`; columns: values of 'target'; cells: row counts
    counts = df.groupby(var)['target'].value_counts().unstack()
    counts.plot(kind='bar', stacked=True, figsize=(10, 5), color=['red', 'green'])
    plt.xlabel(var)
    plt.ylabel('Number of people in category')
    plt.title(f'Distribution of {var} based on target variable')
    plt.xticks(rotation=0)
    plt.legend(title='target', loc='upper right')
    plt.show()


In [None]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

# Connect to the SQLite database and read the data into the DataFrame.
# The connection is closed in a finally clause so it is released even when
# the read fails, and it is not held open during plotting.
conn = sqlite3.connect('heart_data/heart_disease.db')
try:
    df = pd.read_sql('SELECT * FROM heart_disease', conn)
finally:
    conn.close()

# Numeric variables to plot
numeric_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Draw one figure per numeric variable with two overlaid histograms (one per target value)
for var in numeric_vars:
    plt.figure(figsize=(10, 5))

    # Use the same value range for both histograms so their 15 bins share
    # identical edges; with independent ranges the bars would not line up and
    # the two groups could not be compared bin by bin.
    value_range = (df[var].min(), df[var].max())

    # Plot histogram for target = 0 (labelled 'No Heart Disease')
    # alpha is the level of transparency
    # bins is the number of intervals (level of detail the histogram will show)
    plt.hist(df[df['target'] == 0][var], bins=15, range=value_range, alpha=0.5,
             label='No Heart Disease', color='red')

    # Plot histogram for target = 1 (labelled 'Heart Disease')
    plt.hist(df[df['target'] == 1][var], bins=15, range=value_range, alpha=0.5,
             label='Heart Disease', color='green')

    plt.xlabel(var)  # x-axis is named after the variable being plotted
    plt.ylabel('Frequency')  # y-axis shows how often values fall in each bin
    plt.title(f'Distribution of {var} based on target variable')
    plt.legend()  # legend distinguishes the two target groups
    plt.show()  # outputs the graph


In [None]:
# Count the missing entries in every column of the current data frame and show the totals
missing_values = df.isna().sum()
print(missing_values)


In [None]:
# Helper that finds outliers using the interquartile range (IQR) method
def identify_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df[column]``. Values exactly on a fence are
    not reported.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return df[too_low | too_high]

# Report the IQR outliers of every numerical feature, one feature at a time
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for feature in numerical_features:
    outliers = identify_outliers(df, feature)
    # Heading and rows are emitted on separate lines
    print(f"Outliers in {feature}:", outliers, sep="\n")


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Helper that caps outliers using the IQR method
def cap_outliers(df, column):
    """Clamp ``df[column]`` to the IQR fences, modifying ``df`` in place.

    Values below Q1 - 1.5 * IQR are replaced by that lower fence and values
    above Q3 + 1.5 * IQR by that upper fence; everything else is kept.
    Nothing is returned.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    floor, ceiling = q1 - whisker, q3 + whisker

    # Single pass: low values -> floor, high values -> ceiling, rest unchanged
    df[column] = np.where(values < floor, floor,
                          np.where(values > ceiling, ceiling, values))

# Column groups used by the preprocessing steps below
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Apply the capping method to the required columns (modifies df in place)
for feature in numerical_features:
    cap_outliers(df, feature)

# Standardize the numerical features after capping outliers:
# fit the scaler on those columns, then overwrite them with the scaled values
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# One-hot encode the categorical variables, dropping the first level of each
df = pd.get_dummies(df, columns=categorical_vars, drop_first=True)

# Check the data has been processed correctly
print(df.head())


In [None]:
import sqlite3
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import numpy as np

# Output directory (created if missing) and the SQLite path inside it
db_directory = "heart_data"
os.makedirs(db_directory, exist_ok=True)
db_path = os.path.join(db_directory, 'heart_disease.db')

# Location of the semicolon-separated source file
#csv_file_path = r'C:\Users\Phillip\Desktop\ITDAA4-12 Project DV38MC178\heart.csv' for laptop
csv_file_path = r'C:\Users\User\Desktop\ITDAA4-12 Project DV38MC178\heart.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')

# Show a preview and the column dtypes, each under its own heading
print("DataFrame structure after reading CSV:", df.head(), sep="\n")
print("DataFrame columns and data types after reading CSV:", df.dtypes, sep="\n")

# Count missing values per column and report them
missing_values = df.isna().sum()
print("Checking for missing values in each column:", missing_values, sep="\n")

# Cap outliers with the IQR method
def cap_outliers(df, column):
    """Clamp ``df[column]`` to [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] and return ``df``.

    The column is overwritten on the frame that was passed in; the same frame
    object is returned so the call can be used in an assignment.
    """
    series = df[column]
    q_low, q_high = series.quantile(0.25), series.quantile(0.75)
    reach = 1.5 * (q_high - q_low)
    lower_bound = q_low - reach
    upper_bound = q_high + reach

    # High values go to the upper bound, low values to the lower bound
    capped = np.where(series > upper_bound, upper_bound,
                      np.where(series < lower_bound, lower_bound, series))
    df[column] = capped
    return df

# Cap outliers in every numerical feature.
# NOTE(review): the IQR bounds are computed from all rows, i.e. before the
# train/test split; consider deriving them from the training rows only.
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for feature in numerical_features:
    df = cap_outliers(df, feature)

# Encode categorical variables (done on the full frame so that the training
# and test sets end up with the same dummy columns)
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df = pd.get_dummies(df, columns=categorical_vars, drop_first=True)

print("Check the data has been processed correctly")
print(df.head())

# Split data into training and testing sets
# 'X' contains all features, 'y' is the target variable
X = df.drop(columns='target')
y = df['target']

# The test size splits the data into 20% test data and 80% training data
# Random state ensures that same results can be replicated
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features.
# The scaler is fitted on the training rows only and then applied to the test
# rows; fitting it on the full data set before splitting would leak test-set
# statistics into training. Explicit copies avoid writing into views of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Train Logistic Regression model
# max_iter=75 caps the number of solver iterations
model = LogisticRegression(max_iter=75)

# Fit the model on training data
model.fit(X_train, y_train)

# Test model
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Create classification report
class_report = classification_report(y_test, y_pred)

# Output the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Save the model to disk
# Set the location where the model will be saved
model_path = os.path.join(db_directory, 'heart_disease_model.pkl')

# Save the model just trained
joblib.dump(model, model_path)

# Confirm where the model has been saved
print(f'Model saved to {model_path}')


In [None]:
# Import needed libraries
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Make sure the output directory for this project exists
db_directory = "heart_data"
os.makedirs(db_directory, exist_ok=True)

# Semicolon-separated source file
csv_file_path = r'C:\Users\User\Desktop\ITDAA4-12 Project DV38MC178\heart.csv'

# Load the file into a DataFrame
df = pd.read_csv(csv_file_path, sep=';')

# Preview the first rows, then list each column's dtype
print("DataFrame structure after reading CSV:", df.head(), sep="\n")
print("DataFrame columns and data types after reading CSV:", df.dtypes, sep="\n")

# Per-column count of missing values
missing_values = df.isna().sum()
print("Checking for missing values in each column:", missing_values, sep="\n")

# Identify outliers using the IQR method
def identify_outliers(df, column):
    """Select the rows of ``df`` where ``column`` falls outside the IQR fences.

    A value is selected when it is strictly below Q1 - 1.5 * IQR or strictly
    above Q3 + 1.5 * IQR, with Q1/Q3 the 0.25/0.75 quantiles of the column.
    """
    series = df[column]
    q1, q3 = (series.quantile(q) for q in (0.25, 0.75))
    fence = 1.5 * (q3 - q1)
    mask = series.lt(q1 - fence) | series.gt(q3 + fence)
    return df.loc[mask]

# Print the IQR outliers found in each numerical feature
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for feature in numerical_features:
    outliers = identify_outliers(df, feature)
    # Heading first, then the matching rows on the following lines
    print(f"Outliers in {feature}:", outliers, sep="\n")

# Cap outliers using the IQR method
def cap_outliers(df, column):
    """Limit ``df[column]`` to the IQR fences and hand back the same frame.

    Values under Q1 - 1.5 * IQR are raised to that fence, values over
    Q3 + 1.5 * IQR are lowered to that fence. The column is replaced on ``df``
    itself, and ``df`` is returned.
    """
    data = df[column]
    first_q = data.quantile(0.25)
    third_q = data.quantile(0.75)
    spread = third_q - first_q
    low_fence = first_q - 1.5 * spread
    high_fence = third_q + 1.5 * spread

    # Two-stage clamp: raise the low tail first, then lower the high tail
    raised = np.where(data < low_fence, low_fence, data)
    df[column] = np.where(raised > high_fence, high_fence, raised)
    return df

# Apply capping method to required columns.
# NOTE(review): the IQR bounds are computed from all rows, i.e. before the
# train/test split; consider deriving them from the training rows only.
for feature in numerical_features:
    df = cap_outliers(df, feature)

# Encoding categorical variables (done on the full frame so that the training
# and test sets end up with the same dummy columns)
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df = pd.get_dummies(df, columns=categorical_vars, drop_first=True)

# Check the processed data
print("Check the data has been processed correctly:")
print(df.head())

# Split data into training and testing sets (80% train / 20% test, fixed seed)
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features after capping outliers.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into the training data. Explicit copies
# avoid writing into views of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Train decision tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Evaluate the model performance on the held-out test set
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Save decision tree model to disk
model_path = os.path.join(db_directory, 'heart_disease_dt_model.pkl')
joblib.dump(dt_model, model_path)
print(f'Model saved to {model_path}')


In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Data directory used by this notebook; created on first use
db_directory = "heart_data"
os.makedirs(db_directory, exist_ok=True)

# Where the raw CSV lives (fields are separated by ';')
csv_file_path = r'C:\Users\User\Desktop\ITDAA4-12 Project DV38MC178\heart.csv'

# Parse the CSV
df = pd.read_csv(csv_file_path, sep=';')

# Sanity check: first rows followed by the dtype of every column
print("DataFrame structure after reading CSV:", df.head(), sep="\n")
print("DataFrame columns and data types after reading CSV:", df.dtypes, sep="\n")

# How many values are missing in each column
missing_values = df.isna().sum()
print("Checking for missing values in each column:", missing_values, sep="\n")

# Identify outliers using the IQR method
def identify_outliers(df, column):
    """Return the subset of ``df`` whose ``column`` value is an IQR outlier.

    Outliers are values strictly less than Q1 - 1.5 * IQR or strictly greater
    than Q3 + 1.5 * IQR, where Q1 and Q3 are the column's 0.25 and 0.75 quantiles.
    """
    lower_q = df[column].quantile(0.25)
    upper_q = df[column].quantile(0.75)
    margin = 1.5 * (upper_q - lower_q)

    below = df[column] < (lower_q - margin)
    above = df[column] > (upper_q + margin)
    return df[below | above]

# List the IQR outliers for every numerical feature
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for feature in numerical_features:
    outliers = identify_outliers(df, feature)
    # One heading line per feature, followed by the outlier rows
    print(f"Outliers in {feature}:", outliers, sep="\n")

# Cap outliers using the IQR method
def cap_outliers(df, column):
    """Replace IQR outliers in ``df[column]`` by the nearest fence; return ``df``.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. The column is rewritten
    on the frame passed in, which is also the return value.
    """
    col = df[column]
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    step = 1.5 * (q3 - q1)
    lo, hi = q1 - step, q3 + step

    below, above = col < lo, col > hi
    # Values flagged as below take the lower fence, those above take the upper fence
    df[column] = np.where(below, lo, np.where(above, hi, col))
    return df

# Apply capping method to required columns.
# NOTE(review): the IQR bounds are computed from all rows, i.e. before the
# train/test split; consider deriving them from the training rows only.
for feature in numerical_features:
    df = cap_outliers(df, feature)

# Encoding categorical variables (done on the full frame so that the training
# and test sets end up with the same dummy columns)
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df = pd.get_dummies(df, columns=categorical_vars, drop_first=True)

# Check the processed data
print("Check the data has been processed correctly:")
print(df.head())

# Split data into training and testing sets (80% train / 20% test, fixed seed)
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features after capping outliers.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into the training data. Explicit copies
# avoid writing into views of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Test model and evaluate on the held-out test set
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Save the model to disk
model_path = os.path.join(db_directory, 'heart_disease_rf_model.pkl')
joblib.dump(rf_model, model_path)
print(f'Model saved to {model_path}')


DataFrame structure after reading CSV:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  
DataFrame columns and data types after reading CSV:
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: obj