In [1]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
df = pd.read_csv("heart_disease.csv")

df = df[0:899]

# Retain only the specified columns
columns_to_keep = ['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 
                   'smoke', 'fbs', 'prop', 'nitr', 'pro', 'diuretic', 
                   'thaldur', 'thalach', 'exang', 'oldpeak', 'slope', 'target']

df_subset = df[columns_to_keep]

## Replaces all values for painloc and painexer with the mode value for those columns
df_subset['painloc'] = df_subset['painloc'].fillna(df_subset['painloc'].mode()[0])
df_subset['painexer'] = df_subset['painexer'].fillna(df_subset['painexer'].mode()[0])

## Values < 100 are replaced with 100
df_subset.loc[df_subset['trestbps'] < 100, 'trestbps'] = 100

## Values less than 0 are replaced with 0, those greater than 4 are replaced with 4
df_subset.loc[df_subset['oldpeak'] < 0, 'oldpeak'] = 0
df_subset.loc[df_subset['oldpeak'] > 4, 'oldpeak'] = 4

## Filling missing values with the mean
mean_thaldur = round(df_subset['thaldur'].mean(), 1)
mean_thalach = round(df_subset['thalach'].mean(), 1)

df_subset['thaldur'].fillna(mean_thaldur, inplace=True)
df_subset['thalach'].fillna(mean_thalach, inplace=True)

## Filling missing values with the mode value
mode_fbs = df_subset['fbs'].mode()[0]
mode_prop = df_subset['prop'].mode()[0]
mode_nitr = df_subset['nitr'].mode()[0]
mode_pro = df_subset['pro'].mode()[0]
mode_diuretic = df_subset['diuretic'].mode()[0]
mode_exang = df_subset['exang'].mode()[0]
mode_slope = df_subset['slope'].mode()[0]

df_subset['fbs'].fillna(mode_fbs, inplace=True)
df_subset['prop'].fillna(mode_prop, inplace=True)
df_subset['nitr'].fillna(mode_nitr, inplace=True)
df_subset['pro'].fillna(mode_pro, inplace=True)
df_subset['diuretic'].fillna(mode_diuretic, inplace=True)
df_subset['exang'].fillna(mode_exang, inplace=True)
df_subset['slope'].fillna(mode_slope, inplace=True)

## Also replaces values greater than 1 with the mode for that column
df_subset.loc[df_subset['fbs'] > 1, 'fbs'] = mode_fbs
df_subset.loc[df_subset['prop'] > 1, 'prop'] = mode_prop
df_subset.loc[df_subset['nitr'] > 1, 'nitr'] = mode_nitr
df_subset.loc[df_subset['pro'] > 1, 'pro'] = mode_pro
df_subset.loc[df_subset['diuretic'] > 1, 'diuretic'] = mode_diuretic

## These columns are checked for skewness. If they appear to be skewed, the missing values are filled with
## the median. If not skewed, the missing values are filled with the mean. 
subs_cols = ['trestbps', 'oldpeak', 'thaldur', 'thalach']
df_subs = df_subset[subs_cols]

skewness = df_subs.skew()

for col in df_subs.columns:
    if abs(skewness[col]) < 0.5:
        # If not skewed, replace missing values with mean
        df_subs[col].fillna(round(df_subs[col].mean(), 1), inplace=True)
    else:
        # If skewed, replace missing values with median
        df_subs[col].fillna(df_subs[col].median(), inplace=True)

df_subset[subs_cols] = df_subs

# Save the modified DataFrame to a new CSV file
df_subset.to_csv("heart_disease_subset.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['painloc'] = df_subset['painloc'].fillna(df_subset['painloc'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['painexer'] = df_subset['painexer'].fillna(df_subset['painexer'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['thaldur'].fillna(mean_thaldur, inplace=True)
A value is trying to be set on a copy

In [29]:
print(df.shape)

(899, 56)


In [1]:
!pip install scrapy

Collecting scrapy
  Using cached Scrapy-2.11.1-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Using cached twisted-24.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Using cached cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Using cached itemloaders-1.2.0-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Using cached parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Using cached queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Using cached service_identity-24.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Using cached w3lib-2.1.2-py3-none-any.whl.metadata (1.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Using cached zope.interface-6.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [3]:
import os
import requests
from scrapy import Selector
from pathlib import Path
import re
import pandas as pd
from typing import List

DATA_FOLDER = Path('data/')
URL = 'https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking/latest-release'

if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

def get_selector_from_url(url:str) -> Selector:
    response = requests.get(url)
    return Selector(text=response.content)

def parse_row(row:Selector) -> List[str]:
    '''
    Parses a html row into a list of individual elements
    '''
    cells = row.xpath('.//th | .//td')
    row_data = []
    
    for cell in cells:
        cell_text = cell.xpath('normalize-space(.)').get()
        cell_text = re.sub(r'<.*?>', ' ', cell_text)  # Remove remaining HTML tags
        # if there are br tags, there will be some binary characters
        cell_text = cell_text.replace('\xa0', '')  # Remove \xa0 characters
        row_data.append(cell_text)
    
    return row_data

def parse_table_as_df(table_sel:Selector,header:bool=True) -> pd.DataFrame:
    '''
    Parses a html table and returns it as a Pandas DataFrame
    '''
    # extract rows
    rows = table_sel.xpath('./tbody//tr')
    
    # parse header and the remaining rows
    columns = None
    start_row = 0
    if header:
        columns = parse_row(rows[0])
        
    table_data = [parse_row(row) for row in rows[start_row:]]
    
    # return data frame
    return pd.DataFrame(table_data,columns=columns)

selector = get_selector_from_url(URL)

# select the table containing smoking data by age
smoking_table = selector.xpath('//table[caption[contains(text(),"Proportion of people 15 years and over who were current daily smokers by age")]]')

if smoking_table:
    try:
        df = parse_table_as_df(smoking_table[0], header=True)
    except Exception as e:
        print(f"Error: {e}")
    else:
        
        # Print the data rows
        for index, row in df.iterrows():
            print(row.iloc[0], row.iloc[10])
else:
    print("Smoking table not found on the webpage.")


15–17 1.6
18–24 7.3
25–34 10.9
35–44 10.9
45–54 13.8
55–64 14.9
65–74 8.7
75 years and over 2.9


In [4]:
import os
import requests
from scrapy import Selector
from pathlib import Path

DATA_FOLDER = Path('data/')
URL = 'https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm'

if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

def get_selector_from_url(url:str) -> Selector:
    response = requests.get(url)
    return Selector(text=response.content)

selector = get_selector_from_url(URL)

# Select the paragraphs containing the desired text
paragraphsM = selector.xpath("//li[contains(text(), 'adult men')]/text()").get()
paragraphsF = selector.xpath("//li[contains(text(), 'adult women')]/text()").get()

print(paragraphsM.strip())
print(paragraphsF.strip())

uls = selector.xpath("//div[h4[contains(text(), 'By Age')]]/following-sibling::div//ul/li/text()").getall()

for line in uls:
    print(line.strip())

About 13 of every 100 adult men (13.1%)
About 10 of every 100 adult women (10.1%)
About 5 of every 100 adults aged 18–24 years (5.3%)
Nearly 13 of every 100 adults aged 25–44 years (12.6%)
Nearly 15 of every 100 adults aged 45–64 years (14.9%)
About 8 of every 100 adults aged 65 years and older (8.3%)


In [2]:

## Source 1 smoke column imputation

def get_smoking_percentage(age: int) -> float:
    # Define the smoking percentage based on the age range
    if 15 <= age <= 17:
        return .016
    elif 18 <= age <= 24:
        return .073
    elif 25 <= age <= 34:
        return .109
    elif 35 <= age <= 44:
        return .109
    elif 45 <= age <= 54:
        return .138
    elif 55 <= age <= 64:
        return .149
    elif 65 <= age <= 74:
        return .087
    elif age >= 75:
        return .029
    else:
        return None  # Return None for unknown age ranges


# Convert the 'age' column to integer type
df_subset['age'] = df_subset['age'].astype(int)

# Apply the function to create a new column 'smoking_src1' with the updated values
df_subset['smoking_src1'] = df_subset.apply(lambda row: row['smoke'] if row['smoke'] in [0, 1] else get_smoking_percentage(row['age']), axis=1)

# Save the modified DataFrame to the same CSV file, overwriting the existing file
df_subset.to_csv('heart_disease_subset.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['age'] = df_subset['age'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['smoking_src1'] = df_subset.apply(lambda row: row['smoke'] if row['smoke'] in [0, 1] else get_smoking_percentage(row['age']), axis=1)


In [3]:

## Source 2 smoke column imputation
def get_smoking_percentage(age: int, sex: int) -> float:
    if sex == 0:  # Female
        if 18 <= age <= 24:
            return .053
        elif 25 <= age <= 44:
            return .126
        elif 45 <= age <= 64:
            return .149
        elif age >= 65:
            return .083
        
    elif sex == 1:
        if 18 <= age <= 24:
            return round(.053 * (.131 / .101), 3)
        elif 25 <= age <= 44:
            return round(.126 * (.131 / .101), 3)
        elif 45 <= age <= 64:
            return round(.149 * (.131 / .101), 3)
        elif age >= 65:
            return round(.083 * (.131 / .101), 3)
        
    return None


df_subset['sex'] = df_subset['sex'].astype(int)

# Apply the function to create a new column 'smoke_src2' with the updated values
df_subset['smoke_src2'] = df_subset.apply(lambda row: row['smoke'] if row['smoke'] in [0, 1] else get_smoking_percentage(row['age'], row['sex']), axis=1)

# Save the modified DataFrame to the same CSV file, overwriting the existing file
df_subset.to_csv('heart_disease_subset.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['sex'] = df_subset['sex'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['smoke_src2'] = df_subset.apply(lambda row: row['smoke'] if row['smoke'] in [0, 1] else get_smoking_percentage(row['age'], row['sex']), axis=1)


In [4]:
# Remove the 'smoke' column
df_subset.drop(columns=['smoke'], inplace=True)

# Save the modified DataFrame to the same CSV file, overwriting the existing file
df_subset.to_csv('heart_disease_subset.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.drop(columns=['smoke'], inplace=True)


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Step 1: Split the data into training and test sets
X = df_subset.drop(columns=['target'])  # Features
y = df_subset['target']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

# Step 2: Train binary classification models
# Example models: Logistic Regression, Random Forest, and Support Vector Machine (SVM)
logistic_model = LogisticRegression(max_iter=1000)
random_forest_model = RandomForestClassifier(max_depth=100)
svm_model = SVC(C = 1000)

logistic_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)

# Step 3: Evaluate the performance of the models on the test data
logistic_pred = logistic_model.predict(X_test)
random_forest_pred = random_forest_model.predict(X_test)
svm_pred = svm_model.predict(X_test)

# Evaluate accuracy
logistic_accuracy = accuracy_score(y_test, logistic_pred)
random_forest_accuracy = accuracy_score(y_test, random_forest_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)

# Classification report
logistic_report = classification_report(y_test, logistic_pred)
random_forest_report = classification_report(y_test, random_forest_pred)
svm_report = classification_report(y_test, svm_pred)

# Perform 5-fold cross-validation on logistic regression model
logistic_cv_scores = cross_val_score(logistic_model, X_train, y_train, cv=5)

# Perform 5-fold cross-validation on random forest model
random_forest_cv_scores = cross_val_score(random_forest_model, X_train, y_train, cv=5)

# Perform 5-fold cross-validation on SVM model
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, cv=5)

# Calculate mean cross-validation scores
logistic_mean_cv_score = logistic_cv_scores.mean()
random_forest_mean_cv_score = random_forest_cv_scores.mean()
svm_mean_cv_score = svm_cv_scores.mean()

# Print results
print("Logistic Regression Model Accuracy:", logistic_accuracy)
print("Logistic Regression Model Classification Report:\n", logistic_report)

print("\nRandom Forest Model Accuracy:", random_forest_accuracy)
print("Random Forest Model Classification Report:\n", random_forest_report)

print("\nSVM Model Accuracy:", svm_accuracy)
print("SVM Model Classification Report:\n", svm_report)

# Print mean cross-validation scores
print("Logistic Regression Model Mean Cross-Validation Score:", logistic_mean_cv_score)
print("Random Forest Model Mean Cross-Validation Score:", random_forest_mean_cv_score)
print("SVM Model Mean Cross-Validation Score:", svm_mean_cv_score)



### ANALYSIS
# Based on the results, I would choose the random forest model. Although all three models have similar scores for accuracy, precision, recall, and f1-score, the cross-validation analysis shows that the random forest model is slightly better suited to unseen data compared to the other models. While its accuracy is slightly less than the other models, I would trust the random forest model over the others when it comes to testing on new data.


# Running this code block has made my instance freeze many times. This may be due to the resource allocation for this t2 instance. If it fails to run, this is what the output looks like when run on my local pc:

#Logistic Regression Model Accuracy: 0.7444444444444445
#Logistic Regression Model Classification Report:
#               precision    recall  f1-score   support
#
#         0.0       0.72      0.70      0.71        40
#         1.0       0.76      0.78      0.77        50
#
#    accuracy                           0.74        90
#   macro avg       0.74      0.74      0.74        90
#weighted avg       0.74      0.74      0.74        90
#
#
#Random Forest Model Accuracy: 0.7333333333333333
#Random Forest Model Classification Report:
#               precision    recall  f1-score   support
#
#         0.0       0.70      0.70      0.70        40
#         1.0       0.76      0.76      0.76        50
#
#    accuracy                           0.73        90
#   macro avg       0.73      0.73      0.73        90
#weighted avg       0.73      0.73      0.73        90
#
#
#SVM Model Accuracy: 0.7333333333333333
#SVM Model Classification Report:
#               precision    recall  f1-score   support
#
#         0.0       0.72      0.65      0.68        40
#         1.0       0.74      0.80      0.77        50
#
#    accuracy                           0.73        90
#   macro avg       0.73      0.73      0.73        90
#weighted avg       0.73      0.73      0.73        90
#
#Logistic Regression Model Mean Cross-Validation Score: 0.8059658001686987
#Random Forest Model Mean Cross-Validation Score: 0.8109347442680775
#SVM Model Mean Cross-Validation Score: 0.8034813281190093

NameError: name 'df_subset' is not defined