In [28]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
df = pd.read_csv("heart_disease.csv")

df = df[0:899]

# Retain only the specified columns
columns_to_keep = ['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 
                   'smoke', 'fbs', 'prop', 'nitr', 'pro', 'diuretic', 
                   'thaldur', 'thalach', 'exang', 'oldpeak', 'slope', 'target']

df_subset = df[columns_to_keep]

## Replaces all values for painloc and painexer with the mode value for those columns
df_subset['painloc'] = df_subset['painloc'].fillna(df_subset['painloc'].mode()[0])
df_subset['painexer'] = df_subset['painexer'].fillna(df_subset['painexer'].mode()[0])

df_subset.loc[df_subset['trestbps'] < 100, 'trestbps'] = 100

df_subset.loc[df_subset['oldpeak'] < 0, 'oldpeak'] = 0
df_subset.loc[df_subset['oldpeak'] > 4, 'oldpeak'] = 4

mean_thaldur = round(df_subset['thaldur'].mean(), 1)
mean_thalach = round(df_subset['thalach'].mean(), 1)

df_subset['thaldur'].fillna(mean_thaldur, inplace=True)
df_subset['thalach'].fillna(mean_thalach, inplace=True)


mode_fbs = df_subset['fbs'].mode()[0]
mode_prop = df_subset['prop'].mode()[0]
mode_nitr = df_subset['nitr'].mode()[0]
mode_pro = df_subset['pro'].mode()[0]
mode_diuretic = df_subset['diuretic'].mode()[0]
mode_exang = df_subset['exang'].mode()[0]
mode_slope = df_subset['slope'].mode()[0]

df_subset['fbs'].fillna(mode_fbs, inplace=True)
df_subset['prop'].fillna(mode_prop, inplace=True)
df_subset['nitr'].fillna(mode_nitr, inplace=True)
df_subset['pro'].fillna(mode_pro, inplace=True)
df_subset['diuretic'].fillna(mode_diuretic, inplace=True)
df_subset['exang'].fillna(mode_exang, inplace=True)
df_subset['slope'].fillna(mode_slope, inplace=True)

df_subset.loc[df_subset['fbs'] > 1, 'fbs'] = mode_fbs
df_subset.loc[df_subset['prop'] > 1, 'prop'] = mode_prop
df_subset.loc[df_subset['nitr'] > 1, 'nitr'] = mode_nitr
df_subset.loc[df_subset['pro'] > 1, 'pro'] = mode_pro
df_subset.loc[df_subset['diuretic'] > 1, 'diuretic'] = mode_diuretic


subs_cols = ['trestbps', 'oldpeak', 'thaldur', 'thalach']
df_subs = df_subset[subs_cols]

skewness = df_subs.skew()

for col in df_subs.columns:
    if abs(skewness[col]) < 0.5:
        # If not skewed, replace missing values with mean
        df_subs[col].fillna(round(df_subs[col].mean(), 1), inplace=True)
    else:
        # If skewed, replace missing values with median
        df_subs[col].fillna(df_subs[col].median(), inplace=True)

df_subset[subs_cols] = df_subs

# Save the modified DataFrame to a new CSV file
df_subset.to_csv("heart_disease_subset.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['painloc'] = df_subset['painloc'].fillna(df_subset['painloc'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['painexer'] = df_subset['painexer'].fillna(df_subset['painexer'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['thaldur'].fillna(mean_thaldur, inplace=True)
A value is trying to be set on a copy

In [29]:
print(df.shape)

(899, 56)


In [2]:
import os
import requests
from scrapy import Selector
from table_parsing import parse_row, parse_table_as_df
from pathlib import Path

############ SETUP #############
DATA_FOLDER = Path('data/')
URL = 'https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking/latest-release'

if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

def get_selector_from_url(url:str) -> Selector:
    response = requests.get(url)
    return Selector(text=response.content)

############# CASE STUDY #############
selector = get_selector_from_url(URL)

# select the table containing smoking data by age
smoking_table = selector.xpath('//table[caption[contains(text(),"Proportion of people 15 years and over who were current daily smokers by age")]]')

if smoking_table:
    try:
        df = parse_table_as_df(smoking_table[0], header=True)
    except Exception as e:
        print(f"Error: {e}")
    else:
        # save as csv
        df.to_csv(DATA_FOLDER/'smoking_by_age.csv', index=False)
else:
    print("Smoking table not found on the webpage.")


ModuleNotFoundError: No module named 'scrapy'