In [1]:
import os
import pandas as pd

In [2]:
# Define the directory structure
project_folder = "project4/proj4_Eric"
data_folder = "data/raw data"

# List of file names
file_names = [
    'processed.cleveland.data',
    'processed.hungarian.data',
    'processed.switzerland.data',
    'processed.va.data',
]


# Construct the base path using the current script's directory
base_path = os.path.join(
    r"C:\Users\ric pc\Desktop\Boot Camp Git\GitHub\GitHub Portfolio", 
    project_folder, 
    data_folder
)

# Now, you can access your files using the base_path
for file_name in file_names:
    file_path = os.path.join(base_path, file_name)
    # Use file_path for your operations
    print(file_path)

C:\Users\ric pc\Desktop\Boot Camp Git\GitHub\GitHub Portfolio\project4/proj4_Eric\data/raw data\processed.cleveland.data
C:\Users\ric pc\Desktop\Boot Camp Git\GitHub\GitHub Portfolio\project4/proj4_Eric\data/raw data\processed.hungarian.data
C:\Users\ric pc\Desktop\Boot Camp Git\GitHub\GitHub Portfolio\project4/proj4_Eric\data/raw data\processed.switzerland.data
C:\Users\ric pc\Desktop\Boot Camp Git\GitHub\GitHub Portfolio\project4/proj4_Eric\data/raw data\processed.va.data


In [3]:
data_frames = {}  # Dictionary to store dataframes

for file_name in file_names:
    try:
        file_path = os.path.join(base_path, file_name)
        df = pd.read_csv(file_path, sep=',', header=None, encoding='ISO-8859-1')
        
        # Remove columns 'NumMajorVessels' and 'Thal'
        df.drop(columns=[11, 12], inplace=True)
        
        # Handle missing values ('?') here if necessary
        df.replace('?', pd.NA, inplace=True)
        
        # Define appropriate data types for each column
        data_types = {
            0: 'int64',   # Age
            1: 'int64',     # Sex
            2: 'int64',     # ChestPainType
            3: 'int64',   # RestingBP
            4: 'int64',   # Cholesterol
            5: 'int64',     # FastingBS
            6: 'int64',     # RestingECG
            7: 'int64',   # MaxHR
            8: 'int64',     # ExerciseAngina
            9: 'float64',   # Oldpeak
            10: 'int64',  # ST_Slope
            13: 'int64'     # HeartDisease
        }
        
        # Convert columns to the defined data types
        for col, dtype in data_types.items():
            df[col] = df[col].astype(dtype, errors='ignore')  # Convert, ignore errors
        
        # Store the dataframe under a variable based on the file name
        variable_name = file_name.split('.')[1]  # Remove file extension
        data_frames[variable_name] = df
        
        print(f"Processed and stored data for {file_name} under {variable_name} variable.")
    except Exception as e:
        print(f"Could not read {os.path.basename(file_name)}. Error: {e}")


Processed and stored data for processed.cleveland.data under cleveland variable.
Processed and stored data for processed.hungarian.data under hungarian variable.
Processed and stored data for processed.switzerland.data under switzerland variable.
Processed and stored data for processed.va.data under va variable.


In [4]:
for variable_name, df in data_frames.items():
    print(f"Dataframe: {variable_name}\n")
    print(df.head())
    print("\n-------------------------------------------\n")

Dataframe: cleveland

   0   1   2    3    4   5   6    7   8    9   10  13
0  63   1   1  145  233   1   2  150   0  2.3   3   0
1  67   1   4  160  286   0   2  108   1  1.5   2   2
2  67   1   4  120  229   0   2  129   1  2.6   2   1
3  37   1   3  130  250   0   0  187   0  3.5   3   0
4  41   0   2  130  204   0   2  172   0  1.4   1   0

-------------------------------------------

Dataframe: hungarian

   0   1   2    3     4  5  6    7  8    9     10  13
0  28   1   2  130   132  0  2  185  0  0.0  <NA>   0
1  29   1   2  120   243  0  0  160  0  0.0  <NA>   0
2  29   1   2  140  <NA>  0  0  170  0  0.0  <NA>   0
3  30   0   1  170   237  0  1  170  0  0.0  <NA>   0
4  31   0   2  100   219  0  1  150  0  0.0  <NA>   0

-------------------------------------------

Dataframe: switzerland

   0   1   2     3   4     5     6    7  8     9     10  13
0  32   1   1    95   0  <NA>     0  127  0    .7     1   1
1  34   1   4   115   0  <NA>  <NA>  154  0    .2     1   1
2  35   1   

In [5]:
# Combine all dataframes into one main dataframe
cleaned_df = pd.concat(data_frames.values(), ignore_index=True)
main_df = cleaned_df.copy()

In [6]:
# Define a dictionary to map column names based on the comments in data_types
column_names_mapping = {
    0: 'Age',
    1: 'Sex',
    2: 'ChestPainType',
    3: 'RestingBP',
    4: 'Cholesterol',
    5: 'FastingBS',
    6: 'RestingECG',
    7: 'MaxHR',
    8: 'ExerciseAngina',
    9: 'Oldpeak',
    10: 'ST_Slope',
    13: 'HeartDisease'
}

# Rename columns in main_df using the mapping
main_df.rename(columns=column_names_mapping, inplace=True)

main_df['HeartDisease'] = main_df['HeartDisease'].apply(lambda x: 1 if x != 0 else 0)

heart_data = main_df.copy()


# Converting numeric values to categorical values
heart_data['Sex'] = heart_data['Sex'].map({1: 'Male', 0: 'Female'})
heart_data['ExerciseAngina'] = heart_data['ExerciseAngina'].map({1: 'Yes', 0: 'No'})
heart_data['ChestPainType'] = heart_data['ChestPainType'].map({1: 'Typical Angina', 
                                                               2: 'Atypical Angina', 
                                                               3: 'Non-Anginal Pain', 
                                                               4: 'Asymptomatic'})
heart_data['RestingECG'] = heart_data['RestingECG'].map({0: 'Normal', 
                                                         1: 'ST-T Wave Abnormality', 
                                                         2: 'Left Ventricular Hypertrophy'})
heart_data['ST_Slope'] = heart_data['ST_Slope'].map({1: 'Upsloping', 
                                                     2: 'Flat', 
                                                     3: 'Downsloping'})

In [7]:
heart_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,63,Male,Typical Angina,145,233,1,Left Ventricular Hypertrophy,150,No,2.3,Downsloping,0
1,67,Male,Asymptomatic,160,286,0,Left Ventricular Hypertrophy,108,Yes,1.5,Flat,1
2,67,Male,Asymptomatic,120,229,0,Left Ventricular Hypertrophy,129,Yes,2.6,Flat,1
3,37,Male,Non-Anginal Pain,130,250,0,Normal,187,No,3.5,Downsloping,0
4,41,Female,Atypical Angina,130,204,0,Left Ventricular Hypertrophy,172,No,1.4,Upsloping,0
...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,Asymptomatic,127,333,1,ST-T Wave Abnormality,154,,0,,1
916,62,Male,Typical Angina,,139,0,ST-T Wave Abnormality,,,,,0
917,55,Male,Asymptomatic,122,223,1,ST-T Wave Abnormality,100,,0,,1
918,58,Male,Asymptomatic,,385,1,Left Ventricular Hypertrophy,,,,,0


In [8]:
heart_data_cleaned = heart_data.dropna()
heart_data_cleaned

main_heart_data = heart_data_cleaned.copy()

In [9]:
heart_data_cleaned.to_csv('cleaned data/raw data/processed_rawheart.csv')

In [11]:
# Convert 'Sex' and 'ExerciseAngina' to binary format
main_heart_data['Sex'] = main_heart_data['Sex'].map({'Male': 1, 'Female': 0})
main_heart_data['ExerciseAngina'] = main_heart_data['ExerciseAngina'].map({'Yes': 1, 'No': 0})

# Apply one-hot encoding to other categorical columns
columns_to_encode = ['ChestPainType', 'RestingECG', 'ST_Slope']
main_heart_data_encoded = pd.get_dummies(main_heart_data, columns=columns_to_encode)

# Saving the one-hot encoded data to a new CSV file
encoded_data_path = 'cleaned data/raw data/encoded_rawheart.csv' 
main_heart_data_encoded.to_csv(encoded_data_path, index=False)

print("One-hot encoding complete. File saved to:", encoded_data_path)
main_heart_data_encoded

One-hot encoding complete. File saved to: cleaned data/raw data/encoded_rawheart.csv


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_Asymptomatic,ChestPainType_Atypical Angina,ChestPainType_Non-Anginal Pain,ChestPainType_Typical Angina,RestingECG_Left Ventricular Hypertrophy,RestingECG_Normal,RestingECG_ST-T Wave Abnormality,ST_Slope_Downsloping,ST_Slope_Flat,ST_Slope_Upsloping
0,63,,145,233,1,150,,2.3,0,0,0,0,1,1,0,0,1,0,0
1,67,,160,286,0,108,,1.5,1,1,0,0,0,1,0,0,0,1,0
2,67,,120,229,0,129,,2.6,1,1,0,0,0,1,0,0,0,1,0
3,37,,130,250,0,187,,3.5,0,0,0,1,0,0,1,0,1,0,0
4,41,,130,204,0,172,,1.4,0,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,,110,264,0,132,,1.2,1,0,0,0,1,0,1,0,0,1,0
299,68,,144,193,1,141,,3.4,1,1,0,0,0,0,1,0,0,1,0
300,57,,130,131,0,115,,1.2,1,1,0,0,0,0,1,0,0,1,0
301,57,,130,236,0,174,,0.0,1,0,1,0,0,1,0,0,0,1,0
