In [11]:
"""
1. Load the dataset in Python.
(a) for the Penguin dataset, the features island and sex are strings, therefore they need to be converted
to numerical format to be fed to the MLP. To do so, experiment with 2 methods:
i. convert these features into 1-hot vectors (also known as dummy-coded data)
ii. convert these features into categories yourself
(b) determine if the Abalone dataset can be used as is; otherwise convert any features using the 2 methods
above.
"""
# Import necessary libraries
import os

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Load Penguin dataset
# The dataset folder can be overridden with the COMP472_DATA_DIR environment
# variable so the notebook is runnable on other machines; when the variable is
# unset, the original author-specific location is used unchanged.
DATA_DIR = os.environ.get(
    'COMP472_DATA_DIR',
    r'C:\Users\ibrah\OneDrive\Desktop\COMP472-A1-datasets',
)
penguin_path = os.path.join(DATA_DIR, 'penguins.csv')
penguin_data = pd.read_csv(penguin_path)

In [16]:
# Preview the top five rows of the penguin table as a quick check of the load
print("Penguin Dataset:")
penguin_data.head(n=5)

Penguin Dataset:


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,MALE
1,Adelie,Torgersen,39.5,17.4,186,3800,FEMALE
2,Adelie,Torgersen,40.3,18.0,195,3250,FEMALE
3,Adelie,Torgersen,36.7,19.3,193,3450,FEMALE
4,Adelie,Torgersen,39.3,20.6,190,3650,MALE


In [18]:
# Method 1: Convert features into 1-hot vectors

# String-valued columns that must become numeric before being fed to the MLP
categorical_columns = ['island', 'sex']
penguin_categorical = penguin_data[categorical_columns]

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the categorical columns; .toarray() densifies the encoder
# output so it can be wrapped in a regular DataFrame
one_hot_encoded = encoder.fit_transform(penguin_categorical).toarray()

# Create a DataFrame with the one-hot encoded features.
# Reuse penguin_data's index: pd.concat(axis=1) pairs rows by index label, so a
# fresh 0..n-1 index here would misalign rows (and pad with NaN) whenever
# penguin_data does not carry the default index, e.g. after rows were filtered.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=penguin_data.index,
)

# Concatenate the one-hot encoded DataFrame with the original DataFrame, dropping the original categorical columns
penguin_data_encoded = pd.concat([penguin_data, one_hot_df], axis=1).drop(categorical_columns, axis=1)

# Guard against a silent row mismatch introduced by the concat
assert len(penguin_data_encoded) == len(penguin_data), "one-hot concat changed the row count"

# Print the modified DataFrame
print("Penguin Dataset after 1-hot encoding:")
penguin_data_encoded.head()

Penguin Dataset after 1-hot encoding:


Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE
0,Adelie,39.1,18.7,181,3750,0.0,0.0,1.0,0.0,1.0
1,Adelie,39.5,17.4,186,3800,0.0,0.0,1.0,1.0,0.0
2,Adelie,40.3,18.0,195,3250,0.0,0.0,1.0,1.0,0.0
3,Adelie,36.7,19.3,193,3450,0.0,0.0,1.0,1.0,0.0
4,Adelie,39.3,20.6,190,3650,0.0,0.0,1.0,0.0,1.0


In [20]:
# Method 2: Convert features into categories manually
# For each string feature: cast it to pandas' categorical dtype, then store the
# integer category codes in a sibling '<feature>_code' column.
# NOTE: this updates penguin_data in place (dtype change plus two new columns),
# so any cell that reads penguin_data after this one sees the extra columns.
for feature in ('island', 'sex'):
    penguin_data[feature] = penguin_data[feature].astype('category')
    penguin_data[f'{feature}_code'] = penguin_data[feature].cat.codes

# Show the frame with the newly added code columns
print("Penguin Dataset after converting to categories:")
penguin_data.head()

# TODO: should we remove the original 'island' and 'sex' columns?

Penguin Dataset after converting to categories:


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,island_code,sex_code
0,Adelie,Torgersen,39.1,18.7,181,3750,MALE,2,1
1,Adelie,Torgersen,39.5,17.4,186,3800,FEMALE,2,0
2,Adelie,Torgersen,40.3,18.0,195,3250,FEMALE,2,0
3,Adelie,Torgersen,36.7,19.3,193,3450,FEMALE,2,0
4,Adelie,Torgersen,39.3,20.6,190,3650,MALE,2,1


In [21]:
# Repeat the process for the Abalone dataset
# The dataset folder can be overridden with the COMP472_DATA_DIR environment
# variable so the notebook is runnable on other machines; when the variable is
# unset, the original author-specific location is used unchanged.
DATA_DIR = os.environ.get(
    'COMP472_DATA_DIR',
    r'C:\Users\ibrah\OneDrive\Desktop\COMP472-A1-datasets',
)
abalone_path = os.path.join(DATA_DIR, 'abalone.csv')
abalone_data = pd.read_csv(abalone_path)

In [24]:
# Display the first few rows of the Abalone dataset
print("\nAbalone Dataset:")

# Not every Abalone column is numerical: 'Type' holds strings (e.g. 'F', 'M').
# List the non-numeric columns explicitly instead of relying on eyeballing the preview.
# NOTE(review): presumably 'Type' is the prediction target, in which case the
# remaining feature columns are numerical and can be used as is; if 'Type' is
# used as an input feature it must be encoded first (1-hot or category codes) — confirm.
non_numeric_columns = abalone_data.select_dtypes(exclude='number').columns.tolist()
print("Non-numeric Abalone columns:", non_numeric_columns)

abalone_data.head()


Abalone Dataset:


Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,F,0.605,0.47,0.165,1.1775,0.611,0.2275,0.292,9
1,M,0.55,0.425,0.15,0.8315,0.411,0.1765,0.2165,10
2,M,0.46,0.345,0.11,0.4595,0.235,0.0885,0.116,7
3,F,0.65,0.475,0.165,1.3875,0.58,0.3485,0.3095,9
4,M,0.575,0.47,0.14,0.8375,0.3485,0.1735,0.24,11
