In [61]:
!pwd
!ls


/content
penguins.csv  sample_data


In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [63]:
df = pd.read_csv('/content/penguins.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   species                 333 non-null    object 
 1   island                  334 non-null    object 
 2   calorie requirement     344 non-null    int64  
 3   average sleep duration  344 non-null    int64  
 4   bill_length_mm          337 non-null    float64
 5   bill_depth_mm           333 non-null    float64
 6   flipper_length_mm       336 non-null    float64
 7   body_mass_g             339 non-null    float64
 8   gender                  327 non-null    object 
 9   year                    342 non-null    float64
dtypes: float64(5), int64(2), object(3)
memory usage: 27.0+ KB


In [64]:
Y = df['gender'].str.strip().str.lower()

In [65]:
class_counts = df['gender'].value_counts()
print(class_counts)


gender
male      164
female    160
FEMALE      2
MALE        1
Name: count, dtype: int64


In [66]:
df = df.dropna()
df.shape

(305, 10)

In [67]:
df = df[df['gender'].isin(['male', 'female'])]
Y = df['gender'].apply(lambda x: 1 if x == 'male' else 0).values.reshape(-1, 1)

In [68]:
df['species'] = df['species'].str.strip().str.lower()
df['island'] = df['island'].str.strip().str.lower()

species_dummies = pd.get_dummies(df['species'], prefix='species')
island_dummies = pd.get_dummies(df['island'], prefix='island')

numeric_features = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]

X = pd.concat([species_dummies, island_dummies, numeric_features], axis=1).values


In [69]:
print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)

Shape of X: (305, 10)
Shape of Y: (305, 1)


In [70]:
N = X.shape[0]

indices = np.random.permutation(N)

X_shuffled = X[indices]
Y_shuffled = Y[indices]


In [71]:
split_index = int(0.8 * N)

X_train = X_shuffled[:split_index]
y_train = Y_shuffled[:split_index]

X_test = X_shuffled[split_index:]
y_test = Y_shuffled[split_index:]


In [72]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (244, 10)
y_train shape: (244, 1)
X_test shape: (61, 10)
y_test shape: (61, 1)
