In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
import warnings

# Silence every warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation notices raised by
# later cells are hidden as well -- confirm that this is intended.
warnings.simplefilter("ignore")

# Read the penguin CSV into a DataFrame (adjust the path if the file lives elsewhere)
data = pd.read_csv(filepath_or_buffer='/content/PenguinsData.csv')
data.head(n=5)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [2]:
# Keep only the rows in which all four body measurements are present;
# a row missing any one of them is discarded.
measurement_columns = [
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
    'Body Mass (g)',
]
cleaned_data = data.dropna(axis=0, how='any', subset=measurement_columns)
cleaned_data.head(n=5)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,
5,PAL0708,6,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A2,Yes,11/16/07,39.3,20.6,190.0,3650.0,MALE,8.66496,-25.29805,


In [3]:
# Convert numeric columns to categorical bins
#
# Every measurement is cut into three equal-width intervals (pd.cut, bins=3)
# and labelled from the smallest interval to the largest.
#
# The binned columns are written to a NEW frame (binned_data) instead of being
# assigned back into cleaned_data with `.loc[:, col] = ...`:
#   * putting string categories into a float64 column through .loc is
#     deprecated in pandas 2.1+ (FutureWarning "Setting an item of incompatible
#     dtype"), and pandas announces that it will become an error;
#   * cleaned_data keeps its numeric measurements, so this cell can be re-run
#     safely -- pd.cut always receives numbers rather than the labels produced
#     by a previous run.
bin_labels = {
    'Culmen Length (mm)': ["short", "medium", "long"],
    'Culmen Depth (mm)': ["shallow", "medium", "deep"],
    'Flipper Length (mm)': ["short", "medium", "long"],
    'Body Mass (g)': ["light", "medium", "heavy"],
}
binned_data = cleaned_data.assign(**{
    column: pd.cut(cleaned_data[column], bins=3, labels=labels)
    for column, labels in bin_labels.items()
})

# One-hot encode the categorical data: one boolean column per species and per
# bin label, with the binned columns in the same order as bin_labels.
one_hot_data = pd.get_dummies(binned_data[['Species', *bin_labels]])

one_hot_data

Unnamed: 0,Species_Adelie Penguin (Pygoscelis adeliae),Species_Chinstrap penguin (Pygoscelis antarctica),Species_Gentoo penguin (Pygoscelis papua),Culmen Length (mm)_short,Culmen Length (mm)_medium,Culmen Length (mm)_long,Culmen Depth (mm)_shallow,Culmen Depth (mm)_medium,Culmen Depth (mm)_deep,Flipper Length (mm)_short,Flipper Length (mm)_medium,Flipper Length (mm)_long,Body Mass (g)_light,Body Mass (g)_medium,Body Mass (g)_heavy
0,True,False,False,True,False,False,False,True,False,True,False,False,True,False,False
1,True,False,False,True,False,False,False,True,False,True,False,False,True,False,False
2,True,False,False,True,False,False,False,True,False,False,True,False,True,False,False
4,True,False,False,True,False,False,False,False,True,False,True,False,True,False,False
5,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,False,False,True,False,True,False,True,False,False,False,False,True,False,True,False
340,False,False,True,False,True,False,True,False,False,False,False,True,False,True,False
341,False,False,True,False,True,False,True,False,False,False,False,True,False,False,True
342,False,False,True,False,True,False,True,False,False,False,False,True,False,False,True


In [4]:
# Mine frequent itemsets with Apriori: keep every combination of one-hot
# columns that is present in at least 10% of the rows, and report the
# itemsets by column name rather than by column position.
frequent_itemsets = apriori(
    one_hot_data,
    min_support=0.1,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.441520,(Species_Adelie Penguin (Pygoscelis adeliae))
1,0.198830,(Species_Chinstrap penguin (Pygoscelis antarct...
2,0.359649,(Species_Gentoo penguin (Pygoscelis papua))
3,0.371345,(Culmen Length (mm)_short)
4,0.500000,(Culmen Length (mm)_medium)
...,...,...
93,0.128655,"(Culmen Depth (mm)_shallow, Species_Gentoo pen..."
94,0.178363,"(Flipper Length (mm)_short, Culmen Depth (mm)_..."
95,0.122807,"(Culmen Depth (mm)_shallow, Flipper Length (mm..."
96,0.175439,"(Flipper Length (mm)_short, Culmen Depth (mm)_..."


In [5]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose confidence reaches the 0.6 threshold.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.6,
)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Species_Adelie Penguin (Pygoscelis adeliae)),(Culmen Length (mm)_short),0.441520,0.371345,0.365497,0.827815,2.229233,1.0,0.201541,3.651035,0.987351,0.816993,0.726105,0.906033
1,(Culmen Length (mm)_short),(Species_Adelie Penguin (Pygoscelis adeliae)),0.371345,0.441520,0.365497,0.984252,2.229233,1.0,0.201541,35.463450,0.877135,0.816993,0.971802,0.906033
2,(Species_Adelie Penguin (Pygoscelis adeliae)),(Culmen Depth (mm)_medium),0.441520,0.461988,0.280702,0.635762,1.376142,1.0,0.076724,1.477087,0.489420,0.450704,0.322992,0.621678
3,(Culmen Depth (mm)_medium),(Species_Adelie Penguin (Pygoscelis adeliae)),0.461988,0.441520,0.280702,0.607595,1.376142,1.0,0.076724,1.423222,0.508039,0.450704,0.297369,0.621678
4,(Culmen Depth (mm)_deep),(Species_Adelie Penguin (Pygoscelis adeliae)),0.236842,0.441520,0.154971,0.654321,1.481972,1.0,0.050400,1.615602,0.426155,0.296089,0.381035,0.502657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,"(Culmen Depth (mm)_shallow, Body Mass (g)_medi...","(Species_Gentoo penguin (Pygoscelis papua), Fl...",0.192982,0.283626,0.122807,0.636364,2.243674,1.0,0.068072,1.970029,0.686853,0.347107,0.492393,0.534677
264,"(Culmen Depth (mm)_shallow, Species_Gentoo pen...","(Flipper Length (mm)_long, Culmen Length (mm)_...",0.198830,0.230994,0.122807,0.617647,2.673864,1.0,0.076878,2.011246,0.781369,0.400000,0.502796,0.574646
265,"(Flipper Length (mm)_long, Body Mass (g)_medium)","(Culmen Depth (mm)_shallow, Species_Gentoo pen...",0.131579,0.266082,0.122807,0.933333,3.507692,1.0,0.087796,11.008772,0.823232,0.446809,0.909163,0.697436
266,"(Species_Gentoo penguin (Pygoscelis papua), Bo...","(Culmen Depth (mm)_shallow, Flipper Length (mm...",0.198830,0.190058,0.122807,0.617647,3.249774,1.0,0.085018,2.118309,0.864095,0.461538,0.527925,0.631900


In [6]:
# Keep the rules that clear both cut-offs (strictly greater than):
# support above 0.3 and confidence above 0.8.
meets_support = rules['support'].gt(0.3)
meets_confidence = rules['confidence'].gt(0.8)
filtered_rules = rules.loc[meets_support & meets_confidence]

# Reduce the result to the three reported columns.
cleaned_output = filtered_rules.loc[:, ['antecedents', 'support', 'confidence']]

cleaned_output

Unnamed: 0,antecedents,support,confidence
0,(Species_Adelie Penguin (Pygoscelis adeliae)),0.365497,0.827815
1,(Culmen Length (mm)_short),0.365497,0.984252
12,(Species_Gentoo penguin (Pygoscelis papua)),0.304094,0.845528
