In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split

# Location of the penguins CSV. The default is the original author's local
# absolute path; set the PENGUINS_CSV environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = os.environ.get('PENGUINS_CSV', '/home/thalesmansur/ml.school/data/penguins.csv')

df = pd.read_csv(DATA_PATH)


In [2]:
def _category_codes(column):
    """Return the integer category codes pandas assigns to the values of `column`."""
    return column.astype('category').cat.codes


# Missing 'sex' values are turned into their own 'missing' category *before*
# dropna(), so only rows with gaps in the remaining columns are discarded.
# 'island' is replaced by its codes in place; the two *_encoded columns are new.
df = df.assign(
    sex=df['sex'].fillna('missing'),
    sex_encoded=lambda frame: _category_codes(frame['sex']),
    island=lambda frame: _category_codes(frame['island']),
    species_encoded=lambda frame: _category_codes(frame['species']),
).dropna()

y = df['species_encoded']
X_with_sex = df.drop(columns=['species', 'species_encoded', 'sex'])  # keeps 'sex_encoded'
X = X_with_sex.drop(columns=['sex_encoded'])  # same features without the sex code

# Both splits use the same test_size and random_state.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_sex, X_test_sex, y_train_sex, y_test_sex = train_test_split(X_with_sex, y, **split_options)



In [3]:
def _fit_and_predict(features_train, labels_train, features_test):
    """Fit a fresh LogisticRegression(max_iter=10000) and predict `features_test`.

    Returns a `(fitted_model, predictions)` tuple.
    """
    classifier = LogisticRegression(max_iter=10000)
    classifier.fit(features_train, labels_train)
    return classifier, classifier.predict(features_test)


# Baseline: feature set without 'sex_encoded'
model, predictions = _fit_and_predict(X_train, y_train, X_test)
print("Without 'sex':", accuracy_score(y_test, predictions))

# Same model configuration, feature set including 'sex_encoded'
model_with_sex, predictions_with_sex = _fit_and_predict(X_train_sex, y_train_sex, X_test_sex)
print("With 'sex':", accuracy_score(y_test_sex, predictions_with_sex))


Without 'sex': 0.9855072463768116
With 'sex': 1.0


In [14]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the model trained WITH 'sex_encoded'
# (rows: true class, columns: predicted class).
# Score the predictions against the labels produced by the same split
# (y_test_sex). The previous y_test only matched because both splits share
# test_size and random_state; pairing by name keeps this correct if either
# split is ever changed.
print(confusion_matrix(y_test_sex, predictions_with_sex))


[[35  0  0]
 [ 0 12  0]
 [ 0  0 22]]


In [4]:
# Show both training feature sets (without / with 'sex_encoded') one after the other.
display(X_train, X_train_sex)

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
115,0,42.7,18.3,196.0,4075.0
8,2,34.1,18.1,193.0,3475.0
138,1,37.0,16.5,185.0,3400.0
333,0,51.5,16.3,230.0,5500.0
305,0,50.8,17.3,228.0,5600.0
...,...,...,...,...,...
189,1,52.0,20.7,210.0,4800.0
72,2,39.6,17.2,196.0,3550.0
107,0,38.2,20.0,190.0,3900.0
271,0,48.5,14.1,220.0,5300.0


Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_encoded
115,0,42.7,18.3,196.0,4075.0,2
8,2,34.1,18.1,193.0,3475.0,3
138,1,37.0,16.5,185.0,3400.0,1
333,0,51.5,16.3,230.0,5500.0,2
305,0,50.8,17.3,228.0,5600.0,2
...,...,...,...,...,...,...
189,1,52.0,20.7,210.0,4800.0,2
72,2,39.6,17.2,196.0,3550.0,1
107,0,38.2,20.0,190.0,3900.0,2
271,0,48.5,14.1,220.0,5300.0,2


In [5]:
# Per-class precision / recall / f1 for each model on its own test split.
report_without_sex = classification_report(y_test, predictions)
report_with_sex = classification_report(y_test_sex, predictions_with_sex)

print("Classification Report without 'sex':\n", report_without_sex)
print("Classification Report with 'sex':\n", report_with_sex)


Classification Report without 'sex':
               precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.92      1.00      0.96        12
           2       1.00      1.00      1.00        22

    accuracy                           0.99        69
   macro avg       0.97      0.99      0.98        69
weighted avg       0.99      0.99      0.99        69

Classification Report with 'sex':
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        22

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



### Test-set error analysis

In [6]:
# Place test features, true labels and predictions side by side. Both
# reset_index() calls move the original row labels into an 'index' column, so
# the pieces line up positionally with the plain 0..n-1 index of the
# predictions Series (and 'index' therefore appears twice in check_df).
check_df = pd.concat(
    [
        X_test.reset_index(),
        y_test.reset_index(),
        pd.Series(predictions, name='predicted'),
    ],
    axis=1,
)

# Rows where the model without 'sex' predicted the wrong species.
wrong_without_sex = check_df['species_encoded'].ne(check_df['predicted'])
check_df.loc[wrong_without_sex]

Unnamed: 0,index,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,index.1,species_encoded,predicted
65,73,2,45.8,18.9,197.0,4150.0,73,0,1


In [7]:
# Same side-by-side table for the model trained with 'sex_encoded': features,
# true labels and predictions aligned positionally after reset_index().
check_df_sex = pd.concat(
    [
        X_test_sex.reset_index(),
        y_test_sex.reset_index(),
        pd.Series(predictions_with_sex, name='predicted'),
    ],
    axis=1,
)

# Rows where the model with 'sex' predicted the wrong species.
wrong_with_sex = check_df_sex['species_encoded'].ne(check_df_sex['predicted'])
check_df_sex.loc[wrong_with_sex]

Unnamed: 0,index,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_encoded,index.1,species_encoded,predicted


In [8]:
# Row label 73 is the test sample the model without 'sex' got wrong
# (it shows up as the only row of the check_df mismatch filter).
misclassified_label = 73
df.loc[misclassified_label]

species              Adelie
island                    2
culmen_length_mm       45.8
culmen_depth_mm        18.9
flipper_length_mm     197.0
body_mass_g          4150.0
sex                    MALE
sex_encoded               2
species_encoded           0
Name: 73, dtype: object

In [9]:
# Summary statistics for male penguins with species_encoded == 0 (Adelie),
# the group the misclassified sample belongs to.
is_adelie = df['species_encoded'].eq(0)
is_male = df['sex'].eq('MALE')
df.loc[is_adelie & is_male].describe(include='all')

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,sex_encoded,species_encoded
count,73,73.0,73.0,73.0,73.0,73.0,73,73.0,73.0
unique,1,,,,,,1,,
top,Adelie,,,,,,MALE,,
freq,73,,,,,,73,,
mean,,1.013699,40.390411,19.072603,192.410959,4043.493151,,2.0,0.0
std,,0.790449,2.277131,1.018886,6.599317,346.811553,,0.0,0.0
min,,0.0,34.6,17.0,178.0,3325.0,,2.0,0.0
25%,,0.0,39.0,18.5,189.0,3800.0,,2.0,0.0
50%,,1.0,40.6,18.9,193.0,4000.0,,2.0,0.0
75%,,2.0,41.5,19.6,197.0,4300.0,,2.0,0.0


In [10]:
# Summary statistics for female penguins with species_encoded == 1 (Chinstrap),
# the class the misclassified sample was assigned to.
is_chinstrap = df['species_encoded'].eq(1)
is_female = df['sex'].eq('FEMALE')
df.loc[is_chinstrap & is_female].describe(include='all')

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,sex_encoded,species_encoded
count,34,34.0,34.0,34.0,34.0,34.0,34,34.0,34.0
unique,1,,,,,,1,,
top,Chinstrap,,,,,,FEMALE,,
freq,34,,,,,,34,,
mean,,1.0,46.573529,17.588235,191.735294,3527.205882,,1.0,1.0
std,,0.0,3.108669,0.781128,5.754096,285.333912,,0.0,0.0
min,,1.0,40.9,16.4,178.0,2700.0,,1.0,1.0
25%,,1.0,45.425,17.0,187.25,3362.5,,1.0,1.0
50%,,1.0,46.3,17.65,192.0,3550.0,,1.0,1.0
75%,,1.0,47.375,18.05,195.75,3693.75,,1.0,1.0


In [11]:
from sklearn.metrics import mutual_info_score

# Mutual information between the two models' test-set predictions: a measure
# of how much the label assignments of one model tell us about the other's.
mi_score = mutual_info_score(
    labels_true=predictions,
    labels_pred=predictions_with_sex,
)
mi_score

0.9618679861478048