In [2]:
import pandas as pd 
import numpy as np 


In [3]:
# Load the raw penguins dataset from the working directory and preview
# its first five rows.
data_raw = pd.read_csv(filepath_or_buffer='penguins.csv')
data_raw.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [7]:
# Count the missing values in each column (isna is the same check as isnull).
# NOTE(review): the recorded all-zero output comes from a kernel in which the
# dropna cell (execution count 6) had already run before this cell (count 7).
# On a fresh top-to-bottom run this cell sees the unfiltered frame, which does
# contain rows with missing values (see row 3 of the preview above).
data_raw.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [6]:
# Discard every row that has at least one missing value.
# NOTE(review): this rebinds data_raw, so the unfiltered frame is no longer
# reachable afterwards; re-run the load cell to get it back.
data_raw = data_raw.dropna(axis='index', how='any')

In [9]:
# Print the row count, per-column non-null counts and dtypes of data_raw
# to confirm no missing values remain after the dropna step.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [11]:
# Number of rows for each value of the 'sex' column.
data_raw['sex'].value_counts()

sex
MALE      168
FEMALE    165
Name: count, dtype: int64

In [12]:
# Number of rows for each value of the 'species' column (the prediction
# target used by the Streamlit code further down).
data_raw['species'].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

In [13]:
# Number of rows for each value of the 'island' column.
data_raw['island'].value_counts()

island
Biscoe       163
Dream        123
Torgersen     47
Name: count, dtype: int64

In [14]:
# Work on an independent (deep) copy so later changes to `data` cannot
# leak back into data_raw.
data = data_raw.copy(deep=True)

In [15]:
# Write the frame to data.csv without the index column; the Streamlit
# code below reads this file back with pd.read_csv('data.csv').
data.to_csv('data.csv', index=False)

In [16]:
import streamlit as st
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# NOTE: everything from here on is a Streamlit script. It is meant to be
# launched with `streamlit run <script>.py`; executed inside a notebook kernel
# it only produces the "without `streamlit run`" warning and renders nothing.
st.title('🐧Penguin Species Prediction')
st.info('Predicting the penguin species using a Machine Learning model')

with st.expander('Data'):
    st.write("**Raw data**")
    df = pd.read_csv('data.csv')
    # Explicit st.write instead of a bare expression: the bare form is only
    # rendered when Streamlit's "magic" is enabled.
    st.write(df)

    # Feature table: every column except the prediction target.
    st.write('**X**')
    X_raw = df.drop('species', axis=1)
    st.write(X_raw)

    # Prediction target: the species name of each row.
    st.write('**y**')
    y_raw = df['species']
    st.write(y_raw)


with st.expander('Data Visualization'):
    # Bill length against body mass, one colour per species.
    st.scatter_chart(data=df, x='bill_length_mm', y='body_mass_g', color='species')

# data preparation

with st.sidebar:
    st.header('Input Features')
    island = st.selectbox('Island', ('Biscoe', 'Dream', 'Torgersen'))
    sex = st.selectbox('Gender', ('MALE', 'FEMALE'))
    # Slider arguments are: label, minimum, maximum, initial value.
    bill_length_mm = st.slider('Bill Length (mm)', 32.1, 59.6, 43.9)
    bill_depth_mm = st.slider('Bill Depth (mm)', 13.1, 21.5, 17.2)
    flipper_length_mm = st.slider('Flipper Length (mm)', 172.0, 231.0, 201.0)
    body_mass_g = st.slider('Body Mass (g)', 2700.0, 6300.0, 4207.0)

# Collect the sidebar selections under the same column names as X_raw.
# Named `input_features` (not `data`) so that running this code in the
# notebook does not overwrite the `data` DataFrame created earlier.
input_features = {
    'island': island,
    'bill_length_mm': bill_length_mm,
    'bill_depth_mm': bill_depth_mm,
    'flipper_length_mm': flipper_length_mm,
    'body_mass_g': body_mass_g,
    'sex': sex,
}
input_df = pd.DataFrame(input_features, index=[0])

# Put the user's row first, followed by the training rows, so that the
# categorical columns can be one-hot encoded together. ignore_index avoids
# two different rows both carrying the index label 0.
input_penguins = pd.concat([input_df, X_raw], axis=0, ignore_index=True)


with st.expander('Input features'):
    st.write('**Input Penguins**')
    st.write(input_df)
    st.write('**Combine Penguins Data**')
    st.write(input_penguins)

# Encode X: one-hot encode the categorical columns of the combined frame.
# The columns are named explicitly so the call does not depend on the frame
# containing exactly these two object-dtype columns, in this order.
encode = ['island', 'sex']
df_penguins = pd.get_dummies(input_penguins, columns=encode, prefix=encode)

# The user's row was concatenated first, so position 0 is the row to predict
# and every later position is training data.
X = df_penguins.iloc[1:]

input_row = df_penguins.iloc[:1]

# Encode y: species name -> integer class label.
target_mapper = dict(Adelie=0, Chinstrap=1, Gentoo=2)


def target_encode(val):
    """Return the integer class label stored in ``target_mapper`` for ``val``.

    Raises:
        KeyError: if ``val`` is not one of the keys of ``target_mapper``.
    """
    label = target_mapper[val]
    return label

# Translate every species name in y_raw into its integer class label.
y = y_raw.map(target_encode)


with st.expander('Data Preparation'):
    # Show the encoded row that will be passed to the model ...
    st.write('**Encoded input penguins (x)**')
    st.write(input_row)
    # ... and the encoded training labels.
    st.write('**Encoded y**')
    st.write(y)

# Model training and inference

@st.cache_resource
def train_classifier(features, labels):
    """Fit a random forest on ``features`` / ``labels`` and return it.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the fitted model means it is only retrained when the training
    data changes, and the fixed ``random_state`` makes the fitted forest
    (and therefore the prediction for a given input) reproducible.
    """
    model = RandomForestClassifier(random_state=42)
    model.fit(features, labels)
    return model


## Train the ML model
clf = train_classifier(X, y)

## Apply the model to the single encoded input row
prediction = clf.predict(input_row)
prediction_prob = clf.predict_proba(input_row)

# predict_proba returns one column per entry of clf.classes_, in that order,
# so name the columns from the classes the model actually learned instead of
# assuming a fixed list.
species_by_label = {label: name for name, label in target_mapper.items()}
df_prediction_proba = pd.DataFrame(
    prediction_prob,
    columns=[species_by_label[label] for label in clf.classes_],
)



# Display predicted species: one progress bar per class probability,
# followed by the name of the predicted class.
st.subheader('Predicted Species')

species_names = ['Adelie', 'Chinstrap', 'Gentoo']
probability_columns = {
    name: st.column_config.ProgressColumn(
        name, format='%f', width='medium', min_value=0, max_value=1
    )
    for name in species_names
}
st.dataframe(df_prediction_proba, column_config=probability_columns)

# `prediction` holds the integer label of the single input row; use it to
# look up the matching species name.
penguins_species = np.array(species_names)
predicted_label = prediction[0]
st.success(str(penguins_species[predicted_label]))

2025-05-28 23:53:46.638 
  command:

    streamlit run c:\Users\dewan\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-28 23:53:47.736 Session state does not function when running a script without `streamlit run`


DeltaGenerator()