In [111]:
# taking the data of penguin and classifying into its gender

import pandas as pd
import numpy as np


In [113]:
df = pd.read_csv(r"penguins.csv")

In [115]:
df.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,39.1,18.7,181.0,3750.0,MALE
1,39.5,17.4,186.0,3800.0,FEMALE
2,40.3,18.0,195.0,3250.0,FEMALE
3,,,,,
4,36.7,19.3,193.0,3450.0,FEMALE


In [117]:
# checking the number of rows and columns of the dataset
df.shape

(344, 5)

In [119]:
# checking the unique category of sex column (it seems normal male and female but could be checked)
df.sex.value_counts()

sex
MALE      169
FEMALE    165
.           1
Name: count, dtype: int64

In [121]:
df.isna().sum()

culmen_length_mm     2
culmen_depth_mm      2
flipper_length_mm    2
body_mass_g          2
sex                  9
dtype: int64

In [123]:
# here a . is NaN(not a number) so could be dropped from the data
# lets drop the row that has . in it 
df.dropna(inplace = True) # this  drops the rows of all column where it is NaN and updates df

In [125]:
df.isna().sum() # none of rows has NaN values 

culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [127]:
df["sex"]

0        MALE
1      FEMALE
2      FEMALE
4      FEMALE
5        MALE
        ...  
338    FEMALE
340    FEMALE
341      MALE
342    FEMALE
343      MALE
Name: sex, Length: 335, dtype: object

In [129]:
# # since we are using the Decision Trees NORMALIZATION can be skipped
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df["sex"] = label_encoder.fit_transform(df["sex"])

from sklearn.preprocessing import MinMaxScaler
df[["culmen_length_mm" , "culmen_depth_mm", "flipper_length_mm" , "body_mass_g"]] = MinMaxScaler().fit_transform(df[["culmen_length_mm" , "culmen_depth_mm", "flipper_length_mm" , "body_mass_g"]])

In [131]:
print(label_encoder.classes_)
label_encoder.inverse_transform([1,2])

['.' 'FEMALE' 'MALE']


array(['FEMALE', 'MALE'], dtype=object)

In [133]:
# making the x (input) data selcting the features
features = ["culmen_length_mm" , "culmen_depth_mm", "flipper_length_mm" , "body_mass_g"]
x = df[features].values


In [135]:
x[0:5]

array([[0.25454545, 0.66666667, 0.06098987, 0.29166667],
       [0.26909091, 0.51190476, 0.06196415, 0.30555556],
       [0.29818182, 0.58333333, 0.06371785, 0.15277778],
       [0.16727273, 0.73809524, 0.06332814, 0.20833333],
       [0.26181818, 0.89285714, 0.06274357, 0.26388889]])

In [137]:
# making the y(target) column to array
y = df["sex"].values

In [139]:
y[0:5]

array([2, 1, 1, 1, 2])

In [141]:
# splitting to train test dataset
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split( x , y , test_size = 0.30)

In [146]:
# training the data with 4 models to find the model with minimum error

# making RandomForestClassifier model (if it was regression prob we would call RandomForestRegressor)
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier().fit(x_train , y_train) # training model
#testing model
y_pred= model.predict(x_test)
# Evaluation metrics
from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test , y_pred)*100
print(f"The Accuracy of Random Forest model is {acc_score} %")



from sklearn.linear_model import LogisticRegression
model_reg = LogisticRegression().fit(x_train , y_train)
y_lr = model_reg.predict(x_test)

from sklearn.tree import DecisionTreeClassifier
model_DT = DecisionTreeClassifier().fit(x_train , y_train)
y_dt = model_DT.predict(x_test)

from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors = 5).fit(x_train , y_train)
y_knn = model_knn.predict(x_test)

from sklearn.metrics import accuracy_score 
accy_score1 = accuracy_score(y_test , y_lr)*100
accy_score2 = accuracy_score(y_test , y_dt)*100
accy_score3 = accuracy_score(y_test , y_knn)*100

print(f"The accuracy score of logistic Regression model is {accy_score1} % ")
print(f"The accuracy score of Decision Tree model is {accy_score2} % ")
print(f"The accuracy score of KNN model is {accy_score3} % ")







The Accuracy of Random Forest model is 89.10891089108911 %
The accuracy score of logistic Regression model is 91.0891089108911 % 
The accuracy score of Decision Tree model is 89.10891089108911 % 
The accuracy score of KNN model is 90.0990099009901 % 


In [148]:
# since the random forest has less error we select it 
import joblib 
joblib.dump(model , "RandomForestModel.pkl")

['RandomForestModel.pkl']

In [150]:
# lets move to the vs code and make the app of it
df.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0.254545,0.666667,0.06099,0.291667,2
1,0.269091,0.511905,0.061964,0.305556,1
2,0.298182,0.583333,0.063718,0.152778,1
4,0.167273,0.738095,0.063328,0.208333,1
5,0.261818,0.892857,0.062744,0.263889,2


In [156]:

culmen_length_mm = float(input("Please enter the culmen length (in mm) of the penguin"))
culmen_depth_mm =float ( input("Please enter the culmen depth  (in mm)  of the penguin "))
flipper_length_mm= float( input("Please enter the flipper lenght (in mm) of the penguin" ))
body_mass_g = float (input("Please enter the  body mass(in grams) of the penguin" ))


# applying MinMaxScaler as in real data to be intelligible for the model to work on it as we trained on Scaled data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_input =scaler.fit_transform ([[culmen_length_mm , 	culmen_depth_mm , flipper_length_mm  , body_mass_g	]])


x_data_scaled = np.array(scaled_input).astype("float64")


# making prediction and converting to original catagory
prediction = model.predict(x_data_scaled)

ans_dict = {0:"other"  , 1 : "female" , 2 : "male"}
final_ans = ans_dict[prediction[0]]

button = input("press TRUE ")


# Display the selected option

if(button == "TRUE"):
    
    

    print(x_train[0:5])
    print(x_data_scaled[0:5])
    print("The model is predicting the sex of the penguin")
    print(f"As per the details provided we conclude that the sex of the penguin is : {final_ans}")
   




Please enter the culmen length (in mm) of the penguin 36.7
Please enter the culmen depth  (in mm)  of the penguin  19.3
Please enter the flipper lenght (in mm) of the penguin 193.0
Please enter the  body mass(in grams) of the penguin 3450
press TRUE  TRUE


[[0.35272727 0.75       0.06430242 0.48611111]
 [0.68       0.70238095 0.06664069 0.38888889]
 [0.26909091 0.55952381 0.06235386 0.16666667]
 [0.86545455 0.46428571 0.07014809 0.80555556]
 [0.32727273 0.53571429 0.06118472 0.13888889]]
[[0. 0. 0. 0.]]
The model is predicting the sex of the penguin
As per the details provided we conclude that the sex of the penguin is : female
