In [75]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [76]:
data=pd.read_csv('../artifacts/gender_classification_v7.csv')

In [77]:
data.head(5)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


## data preprocessing

In [78]:
data.shape

(5001, 8)

In [79]:
data.isnull().sum()

long_hair                    0
forehead_width_cm            0
forehead_height_cm           0
nose_wide                    0
nose_long                    0
lips_thin                    0
distance_nose_to_lip_long    0
gender                       0
dtype: int64

In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


In [152]:
data['gender'].value_counts()

gender
Female    2501
Male      2500
Name: count, dtype: int64

In [155]:
x = data.drop('gender', axis=1)
y=data['gender']

In [173]:
categorical_cols = ['long_hair', 'nose_wide', 'nose_long', 'lips_thin', 'distance_nose_to_lip_long']
x_encoded = pd.get_dummies(x, columns=categorical_cols)

In [183]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size=0.25)

In [184]:
print(x.shape,x_test.shape)

(5001, 7) (1251, 12)


In [187]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9688249400479616
Classification Report:
              precision    recall  f1-score   support

      Female       0.97      0.97      0.97       632
        Male       0.97      0.97      0.97       619

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251



In [188]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9720223820943246
Classification Report:
              precision    recall  f1-score   support

      Female       0.97      0.98      0.97       632
        Male       0.98      0.97      0.97       619

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251



In [189]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9672262190247801
Classification Report:
              precision    recall  f1-score   support

      Female       0.96      0.98      0.97       632
        Male       0.98      0.96      0.97       619

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251



In [190]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9568345323741008
Classification Report:
              precision    recall  f1-score   support

      Female       0.96      0.96      0.96       632
        Male       0.96      0.96      0.96       619

    accuracy                           0.96      1251
   macro avg       0.96      0.96      0.96      1251
weighted avg       0.96      0.96      0.96      1251



In [193]:
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Train and evaluate classifiers
results = {}
for name, clf in classifiers.items():
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'classification_report': classification_report(y_test, y_pred)
    }

# Print results
for name, result in results.items():
    print(f"{name}:")
    print(f"Accuracy: {result['accuracy']}")
    print("Classification Report:")
    print(result['classification_report'])
    print("\n")

Random Forest:
Accuracy: 0.9688249400479616
Classification Report:
              precision    recall  f1-score   support

      Female       0.97      0.97      0.97       632
        Male       0.97      0.97      0.97       619

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251



SVM:
Accuracy: 0.9720223820943246
Classification Report:
              precision    recall  f1-score   support

      Female       0.97      0.98      0.97       632
        Male       0.98      0.97      0.97       619

    accuracy                           0.97      1251
   macro avg       0.97      0.97      0.97      1251
weighted avg       0.97      0.97      0.97      1251



KNN:
Accuracy: 0.9704236610711431
Classification Report:
              precision    recall  f1-score   support

      Female       0.97      0.98      0.97       632
        Male       0.98      0.96      0.97       61

In [194]:
import pickle
with open('predictor.pickle', 'wb') as file:
    pickle.dump(rf, file)

In [195]:
column_names = x_test.columns.tolist()
print(column_names)

['forehead_width_cm', 'forehead_height_cm', 'long_hair_0', 'long_hair_1', 'nose_wide_0', 'nose_wide_1', 'nose_long_0', 'nose_long_1', 'lips_thin_0', 'lips_thin_1', 'distance_nose_to_lip_long_0', 'distance_nose_to_lip_long_1']


In [196]:
pred_value = rf.predict([[14.2,5.9,1,0,1,0,1,1,0,1,0,1]])
pred_value



array(['Male'], dtype=object)

In [198]:
pred_value1 = rf.predict([[12.8,5.8,0,1,1,0,1,0,1,0,1,0]])
pred_value1



array(['Female'], dtype=object)

In [177]:
data.groupby('gender').mean()

Unnamed: 0_level_0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,0.873251,12.811675,5.796321,0.114754,0.135946,0.121551,0.121551
Male,0.866,13.55144,6.09636,0.8732,0.88,0.8648,0.8764
