In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the gender-classification dataset. The path is relative, so the CSV
# must be in the notebook's working directory.
data = pd.read_csv('gender_classification_v7.csv')

In [None]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [None]:
# (rows, columns) of the raw data, before any cleaning
data.shape

(5001, 8)

In [None]:
# Column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


In [None]:
# Count rows that exactly repeat an earlier row (all columns equal)
data.duplicated().sum()

1768

In [None]:
# Remove rows that are exact duplicates of an earlier row.
# NOTE(review): most features are binary flags, so identical rows may be
# distinct individuals rather than data-entry duplicates. This step removes
# 1768 of the 5001 rows; confirm that is intended.
data = data.drop_duplicates()

In [None]:
# Sanity check: no duplicate rows should remain after de-duplication
data.duplicated().sum()

0

In [None]:
# Count missing values per column
data.isnull().sum()

Unnamed: 0,0
long_hair,0
forehead_width_cm,0
forehead_height_cm,0
nose_wide,0
nose_long,0
lips_thin,0
distance_nose_to_lip_long,0
gender,0


In [None]:
# Class distribution of the 'gender' label after de-duplication
data['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Male,1783
Female,1450


In [None]:
# (rows, columns) after removing duplicate rows
data.shape

(3233, 8)

In [None]:
# Column names available in the frame
data.columns

Index(['long_hair', 'forehead_width_cm', 'forehead_height_cm', 'nose_wide',
       'nose_long', 'lips_thin', 'distance_nose_to_lip_long', 'gender'],
      dtype='object')

In [None]:
# Contingency table: gender (rows) against the long_hair flag (columns)
result = pd.crosstab(index=data['gender'], columns=data['long_hair'])
print(result)

long_hair    0     1
gender              
Female     266  1184
Male       308  1475


In [None]:
# Contingency table: gender (rows) against the nose_wide flag (columns)
result = pd.crosstab(index=data['gender'], columns=data['nose_wide'])
print(result)

nose_wide     0     1
gender               
Female     1203   247
Male        296  1487


In [None]:
# Contingency table: gender (rows) against the nose_long flag (columns)
result = pd.crosstab(index=data['gender'], columns=data['nose_long'])
print(result)

nose_long     0     1
gender               
Female     1151   299
Male        279  1504


In [None]:
# Contingency table: gender (rows) against the lips_thin flag (columns)
result = pd.crosstab(index=data['gender'], columns=data['lips_thin'])
print(result)

lips_thin     0     1
gender               
Female     1175   275
Male        317  1466


In [None]:
# Contingency table: gender (rows) against the distance_nose_to_lip_long flag (columns)
result = pd.crosstab(index=data['gender'], columns=data['distance_nose_to_lip_long'])
print(result)

distance_nose_to_lip_long     0     1
gender                               
Female                     1177   273
Male                        293  1490


In [None]:
# Shuffle the rows reproducibly: a fixed random_state gives the same row order
# on every run, so the train/test split and the cross-validation scores that
# depend on that order are repeatable.
# reset_index(drop=True) discards the pre-shuffle index and renumbers from 0.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the first rows after shuffling
data.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,0,12.0,6.4,1,1,1,1,Male
1,0,15.5,5.6,0,1,1,1,Male
2,1,13.1,5.9,1,1,1,1,Male
3,0,14.2,5.4,0,0,0,0,Female
4,1,12.4,6.8,1,1,0,1,Male


In [None]:
# Split the frame into the feature matrix X and the label vector Y
X = data.drop('gender', axis=1)  # use all columns except 'gender' as features
Y = data['gender']  # the 'gender' column is the prediction target

In [None]:
# Check that the feature matrix and the label vector have the same number of rows
X.shape, Y.shape

((3233, 7), (3233,))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split
# repeatable for a given row order.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Hyperparameter grid for the random forest: 4 x 5 x 2 x 3 = 120 combinations
param_grid = dict(
    n_estimators=[50, 75, 100, 150],   # number of trees in the forest
    max_depth=[3, 5, 7, 10, 50],       # maximum depth allowed per tree
    max_features=['sqrt', 'log2'],     # features considered at each split
    min_samples_split=[2, 5, 10],      # minimum samples required to split a node
)

In [None]:
# Initialize the Random Forest Classifier used as the base estimator for the
# grid search; random_state is fixed so each candidate forest is reproducible.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over param_grid, scoring each candidate by 3-fold
# cross-validated accuracy; verbose=1 prints a progress summary.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
)

In [None]:
# Perform the grid search on the training split only; the test split is not
# used here. Every candidate is fitted on every fold (360 fits per the log).
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


In [None]:
# Extract the cross-validation results into a DataFrame
# (one row per hyperparameter combination).
results = pd.DataFrame(grid_search.cv_results_)

In [None]:
# Select the hyperparameter columns and the mean CV score for the meta-dataset.
# .copy() makes meta_dataset an independent frame rather than a slice of
# `results`, so renaming it or adding columns to it does not raise
# SettingWithCopyWarning.
meta_columns = [
    'param_n_estimators',
    'param_max_depth',
    'param_max_features',
    'param_min_samples_split',
    'mean_test_score',
]
meta_dataset = results[meta_columns].copy()

In [None]:
# Rename columns for clarity. The result is re-bound instead of using
# inplace=True: mutating a frame that was sliced from another frame raises
# SettingWithCopyWarning, whereas rename() without inplace returns a fresh frame.
meta_dataset = meta_dataset.rename(columns={
    'param_n_estimators': 'n_estimators',
    'param_max_depth': 'max_depth',
    'param_max_features': 'max_features',
    'param_min_samples_split': 'min_samples_split',
    'mean_test_score': 'accuracy'
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_dataset.rename(columns={


In [None]:
# Dataset-level descriptors; each is a single value for the whole dataset
class_counts = Y.value_counts()
meta_features = dict(
    num_features=X.shape[1],
    num_samples=X.shape[0],
    class_imbalance_ratio=class_counts.max() / class_counts.min(),  # largest class / smallest class
    num_classes=len(Y.unique()),
)

In [None]:
# Broadcast each dataset-level meta-feature to every row as a new column.
# assign() returns a new frame with the columns added in dict order, which
# avoids the SettingWithCopyWarning raised when columns are assigned directly
# on a frame that was sliced from another frame.
meta_dataset = meta_dataset.assign(**meta_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_dataset[feature] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_dataset[feature] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_dataset[feature] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

In [None]:
# Display the first rows of the meta-dataset. A bare last expression uses the
# notebook's rich table rendering; print() falls back to plain text, which
# wraps this wide frame into two separate column chunks.
meta_dataset.head()

   n_estimators  max_depth max_features  min_samples_split  accuracy  \
0            50          3         sqrt                  2  0.951835   
1            75          3         sqrt                  2  0.950952   
2           100          3         sqrt                  2  0.951394   
3           150          3         sqrt                  2  0.951396   
4            50          3         sqrt                  5  0.951835   

   num_features  num_samples  class_imbalance_ratio  num_classes  
0             7         3233               1.229655            2  
1             7         3233               1.229655            2  
2             7         3233               1.229655            2  
3             7         3233               1.229655            2  
4             7         3233               1.229655            2  


In [None]:
# Save the meta-dataset to a CSV file for later use. The path is relative to
# the working directory, and index=False leaves the row index out of the file.
meta_dataset.to_csv('meta_dataset_gender.csv', index=False)