<a href="https://colab.research.google.com/github/Scoobyz95/RandomForest_project/blob/main/RandomForest_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
sns.set()

%matplotlib inline

# Graphic setup
rcParams['figure.figsize'] = 20, 10

In [2]:
# Load the dataset (a CSV file) directly from the project's GitHub repository
# via its raw-content URL, so the notebook needs no local copy of the data.
url = "https://raw.githubusercontent.com/Scoobyz95/RandomForest_project/main/vocal_gender_features_new.csv"
df = pd.read_csv(url)

In [3]:
# True if at least one cell anywhere in the dataframe is missing (NaN/None), False otherwise
df.isna().to_numpy().any()


False

In [4]:
# Inspect the dimensions of the dataframe as a (rows, columns) tuple
df.shape


(16148, 44)

In [5]:
# Columns used as model inputs: first the spectral, energy and pitch descriptors...
features = [
    'mean_spectral_centroid', 'std_spectral_centroid',
    'mean_spectral_bandwidth', 'std_spectral_bandwidth',
    'mean_spectral_contrast', 'mean_spectral_flatness', 'mean_spectral_rolloff',
    'zero_crossing_rate', 'rms_energy',
    'mean_pitch', 'min_pitch', 'max_pitch', 'std_pitch',
    'spectral_skew', 'spectral_kurtosis',
    'energy_entropy', 'log_energy',
]
# ...then the mean and standard deviation of MFCC coefficients 1 through 13,
# in the order mfcc_1_mean, mfcc_1_std, ..., mfcc_13_mean, mfcc_13_std
features += [f'mfcc_{idx}_{stat}' for idx in range(1, 14) for stat in ('mean', 'std')]

# Name of the target column to predict (male or female)
etichetta = 'label'

# X holds the model inputs: only the columns named in the 'features' list
X = df[features]
# Y holds the target: the single column named by 'etichetta'
Y = df[etichetta]

In [6]:
from sklearn.model_selection import train_test_split   # Utility that splits data into training and test subsets

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed seed (random_state=45) makes the split reproducible from run to run.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,                 # the features of the dataset
    Y,                 # the label (target) of the dataset
    test_size=0.2,
    random_state=45,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier # Import the Random Forest classifier from the scikit-learn library

In [8]:
Metrica = 'gini'  # Impurity metric used to score candidate splits in each tree ('gini' or 'entropy')

num_estimators = 100  # Number of trees to grow in the forest
no_jobs = 4  # Number of parallel workers used while fitting and predicting

# Create the Random Forest model from the settings defined above
clf = RandomForestClassifier(
    n_jobs=no_jobs,               # run on `no_jobs` workers in parallel
    random_state=42,              # fixed seed so the results are reproducible
    criterion=Metrica,            # split-quality metric chosen above
    n_estimators=num_estimators,  # use the variable instead of a duplicated literal, so changing it takes effect
    verbose=True,                 # print progress information during fit/predict
)

In [9]:
clf.fit(train_X, train_Y) # Fit the Random Forest on the training features (train_X) and their labels (train_Y)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    8.7s finished


In [10]:
preds = clf.predict(test_X)  # Predict a label for every row of the held-out test features (test_X) with the trained model
preds

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


array([0, 1, 1, ..., 1, 0, 0])

In [11]:
from sklearn.metrics import confusion_matrix  # scikit-learn helper that tabulates actual vs. predicted labels

# Compare the actual labels (test_Y) with the predictions (preds) and flatten the
# resulting 2x2 confusion matrix into its four counts, in the order [tn, fp, fn, tp]:
#   - tn (True Negative):  negative samples correctly predicted as negative
#   - fp (False Positive): negative samples incorrectly predicted as positive
#   - fn (False Negative): positive samples incorrectly predicted as negative
#   - tp (True Positive):  positive samples correctly predicted as positive
tn, fp, fn, tp = confusion_matrix(test_Y, preds).ravel()
tn, fp, fn, tp


(1154, 10, 4, 2062)

In [12]:
# Keep the full 2x2 confusion matrix of actual labels (test_Y) versus predictions (preds);
# flattened row by row it reads [tn, fp, fn, tp].
matrix = confusion_matrix(test_Y, preds)
matrix


array([[1154,   10],
       [   4, 2062]])

In [13]:
from sklearn.metrics import precision_score, recall_score  # Functions that compute precision and recall

# The results are stored under their own names (`precision`, `recall`). Assigning them
# back to `precision_score` / `recall_score` would replace the imported functions with
# floats, and any later call to those functions would fail with
# "TypeError: 'float' object is not callable".

# Precision: the ratio of true positives to all predicted positives, rounded to 2 decimal places
precision = round(precision_score(test_Y, preds), 2)

# Recall: the ratio of true positives to all actual positives, rounded to 2 decimal places
recall = round(recall_score(test_Y, preds), 2)

# Print the precision
print("Precision: " + str(precision))

# Print the recall
print("Recall: " + str(recall))


Precision: 1.0
Recall: 1.0
