# In [1]:
import pickle #imports the pickle module, which provides functionality for serializing and deserializing Python objects
import sklearn #Scikit-learn is a popular machine learning library in Python that provides various tools for data preprocessing, model selection, and evaluation
'''
'RandomForestClassifier' is an implementation of the random forest algorithm for classification tasks. 
It is a powerful ensemble learning method that combines multiple decision trees to make predictions
'''
from sklearn.ensemble import RandomForestClassifier #imports the 'RandomForestClassifier' class from the 'sklearn.ensemble module'
'''
'train_test_split' is a utility function in 'scikit-learn' that splits a dataset into training and testing subsets. 
It is commonly used for evaluating machine learning models by training them on a portion of the data and 
testing their performance on the remaining data
'''
from sklearn.model_selection import train_test_split #imports the 'train_test_split' function from the 'sklearn.model_selection' module
'''
'accuracy_score' is a metric in 'scikit-learn' that measures the accuracy of a classification model by comparing the predicted labels with the true labels. 
It is commonly used to evaluate the performance of classification models.
'''
from sklearn.metrics import accuracy_score # imports the accuracy_score function from the sklearn.metrics module
import numpy as np # imports the 'numpy' module as a name np
import os #imports the os module, which provides a way to interact with the operating system


# Deserialize the object stored in './ASL2.pickle'; it is indexed below with
# the 'data' and 'labels' keys. The context manager guarantees the file handle
# is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only load this file if it comes from a trusted source.
with open('./ASL2.pickle', 'rb') as dataset_file:
    data_dict = pickle.load(dataset_file)

# Path to the dataset directory (not read anywhere in this section).
data_dir = './Single Hand Sign Datasets'

# Convert the values stored under 'data' and 'labels' into NumPy arrays.
data_one = np.asarray(data_dict['data'])
labels1 = np.asarray(data_dict['labels'])

# Split the features (data_one) and their labels (labels1) into training and
# testing subsets:
#   - test_size=0.2    -> 20% of the samples are held out for testing and the
#                         remaining 80% are used for training
#   - shuffle=True     -> samples are shuffled before being split
#   - stratify=labels1 -> both subsets keep a label distribution similar to
#                         that of the full dataset
# No random_state is passed, so every run produces a different split.
# x_train / y_train feed the classifier's training; x_test / y_test are used
# afterwards to evaluate it.
x_train, x_test, y_train, y_test = train_test_split(
    data_one,
    labels1,
    test_size=0.2,
    shuffle=True,
    stratify=labels1,
)

# Create a random forest classifier with scikit-learn's default settings and
# train it on the training features and labels. fit() returns the estimator
# itself, so model1 ends up bound to the trained classifier.
model1 = RandomForestClassifier().fit(x_train, y_train)

# Use the trained classifier to predict a label for every test sample.
y_predict = model1.predict(x_test)

# Fraction of test samples whose predicted label matches the true label.
# accuracy_score's signature is (y_true, y_pred), so the true labels are
# passed first; accuracy is symmetric, so the reported value is unchanged.
score1 = accuracy_score(y_test, y_predict)

# Print the accuracy as a percentage.
print(f'{score1 * 100}% of samples were classified correctly for model1 !')

# Serialize the trained classifier to 'model_ASL2.p' with pickle, stored in a
# dict under the 'model1' key. The context manager closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open('model_ASL2.p', 'wb') as f:
    pickle.dump({'model1': model1}, f)

# Output: 99.99078510873571% of samples were classified correctly for model1 !
