In [None]:
from IPython.display import Image
Image(filename="images/american_sign_language.PNG", width= 800, height=500)

### The steps demonstrated are:

1. Loading the data
2. Familiarizing with data
3. Converting the pandas Dataframe into Numpy Arrays
4. Visualizing the data
5. Data Preprocessing & EDA
6. Training the data
7. Model Performance Comparison
8. Conclusion

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")
from sklearn import metrics 
from sklearn.decomposition import PCA

## 1. Loading Data

The Dataset is downloaded from https://www.kaggle.com/datamunge/sign-language-mnist . 

The dataset format is patterned to match closely with the classic MNIST. Each training and test case represents a label (0-25) as a one-to-one map for each alphabetic letter A-Z. The training data (27,455 cases) and test data (7,172 cases) are approximately half the size of the standard MNIST but otherwise similar, with a header row of `label, pixel1, pixel2, …, pixel784`; the pixel columns represent a single 28x28 pixel image with grayscale values between 0 and 255.

In [None]:
train_dataframe = pd.read_csv("input/sign_mnist_train.csv")

In [None]:
test_dataframe = pd.read_csv("input/sign_mnist_test.csv")

## 2. Familiarizing with Data

In [None]:
train_dataframe.head()

In [None]:
train_dataframe.shape

In [None]:
train_dataframe.isna().sum()

In [None]:
train_dataframe.info()

In [None]:
train_dataframe.describe().T

In [None]:
test_dataframe.head()

In [None]:
test_dataframe.shape

In [None]:
test_dataframe.info()

In [None]:
test_dataframe.isna().sum()

In [None]:
test_dataframe.describe().T

## 3. Converting the pandas Dataframe into Numpy Arrays

In [None]:
# Features are every column after the first one; the target is the 'label' column.
inputs_array_train, targets_array_train = (
    train_dataframe.iloc[:, 1:].to_numpy(),
    train_dataframe['label'].to_numpy(),
)
inputs_array_test, targets_array_test = (
    test_dataframe.iloc[:, 1:].to_numpy(),
    test_dataframe['label'].to_numpy(),
)

## 4. Visualizing the data

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(x="label",data=train_dataframe);

In [None]:
train_dataframe['label'].value_counts().plot(kind='pie',autopct='%1.1f%%',figsize=(7,7))
plt.title("Label in Total")
plt.show()

In [None]:

def Show_Train_Image(row):
    """Display one training sample side by side in two colormaps.

    Prints the sample's label as a letter (label 0 -> 'A', 1 -> 'B', ...) and
    draws the image once with matplotlib's default colormap and once in gray.

    Args:
        row: Integer row index into ``inputs_array_train`` /
            ``targets_array_train``.
    """
    # Each sample is stored as a flat pixel vector; restore the 28x28 grid.
    img = np.reshape(inputs_array_train[row], (28, 28))
    print("Actual Label : ", chr(ord('A') + targets_array_train[row]))

    # The figure size is set here; the former `fig.figsize = (1, 1)` only
    # attached an unused attribute to the Figure and never resized anything.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))

    ax1.set_title("Original Image")
    ax1.axis("off")
    ax1.imshow(img)

    ax2.set_title("Gray Image")
    ax2.axis("off")
    ax2.imshow(img, cmap='gray')



def Show_Test_Image(row, y_test):
    """Display one test sample together with its actual and predicted label.

    Both labels are printed as letters (label 0 -> 'A', 1 -> 'B', ...). The
    image is drawn once with matplotlib's default colormap and once in gray.

    Args:
        row: Integer row index into ``inputs_array_test`` /
            ``targets_array_test`` and into ``y_test``.
        y_test: Predicted labels indexed the same way as the test arrays
            (callers pass the output of ``model.predict(X_test)``).
    """
    # Read from the TEST arrays: predictions in `y_test` are aligned with the
    # test set, so indexing the training arrays would show an unrelated image
    # and an unrelated "actual" label.
    img = np.reshape(inputs_array_test[row], (28, 28))
    print("Actual Label : ", chr(ord('A') + targets_array_test[row]))
    print("Predicted Label : ", chr(ord('A') + y_test[row]))

    # The figure size is set here; the former `fig.figsize = (1, 1)` only
    # attached an unused attribute to the Figure and never resized anything.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))

    ax1.set_title("Original Image")
    ax1.axis("off")
    ax1.imshow(img)

    ax2.set_title("Gray Image")
    ax2.axis("off")
    ax2.imshow(img, cmap='gray')

In [None]:
Show_Train_Image(345)

In [None]:
Show_Train_Image(3256)

## 5. Data Preprocessing & EDA

In [None]:
# Pixel values span 0-255, so dividing by 255 rescales every feature to the 0-1 range.
X_train, X_test = inputs_array_train / 255, inputs_array_test / 255

# Labels need no transformation; the names simply mirror the X_* naming.
y_train, y_test = targets_array_train, targets_array_test

In [None]:
# Fit PCA with all components kept to see how explained variance accumulates.
pca = PCA().fit(X_train)

plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
# y[k-1] is the fraction of total variance explained by the first k components.
y = np.cumsum(pca.explained_variance_ratio_)
# Component counts 1..n, derived from the fitted model instead of a hard-coded 785.
x = np.arange(1, len(y) + 1, step=1)

# Plot every 5th point of the first 200 components to keep the curve readable.
plt.plot(x[:200:5], y[:200:5], marker='o', linestyle='--', color='b')
plt.xlabel('Number of Components')
# Put the ticks on the plotted component counts (1, 6, 11, ...) so the markers
# and the grid lines coincide.
plt.xticks(x[:200:5], rotation=90)
# The axis shows a 0-1 fraction, not a percentage.
plt.ylabel('Cumulative explained variance (fraction)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red')

ax.grid(axis='x')
plt.show()

# Smallest component count whose cumulative explained variance exceeds 95%.
above_threshold = np.flatnonzero(y > 0.95)
if above_threshold.size:
    print('The number of components needed to explain variance : ', x[above_threshold[0]])

## 6. Model Building & Training:
   
 The machine learning models considered to train the dataset are:

1. Logistic Regression
2. Decision Tree
3. Support Vector Machine
4. Random Forest
5. Naive Bayes Classifier
6. k-Nearest Neighbors

              
The metrics considered to evaluate the model performance are Accuracy, Classification_report & Confusion Matrix.

In [None]:
# Parallel result columns: entry i of each list describes the same model.
ML_Model, acc_train, acc_test = [], [], []


def storeResults(model, a, b):
    """Append one model's scores to the module-level result lists.

    Args:
        model: Value appended to ``ML_Model`` (callers pass the model's display name).
        a: Score appended to ``acc_train``, rounded to 3 decimals.
        b: Score appended to ``acc_test``, rounded to 3 decimals.
    """
    ML_Model.append(model)
    for column, score in ((acc_train, a), (acc_test, b)):
        column.append(round(score, 3))

## 6.1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# PCA down to 113 components, then logistic regression.
# - `multi_class` is left at its default: with the default lbfgs solver and more
#   than two classes the default already fits a multinomial model, and passing
#   the argument explicitly is deprecated in recent scikit-learn releases.
# - PCA is seeded because, depending on the scikit-learn version, the 'auto'
#   solver may pick a randomized SVD for this problem size.
log = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),
                ('model', LogisticRegression(random_state=0))])

# fit the model 
log.fit(X_train,y_train)

In [None]:
y_train_log = log.predict(X_train)
y_test_log = log.predict(X_test)

#### Performance Evaluation:

In [None]:
acc_train_log = metrics.accuracy_score(y_train,y_train_log)
acc_test_log = metrics.accuracy_score(y_test,y_test_log)

print("Logistic Regression : Accuracy on training Data: {:.3f}".format(acc_train_log))
print("Logistic Regression: Accuracy on test Data: {:.3f}".format(acc_test_log))

In [None]:
print(metrics.classification_report(y_test, y_test_log))

In [None]:
# Confusion matrix on the test set, normalised per actual class (row-wise).
# The class order is made explicit so the axes can be labelled with letters;
# it is the same sorted union of labels that confusion_matrix uses by default.
cm_labels = np.unique(np.concatenate([y_test, y_test_log]))
cm_letters = [chr(ord('A') + label) for label in cm_labels]
cm=metrics.confusion_matrix(y_test,y_test_log, labels=cm_labels)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# A single figure only: the former extra plt.figure() call rendered an empty figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cmn, annot=True, fmt='.2f',cmap='Oranges',
            xticklabels=cm_letters, yticklabels=cm_letters, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show(block=False);

#### Storing Results:

In [None]:
storeResults('Logistic Regression',acc_train_log, acc_test_log)

#### Storing Model:

In [None]:
import joblib
joblib.dump(log,'model/logistic_model')

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(3252,y_test_log)

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(246,y_test_log)

## 6.2. Decision Trees : Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# PCA down to 113 components, then a decision tree capped at depth 30.
# Both steps are seeded so that re-running the notebook reproduces the same
# tree and therefore the same reported accuracies.
tree = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),
                 ('model', DecisionTreeClassifier(max_depth=30, random_state=0))])

tree.fit(X_train, y_train)

In [None]:
y_train_tree = tree.predict(X_train)
y_test_tree = tree.predict(X_test)

#### Performance Evaluation:

In [None]:
acc_train_tree = metrics.accuracy_score(y_train,y_train_tree)
acc_test_tree = metrics.accuracy_score(y_test,y_test_tree)


print("Decision Tree : Accuracy on training Data: {:.3f}".format(acc_train_tree))
print("Decision Tree : Accuracy on test Data: {:.3f}".format(acc_test_tree))

In [None]:
print(metrics.classification_report(y_test, y_test_tree))

In [None]:
# Confusion matrix on the test set, normalised per actual class (row-wise).
# The class order is made explicit so the axes can be labelled with letters;
# it is the same sorted union of labels that confusion_matrix uses by default.
cm_labels = np.unique(np.concatenate([y_test, y_test_tree]))
cm_letters = [chr(ord('A') + label) for label in cm_labels]
cm=metrics.confusion_matrix(y_test,y_test_tree, labels=cm_labels)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# A single figure only: the former extra plt.figure() call rendered an empty figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cmn, annot=True, fmt='.2f',cmap='Oranges',
            xticklabels=cm_letters, yticklabels=cm_letters, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show(block=False);

In [None]:
# Sweep max_depth from 1 to 30 and record train/test accuracy for each depth.
training_accuracy = []
test_accuracy = []

depth = range(1,31)
for n in depth:
    # Seed both steps so the curve is identical on every run and only
    # max_depth varies between iterations.
    tree_test = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),
                    ('model', DecisionTreeClassifier(max_depth=n, random_state=0))])

    tree_test.fit(X_train, y_train)

    training_accuracy.append(tree_test.score(X_train, y_train))
    test_accuracy.append(tree_test.score(X_test, y_test))


plt.plot(depth, training_accuracy, label="training accuracy")
plt.plot(depth, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")  
plt.xlabel("max_depth")
plt.legend();

#### Storing Results:

In [None]:
storeResults('Decision Tree',acc_train_tree, acc_test_tree)

#### Storing Model:

In [None]:
import joblib
joblib.dump(tree,'model/tree_model')

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(3253,y_test_tree)

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(2356,y_test_tree)

## 6.3. Support Vector Machine : Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# PCA down to 113 components, then an SVC with default settings.
# PCA is seeded because, depending on the scikit-learn version, the 'auto'
# solver may pick a randomized SVD for this problem size.
svc= Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),('model', SVC())])

svc.fit(X_train,y_train)

In [None]:
y_train_svc = svc.predict(X_train)
y_test_svc = svc.predict(X_test)

#### Performance Evaluation:

In [None]:
acc_train_svc = metrics.accuracy_score(y_train,y_train_svc)
acc_test_svc = metrics.accuracy_score(y_test,y_test_svc)

print("Support Vector Machine : Accuracy on training Data: {:.3f}".format(acc_train_svc))
print("Support Vector Machine : Accuracy on test Data: {:.3f}".format(acc_test_svc))

In [None]:
print(metrics.classification_report(y_test, y_test_svc))

In [None]:
# Confusion matrix on the test set, normalised per actual class (row-wise).
# The class order is made explicit so the axes can be labelled with letters;
# it is the same sorted union of labels that confusion_matrix uses by default.
cm_labels = np.unique(np.concatenate([y_test, y_test_svc]))
cm_letters = [chr(ord('A') + label) for label in cm_labels]
cm=metrics.confusion_matrix(y_test,y_test_svc, labels=cm_labels)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# A single figure only: the former extra plt.figure() call rendered an empty figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cmn, annot=True, fmt='.2f',cmap='Oranges',
            xticklabels=cm_letters, yticklabels=cm_letters, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show(block=False);

#### Storing Results:

In [None]:
storeResults('Support Vector Machine',acc_train_svc, acc_test_svc)

#### Storing Model:

In [None]:
import joblib
joblib.dump(svc,'model/SVM_model')

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(1214,y_test_svc)

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(3233,y_test_svc)

## 6.4. Random Forest : Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# PCA down to 113 components, then a 100-tree random forest.
# Both steps are seeded so that re-running the notebook reproduces the same
# forest and therefore the same reported accuracies.
forest = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),
                ('model', RandomForestClassifier(n_estimators=100, random_state=0))])

forest.fit(X_train,y_train)

In [None]:
y_train_forest = forest.predict(X_train)
y_test_forest = forest.predict(X_test)

#### Performance Evaluation:

In [None]:
acc_train_forest = metrics.accuracy_score(y_train,y_train_forest)
acc_test_forest = metrics.accuracy_score(y_test,y_test_forest)

print("Random Forest : Accuracy on training Data: {:.3f}".format(acc_train_forest))
print("Random Forest : Accuracy on test Data: {:.3f}".format(acc_test_forest))

In [None]:
print(metrics.classification_report(y_test, y_test_forest))

In [None]:
# Confusion matrix on the test set, normalised per actual class (row-wise).
# The class order is made explicit so the axes can be labelled with letters;
# it is the same sorted union of labels that confusion_matrix uses by default.
cm_labels = np.unique(np.concatenate([y_test, y_test_forest]))
cm_letters = [chr(ord('A') + label) for label in cm_labels]
cm=metrics.confusion_matrix(y_test,y_test_forest, labels=cm_labels)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# A single figure only: the former extra plt.figure() call rendered an empty figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cmn, annot=True, fmt='.2f',cmap='Oranges',
            xticklabels=cm_letters, yticklabels=cm_letters, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show(block=False);

In [None]:
# Sweep n_estimators over 1, 11, ..., 101 and record train/test accuracy.
training_accuracy = []
test_accuracy = []
# Named for what it holds: these are forest sizes, not tree depths.
n_estimators_grid = range(1,102,10)
for n in n_estimators_grid:
    # Seed both steps so the curve is identical on every run and only
    # n_estimators varies between iterations.
    forest_test = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),
                       ('model', RandomForestClassifier(n_estimators=n, random_state=0))])

    forest_test.fit(X_train, y_train)

    training_accuracy.append(forest_test.score(X_train, y_train))
    test_accuracy.append(forest_test.score(X_test, y_test))


plt.figure(figsize=None)
plt.plot(n_estimators_grid, training_accuracy, label="training accuracy")
plt.plot(n_estimators_grid, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")  
plt.xlabel("n_estimators")
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend();

#### Storing Results:

In [None]:
storeResults('Random Forest',acc_train_forest, acc_test_forest)

#### Storing Model:

In [None]:
import joblib
joblib.dump(forest,'model/forest_model')

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(6532,y_test_forest)

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(32,y_test_forest)

## 6.5. Naive Bayes : Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# PCA down to 113 components, then Gaussian naive Bayes.
# PCA is seeded because, depending on the scikit-learn version, the 'auto'
# solver may pick a randomized SVD for this problem size.
nb= Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),('model', GaussianNB())])

nb.fit(X_train,y_train)

In [None]:
y_train_nb = nb.predict(X_train)
y_test_nb = nb.predict(X_test)

#### Performance Evaluation:

In [None]:
acc_train_nb =  metrics.accuracy_score(y_train,y_train_nb)
acc_test_nb =  metrics.accuracy_score(y_test,y_test_nb)

print("Naive Bayes : Accuracy on training Data: {:.3f}".format(acc_train_nb))
print("Naive Bayes : Accuracy on test Data: {:.3f}".format(acc_test_nb))

In [None]:
print(metrics.classification_report(y_test, y_test_nb))

In [None]:
# Confusion matrix on the test set, normalised per actual class (row-wise).
# The class order is made explicit so the axes can be labelled with letters;
# it is the same sorted union of labels that confusion_matrix uses by default.
cm_labels = np.unique(np.concatenate([y_test, y_test_nb]))
cm_letters = [chr(ord('A') + label) for label in cm_labels]
cm=metrics.confusion_matrix(y_test,y_test_nb, labels=cm_labels)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# A single figure only: the former extra plt.figure() call rendered an empty figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cmn, annot=True, fmt='.2f',cmap='Oranges',
            xticklabels=cm_letters, yticklabels=cm_letters, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show(block=False);

#### Storing Results:

In [None]:
storeResults('Naive Bayes',acc_train_nb, acc_test_nb)

#### Storing Model:

In [None]:
import joblib
joblib.dump(nb,'model/naive_bayes_model')

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(6532,y_test_nb)

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(25,y_test_nb)

## 6.6. K-Nearest Neighbors : Classifier


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# PCA down to 113 components, then k-nearest neighbours with default settings.
# PCA is seeded because, depending on the scikit-learn version, the 'auto'
# solver may pick a randomized SVD for this problem size.
knn = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),('model', KNeighborsClassifier())])

knn.fit(X_train,y_train)

In [None]:
y_train_knn = knn.predict(X_train)
y_test_knn = knn.predict(X_test)

#### Performance Evaluation:

In [None]:
# Accuracy of the fitted KNN pipeline on the training and the test split.
acc_train_knn = metrics.accuracy_score(y_train,y_train_knn)
acc_test_knn = metrics.accuracy_score(y_test,y_test_knn)

# Message typo fixed: "neighest" -> "nearest".
print("K nearest neighbour : Accuracy on training Data: {:.3f}".format(acc_train_knn))
print("K nearest neighbour : Accuracy on test Data: {:.3f}".format(acc_test_knn))

In [None]:
print(metrics.classification_report(y_test, y_test_knn))

In [None]:
# Confusion matrix on the test set, normalised per actual class (row-wise).
# The class order is made explicit so the axes can be labelled with letters;
# it is the same sorted union of labels that confusion_matrix uses by default.
cm_labels = np.unique(np.concatenate([y_test, y_test_knn]))
cm_letters = [chr(ord('A') + label) for label in cm_labels]
cm=metrics.confusion_matrix(y_test,y_test_knn, labels=cm_labels)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# A single figure only: the former extra plt.figure() call rendered an empty figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cmn, annot=True, fmt='.2f',cmap='Oranges',
            xticklabels=cm_letters, yticklabels=cm_letters, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show(block=False);

In [None]:
# Sweep n_neighbors over 1, 6, ..., 51 and record train/test accuracy.
training_accuracy = []
test_accuracy = []

# Named for what it holds: these are neighbour counts, not tree depths.
n_neighbors_grid = range(1,52,5)
for n in n_neighbors_grid:
    # Use a throwaway name for the sweep model. Rebinding `knn` here would
    # replace the evaluated default-k pipeline with the last sweep model
    # (n_neighbors=51), and that is the object the later joblib.dump(knn, ...)
    # cell would save.
    knn_test = Pipeline([('reduce_dims', PCA(n_components=113, random_state=0)),
                    ('model', KNeighborsClassifier(n_neighbors=n))])

    knn_test.fit(X_train, y_train)

    training_accuracy.append(knn_test.score(X_train, y_train))
    test_accuracy.append(knn_test.score(X_test, y_test))


plt.plot(n_neighbors_grid, training_accuracy, label="training accuracy")
plt.plot(n_neighbors_grid, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")  
plt.xlabel("n_neighbors")
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend();


#### Storing Results:

In [None]:
storeResults('K-Nearest Neighbors',acc_train_knn, acc_test_knn)

#### Storing Model:

In [None]:
import joblib
joblib.dump(knn,'model/kneighbors_model')

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(632,y_test_knn)

In [None]:
# Checking Image for Actual and Predicted Label
Show_Test_Image(5432,y_test_knn)

## 7. Comparison of Models:

In [None]:
result = pd.DataFrame({ 'ML Model': ML_Model,
                        'Train Accuracy': acc_train,
                        'Test Accuracy': acc_test,})

In [None]:
result

In [None]:
sorted_result=result.sort_values(by=['Test Accuracy', 'Train Accuracy'],ascending=False).reset_index(drop=True)

In [None]:
sorted_result

## 8. Conclusion:

1. Successfully implemented and tested various classifier models to classify images of American Sign Language letters.
2. Among all the trained models, the Support Vector Machine gives the best accuracy on both the training and the testing datasets.