In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow.parquet as pq
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import os

In [2]:
def flattenData(path):
    df = pd.read_parquet(path)
    df = pad_or_truncate(df)
    flatten = df.to_numpy().flatten()
    return flatten

def pad_or_truncate(data):
    if len(data)>50:
        return truncate_start(data)
    elif len(data)<50:
        return pad_end(data)
    else:
        return data
def pad_end(data):
    rows_to_add = 50 - len(data)
    padded_data = {}
    for column in data.columns:
        padded_data[column] = [0] * rows_to_add
    return data.append(pd.DataFrame(padded_data))

def truncate_start(data):
    return data.iloc[:50, :]

In [3]:
pca_files = []
for dirname, _, filenames in os.walk('pca_files'):
    for filename in filenames:
        if filename.endswith('.parquet'):
            pca_files.append(os.path.join(dirname, filename))
len(pca_files)

94477

In [4]:
pca_files[0]

'pca_files/dataset4/asl-signs/train_landmark_files/4718/1160474191.parquet'

In [5]:
train = pd.read_csv("final_train.csv")

In [6]:

y = train['label'].values
X = np.stack([flattenData("pca_files/"+i) for i in tqdm(train['path'])])

100%|██████████| 94477/94477 [04:46<00:00, 329.67it/s]


In [None]:
print(X.shape)
print(y.shape)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# SVM


In [10]:


from sklearn.svm import SVC

clf = SVC(kernel='rbf', decision_function_shape='ovr')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)

# Compute the classification report
cr = classification_report(y_test, y_pred)
print('Classification Report:\n', cr)

Accuracy: 0.01153683319220999
Confusion Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        43
           2       0.00      0.00      0.00        35
           3       0.00      0.00      0.00        34
           4       0.01      0.02      0.02        44
           5       0.00      0.00      0.00        32
           6       0.00      0.00      0.00        31
           7       0.00      0.00      0.00        29
           8       0.00      0.00      0.00        37
           9       0.00      0.00      0.00        38
          10       0.02      0.06      0.03        36
          11       0.00      0.00      0.00        30
          12       0.03      0.02      0.02        47
          13       0.00      0.00      0.00       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
import json
def read_json(path):
    with open(path, "r") as file:
        json_data = json.load(file)
    return json_data
s2p_map = read_json(os.path.join("sign_to_prediction_index_map.json"))
p2s_map = {v: k for k, v in s2p_map.items()}

encoder = lambda x: s2p_map.get(x)
decoder = lambda x: p2s_map.get(x)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
df_cm = pd.DataFrame(cm / np.sum(cm, axis=1)[:, None], index = [v for k,v in p2s_map.items()],
                     columns = [v for k,v in p2s_map.items()])
plt.figure(figsize = (250,250))
sn.heatmap(df_cm, annot=True)
plt.savefig('SVM-CM.png')

# Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create Random Forest classifier object
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train classifier on training data
clf.fit(X_train, y_train)

# Predict labels for test data
y_pred = clf.predict(X_test)

# Calculate accuracy of predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.009949195596951736


# KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Create a K-NN classifier object
clf = KNeighborsClassifier(n_neighbors = 293)

# Fit the model on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy of predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.009208298052497883


# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a Gradient Boosting classifier object
clf = GradientBoostingClassifier(n_estimators=100)

# Fit the model on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy of predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the fitted scaler
X_test_scaled = scaler.transform(X_test)


# Create a Naive Bayes classifier object
clf = MultinomialNB()

# Fit the model on the training data
clf.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = clf.predict(X_test_scaled)

# Calculate accuracy of predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.005080440304826418


# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression classifier object
clf = LogisticRegression()

# Fit the model on the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy of predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.006773920406435224


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from xgboost import XGBClassifier

# Create the classifier
xgb_clf = XGBClassifier()

# Train the classifier
xgb_clf.fit(X_train, y_train)

# Test the classifier
accuracy = xgb_clf.score(X_test, y_test)
print(f"Accuracy: {accuracy}")


