In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow.parquet as pq
from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every .parquet file found anywhere under /kaggle/input.
data = [
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
    if fname.endswith('.parquet')
]

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import warnings


# Find, for every parquet file, how many PCA components are needed to retain 90% of its variance.

def expectedVarianceRatioCumSum(data):
    """Return the cumulative explained-variance ratio of a full PCA of ``data``.

    ``data`` is standardised column-wise, missing values are then replaced by 0,
    and a PCA keeping ``min(n_rows, n_columns)`` components is fitted.

    Returns
    -------
    numpy.ndarray
        Element ``k`` is the fraction of variance explained by the first
        ``k + 1`` components.
    """
    pipeline = make_pipeline(
        StandardScaler(),
        SimpleImputer(strategy='constant', fill_value=0),
    )
    # The pipeline is fitted inside this context so any warning it raises is silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        prepared = pipeline.fit_transform(data)
    n_rows, n_cols = prepared.shape[0], prepared.shape[1]
    fitted = PCA(n_components=min(n_rows, n_cols))
    fitted.fit(prepared)
    return fitted.explained_variance_ratio_.cumsum()
    

def find_best_n_components():
    """Compute the cumulative explained-variance curve of every file in ``data``.

    Reads each path of the module-level ``data`` list, reshapes it with
    ``preprocess_data`` and runs ``expectedVarianceRatioCumSum`` on the result.

    Returns
    -------
    list
        One cumulative-variance array per path, in the order of ``data``.
    """
    return [
        expectedVarianceRatioCumSum(preprocess_data(pd.read_parquet(path)))
        for path in tqdm(data)
    ]


In [9]:
# Compute one cumulative explained-variance curve per parquet file listed in `data`.
# NOTE(review): find_best_n_components() calls preprocess_data, which this notebook
# only defines in a later cell, so that cell has to be executed first.
variances = find_best_n_components()
# Number of curves that were computed.
len(variances)

100%|██████████| 94477/94477 [1:58:19<00:00, 13.31it/s]  


94477

In [10]:
# For each file, record the 1-based position of the first cumulative-variance
# value that exceeds 0.9. A curve that never exceeds 0.9 contributes no entry.
components = []
for variance in variances:
    first_hit = next(
        (position for position, ratio in enumerate(variance, start=1) if ratio > 0.9),
        None,
    )
    if first_hit is not None:
        components.append(first_hit)

In [11]:
# Number of files for which a component count was recorded.
len(components)

94477

In [12]:
import statistics
# Largest and most frequent per-file component count across the whole dataset.
maxi, mode = max(components), statistics.mode(components)
print(f'Maximum n-component for entire dataset: {maxi}\n Most frequent n-component for entire dataset:{mode}')

Maximum n-component for entire dataset: 13
 Most frequent n-component for entire dataset:5


In [13]:
def padZeros(data):
    """Pad ``data`` with all-zero rows until it has ``maxi`` rows.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to pad. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original rows followed by the zero rows, on a
        fresh 0..n-1 index. If ``data`` already has ``maxi`` rows or more, an
        unpadded copy is returned.
    """
    rows_to_add = maxi - len(data)
    if rows_to_add <= 0:
        # Nothing to pad; still hand back a new object, as the original did.
        return data.copy()
    padding = pd.DataFrame(0, index=range(rows_to_add), columns=data.columns)
    return pd.concat([data, padding], ignore_index=True)

def scaleData(data):
    """Standardise every column of ``data`` and replace missing values with 0.

    Returns
    -------
    pandas.DataFrame
        The transformed values, with default integer row and column labels.
    """
    pipeline = make_pipeline(
        StandardScaler(),
        SimpleImputer(strategy='constant', fill_value=0),
    )
    # Fit inside the context manager so warnings raised while fitting are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        transformed = pd.DataFrame(pipeline.fit_transform(data))
    return transformed

def preprocess_data(data, keep_indices=(0, 9, 11, 13, 14, 17, 117, 118, 119, 199, 346, 347, 348)):
    """Reshape a long-format landmark table into one row per frame.

    Every ``face`` landmark whose ``landmark_index`` is not in ``keep_indices``
    is discarded, the ``z`` coordinate and the bookkeeping columns are ignored,
    and the remaining ``x``/``y`` values are pivoted so that each frame becomes
    a single row.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``frame``, ``type``, ``landmark_index``, ``x`` and
        ``y``. Any other column is ignored. The frame is not modified.
    keep_indices : iterable of int, optional
        Face landmark indices to keep. Defaults to the subset this notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``frame``, with columns ``x0, x1, ..., y0, y1, ...``. The
        number is the position of the landmark among the rows kept for that
        frame.
    """
    unwanted_face = (data['type'] == 'face') & ~data['landmark_index'].isin(list(keep_indices))
    # Select by boolean mask instead of dropping by index label, so the result
    # is still correct when the input index contains duplicate labels.
    kept = data.loc[~unwanted_face.to_numpy(), ['frame', 'x', 'y']].copy()
    # Position of each remaining landmark within its frame; becomes the column suffix.
    kept['index'] = kept.groupby('frame').cumcount().to_numpy()
    pivoted = kept.pivot(index='frame', columns='index', values=['x', 'y'])
    pivoted.columns = [f"{coord}{position}" for coord, position in pivoted.columns]
    return pivoted
    

def applyPCA(data):
    """Reduce one raw landmark table to ``maxi`` principal components per frame.

    The table is reshaped with ``preprocess_data``, scaled with ``scaleData``
    and, when it has fewer than ``maxi`` rows, padded with zero rows by
    ``padZeros``. A PCA with ``maxi`` components is then fitted on that frame
    and applied to it.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw landmark table as read from one parquet file.

    Returns
    -------
    pandas.DataFrame
        One row per (possibly padded) frame and ``maxi`` columns.
    """
    scaled_data = scaleData(preprocess_data(data))
    # PCA cannot return more components than there are samples, so short
    # sequences are padded. The original test `len(scaled_data<maxi)` had a
    # misplaced parenthesis and was truthy for every non-empty frame.
    if len(scaled_data) < maxi:
        scaled_data = padZeros(scaled_data)
    pca = PCA(n_components=maxi)
    pca.fit(scaled_data)
    return pd.DataFrame(pca.transform(scaled_data))
    
    
    

In [14]:
input_dir = '/kaggle/input/asl-signs/train_landmark_files'
output_dir = 'pca_files/train_landmark_files'
for directory in tqdm(data):
    # `data` lists every parquet file under /kaggle/input. Only the raw landmark
    # files are converted, so other attached datasets (such as the pca-data
    # dataset read later in this notebook) are never fed to applyPCA.
    if not directory.startswith(input_dir + os.sep):
        continue

    # Mirror the <participant_id>/<sequence_id>.parquet layout below output_dir.
    participant_dir = os.path.basename(os.path.dirname(directory))
    file_name = os.path.basename(directory)
    output_file_path = os.path.join(output_dir, participant_dir, file_name)

    # read the parquet file into a pandas dataframe
    df = pd.read_parquet(directory)

    # reduce the file to its principal components and name the columns z0, z1, ...
    new_data = applyPCA(df)
    new_data.columns = ["z" + str(name) for name in new_data.columns]

    # save the data to a new directory
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    new_data.to_parquet(output_file_path)
    

100%|██████████| 94477/94477 [2:32:04<00:00, 10.35it/s]  


In [None]:
import os
from IPython.display import FileLink

# Navigate to the directory where the output folder is located.
# Note: this changes the working directory for every cell executed afterwards.
os.chdir('/kaggle/working/')

# Zip the output folder (IPython shell escape; relies on the `zip` command-line tool)
!zip -r pca_files1.zip pca_files

# Create a download link for the zipped folder
FileLink(r'pca_files1.zip')

In [89]:
# NOTE(review): this cell's execution count (89) is out of sequence and `pca_files`
# is only assigned in the following cell, so this lookup fails on a fresh
# Restart & Run All. Looks like leftover debugging.
pca_files[948]

'/kaggle/working/pca_files/train_landmark_files/32319/664221861.parquet'

In [4]:
# Gather the full path of every .parquet file beneath the attached pca-data dataset.
pca_files = [
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input/pca-data')
    for fname in fnames
    if fname.endswith('.parquet')
]
len(pca_files)

94477

In [5]:
# Load the training index: one row per sequence with its parquet path,
# participant id, sequence id and sign.
train = pd.read_csv("/kaggle/input/asl-signs/train.csv")
train

Unnamed: 0,path,participant_id,sequence_id,sign
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie
...,...,...,...,...
94472,train_landmark_files/53618/999786174.parquet,53618,999786174,white
94473,train_landmark_files/26734/999799849.parquet,26734,999799849,have
94474,train_landmark_files/25571/999833418.parquet,25571,999833418,flower
94475,train_landmark_files/29302/999895257.parquet,29302,999895257,room


In [6]:
import json
def read_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object."""
    with open(path, "r") as handle:
        return json.loads(handle.read())
# Sign name -> prediction index, as stored in the dataset's JSON map.
s2p_map = read_json(os.path.join("/kaggle/input/asl-signs/sign_to_prediction_index_map.json"))
# Inverse lookup: prediction index -> sign name.
p2s_map = dict((index, sign) for sign, index in s2p_map.items())


def encoder(x):
    """Return the prediction index of sign ``x``, or None if it is unknown."""
    return s2p_map.get(x)


def decoder(x):
    """Return the sign name of prediction index ``x``, or None if it is unknown."""
    return p2s_map.get(x)


# Numeric class label for every training sequence.
train["label"] = train["sign"].map(encoder)

In [7]:
# Drop the columns that are no longer needed now that `label` exists.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
train = train.drop(columns=["participant_id", "sequence_id", "sign"], errors="ignore")
train

Unnamed: 0,path,label
0,train_landmark_files/26734/1000035562.parquet,25
1,train_landmark_files/28656/1000106739.parquet,232
2,train_landmark_files/16069/100015657.parquet,48
3,train_landmark_files/25571/1000210073.parquet,23
4,train_landmark_files/62590/1000240708.parquet,164
...,...,...
94472,train_landmark_files/53618/999786174.parquet,238
94473,train_landmark_files/26734/999799849.parquet,108
94474,train_landmark_files/25571/999833418.parquet,86
94475,train_landmark_files/29302/999895257.parquet,188


In [8]:
def flattenData(path):
    """Read one parquet file and return its fixed-length content as a 1-D array.

    The frame is first brought to exactly 50 rows by ``pad_or_truncate`` and
    then flattened row by row.
    """
    fixed_length = pad_or_truncate(pd.read_parquet(path))
    return fixed_length.to_numpy().flatten()

def pad_or_truncate(data):
    """Return ``data`` with exactly 50 rows.

    Longer frames are cut by ``truncate_start``, shorter ones are extended by
    ``pad_end``, and a frame that already has 50 rows is returned unchanged.
    """
    n_rows = len(data)
    if n_rows == 50:
        return data
    if n_rows > 50:
        return truncate_start(data)
    return pad_end(data)
def pad_end(data, target_len=50):
    """Append all-zero rows to ``data`` until it has ``target_len`` rows.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to pad. It is not modified.
    target_len : int, optional
        Desired number of rows. Defaults to 50, the length used by this notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original rows followed by the zero rows, on a
        fresh 0..n-1 index. If ``data`` already has ``target_len`` rows or
        more, an unpadded copy is returned.
    """
    rows_to_add = target_len - len(data)
    if rows_to_add <= 0:
        return data.copy()
    padding = pd.DataFrame(0, index=range(rows_to_add), columns=data.columns)
    return pd.concat([data, padding], ignore_index=True)

def truncate_start(data):
    """Return the first 50 rows of ``data`` (all rows if there are fewer)."""
    return data.head(50)



In [99]:
# NOTE(review): `z` is not assigned anywhere in the visible notebook and this cell's
# execution count (99) is out of sequence. Looks like leftover debugging.
z.shape

(5, 650)

In [9]:

# Class index of every training sequence.
y = train['label'].values
# One flattened feature vector per sequence, stacked in the same order as `y`.
X = np.stack([
    flattenData("/kaggle/input/pca-data/pca_files/" + relative_path)
    for relative_path in tqdm(train['path'])
])

100%|██████████| 94477/94477 [11:37<00:00, 135.41it/s]


In [10]:
# Sanity check: X and y must have the same number of rows.
print(X.shape)
print(y.shape)

(94477, 650)
(94477,)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Hold out 10% of the sequences for evaluation. The fixed seed makes the split,
# and therefore every metric computed from it, reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# RBF-kernel support vector classifier with a one-vs-rest shaped decision function.
clf = SVC(kernel='rbf', decision_function_shape='ovr')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)





In [None]:
# accuracy_score was called below without being imported, which raised a NameError.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fraction of held-out sequences whose sign was predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)

# Compute the classification report
cr = classification_report(y_test, y_pred)
print('Classification Report:\n', cr)

In [None]:
# `plt`, `sn` and `cf_matrix` were used here without ever being defined in the
# notebook; import the plotting libraries and build the matrix in this cell.
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix

# Order the classes by prediction index so row/column i carries the name of
# class i, rather than relying on the dict's insertion order.
label_ids = sorted(p2s_map)
sign_names = [p2s_map[label_id] for label_id in label_ids]

# Passing `labels` fixes the matrix to one row/column per known class, even if
# a class happens to be missing from the held-out split.
cf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

# Row-normalise. A class with no held-out samples keeps an all-zero row
# instead of producing NaNs from a division by zero.
row_totals = cf_matrix.sum(axis=1, keepdims=True)
normalised = np.divide(
    cf_matrix,
    row_totals,
    out=np.zeros(cf_matrix.shape, dtype=float),
    where=row_totals != 0,
)

df_cm = pd.DataFrame(normalised, index=sign_names, columns=sign_names)
plt.figure(figsize = (250,250))
sn.heatmap(df_cm, annot=True)
plt.savefig('SVM-CM.png')