## Save Training Set and Testing Set
Clean the dataset and seperate into training and testing sets. The testing dataset would be used to evalute all models, ensure consistency.

### 1. Loading the Data

In [None]:
import os

# Change to desired directory where pickled file of sliced dataset are saved
os.chdir('/save/to/path')

# Confirm it's changed
print("Current directory:", os.getcwd())

In [2]:
import pickle

with open('./dfwords_0_20000.pkl', 'rb') as file:
    loaded_dfwords1 = pickle.load(file)

with open('./dfwords_20000_40000.pkl', 'rb') as file:
    loaded_dfwords2 = pickle.load(file)

with open('./dfwords_40000_60000.pkl', 'rb') as file:
    loaded_dfwords3 = pickle.load(file)

with open('./dfwords_60000_80000.pkl', 'rb') as file:
    loaded_dfwords4 = pickle.load(file)

with open('./dfwords_80000_100000.pkl', 'rb') as file:
    loaded_dfwords5 = pickle.load(file)

with open('./dfwords_100000_120000.pkl', 'rb') as file:
    loaded_dfwords6 = pickle.load(file)

with open('./dfwords_120000_140000.pkl', 'rb') as file:
    loaded_dfwords7 = pickle.load(file)

with open('./dfwords_140000_160000.pkl', 'rb') as file:
    loaded_dfwords8 = pickle.load(file)

with open('./dfwords_160000_180000.pkl', 'rb') as file:
    loaded_dfwords9 = pickle.load(file)

with open('./dfwords_180000_200000.pkl', 'rb') as file:
    loaded_dfwords10 = pickle.load(file)

with open('./dfwords_200000_227055.pkl', 'rb') as file:
    loaded_dfwords11 = pickle.load(file)

In [3]:
import pandas as pd

loaded_dfwords = pd.concat([loaded_dfwords1, loaded_dfwords2], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords3], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords4], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords5], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords6], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords7], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords8], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords9], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords10], ignore_index=True)  # vertical stack
loaded_dfwords = pd.concat([loaded_dfwords, loaded_dfwords11], ignore_index=True)  # vertical stack

In [4]:
loaded_dfwords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227055 entries, 0 to 227054
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      227055 non-null  object
 1   text    227055 non-null  object
 2   image   227055 non-null  object
dtypes: object(3)
memory usage: 5.2+ MB


### 2. Cleaning the Data

In [5]:
import matplotlib.pyplot as plt

def show_image(row):
    plt.imshow(loaded_dfwords.iloc[row,2])
    plt.show()

In [6]:
import re

# Patter for all Special characters
special_char_pattern = r'[^a-zA-Z0-9\s]'  # Matches anything not alphanumeric or whitespace

# Select rows with special characters
special_char_rows = loaded_dfwords[loaded_dfwords['text'].str.contains(special_char_pattern, regex=True, na=False)]

In [7]:
special_char_rows.head()

Unnamed: 0,id,text,image
1,EHXbhtL,↳,"[[[187, 166, 139], [188, 167, 140], [190, 169,..."
3,EHXbhtL,↳,"[[[180, 161, 134], [181, 162, 135], [184, 163,..."
4,EHXbhtL,.,"[[[189, 167, 142], [189, 167, 142], [189, 167,..."
5,EHXbhtL,-bey,"[[[159, 144, 118], [159, 144, 118], [159, 144,..."
6,EHXbhtL,.,"[[[165, 151, 123], [164, 150, 122], [162, 148,..."


In [8]:
print(special_char_rows.loc[special_char_rows['id'] == 'A3b6IRB',])

            id text                                              image
17743  A3b6IRB   #2  [[[231, 222, 209], [230, 221, 208], [230, 220,...
17747  A3b6IRB   W/  [[[250, 232, 214], [251, 234, 215], [252, 235,...


In [9]:
print(loaded_dfwords.loc[loaded_dfwords['id'] == 'A3b6IRB',])

            id     text                                              image
17742  A3b6IRB  SURGERY  [[[227, 220, 209], [229, 220, 210], [230, 221,...
17743  A3b6IRB       #2  [[[231, 222, 209], [230, 221, 208], [230, 220,...
17744  A3b6IRB     N2O2  [[[249, 235, 213], [248, 234, 212], [248, 234,...
17745  A3b6IRB    CRASH  [[[102, 93, 133], [96, 87, 125], [91, 77, 118]...
17746  A3b6IRB     CART  [[[72, 68, 113], [72, 69, 109], [81, 76, 110],...
17747  A3b6IRB       W/  [[[250, 232, 214], [251, 234, 215], [252, 235,...
17748  A3b6IRB   SHARPS  [[[251, 233, 216], [251, 233, 216], [250, 232,...
17749  A3b6IRB    WHEEL  [[[170, 168, 192], [128, 126, 156], [112, 109,...
17750  A3b6IRB   CHAIRS  [[[245, 230, 214], [243, 227, 214], [242, 226,...
17751  A3b6IRB      SED  [[[121, 117, 154], [113, 109, 149], [103, 100,...
17752  A3b6IRB     ATED  [[[204, 199, 210], [188, 183, 204], [149, 145,...
17753  A3b6IRB  PATIENT  [[[158, 148, 174], [188, 176, 191], [196, 185,...
17754  A3b6IRB     EXIT  

In [10]:
allowed_pattern = r'^[\w\s\.,!?;:\-+*/=()\[\]{}<>@#\$%^&_\'"\t\n]+$'
mask = ~loaded_dfwords['text'].str.contains(allowed_pattern, regex=True)
non_standard_rows = loaded_dfwords[mask]

In [11]:
non_standard_rows.head()

Unnamed: 0,id,text,image
1,EHXbhtL,↳,"[[[187, 166, 139], [188, 167, 140], [190, 169,..."
3,EHXbhtL,↳,"[[[180, 161, 134], [181, 162, 135], [184, 163,..."
159,fbIRZCU,✓,"[[[178, 171, 162], [178, 171, 162], [178, 171,..."
448,17GNoPL,→,"[[[162, 188, 164], [162, 188, 164], [160, 186,..."
583,duRozwV,€,"[[[107, 107, 116], [109, 109, 118], [113, 115,..."


In [12]:
print(non_standard_rows.loc[non_standard_rows['id'] == 'sNpIWnz',])
print("\n\n\n")
print(non_standard_rows.loc[non_standard_rows['id'] == 'hhk8nvy',])

            id    text                                              image
17848  sNpIWnz    ·USA  [[[10, 9, 18], [9, 9, 18], [8, 10, 18], [10, 1...
17854  sNpIWnz  Japan|  [[[155, 153, 128], [166, 164, 138], [174, 175,...
17855  sNpIWnz  Japan|  [[[150, 153, 127], [128, 131, 106], [159, 162,...




            id          text  \
17949  hhk8nvy  4\/22\/2016.   

                                                   image  
17949  [[[169, 164, 165], [166, 164, 164], [166, 164,...  


In [13]:
mask = loaded_dfwords['text'].str.contains(r'\\', regex=True)
check_rows= loaded_dfwords[mask]

In [14]:
loaded_dfwords['text'] = loaded_dfwords['text'].str.replace('\\/', '/', regex=False)

In [15]:
mask = ~loaded_dfwords['text'].str.contains(allowed_pattern, regex=True)
non_standard_rows2 = loaded_dfwords[mask]

In [16]:
print("Words with special character:", len(non_standard_rows2), ", Percentage: ", len(non_standard_rows2)/len(loaded_dfwords))
print("Images with special charatcer:", len(non_standard_rows2['id'].unique()), ", Percentage: ", len(non_standard_rows2['id'].unique())/len(loaded_dfwords['id'].unique()))

Words with special character: 2004 , Percentage:  0.008826055361035872
Images with special charatcer: 565 , Percentage:  0.07076653306613226


In [17]:
loaded_dfwords=loaded_dfwords[~mask]

In [18]:
print("total number of words", len(loaded_dfwords))
print("total number of images", len(loaded_dfwords['id'].unique()))

total number of words 225051
total number of images 7977


In [19]:
# confirm there is no special characters
count_matching = loaded_dfwords['text'].str.contains(allowed_pattern, regex=True, na=False).sum()
print(f"Number of rows with allowed characters: {count_matching}")

Number of rows with allowed characters: 225051


In [20]:
pattern = r'^[^a-zA-Z0-9]+$'  # Matches strings with no alphanumeric chars at all
non_alnum_rows = loaded_dfwords[loaded_dfwords['text'].str.contains(pattern, regex=True, na=False)]

In [21]:
print("total number of words", len(loaded_dfwords))
print("total number of images", len(loaded_dfwords['id'].unique()))

total number of words 225051
total number of images 7977


In [22]:
non_alnum_rows.head(20)

Unnamed: 0,id,text,image
4,EHXbhtL,.,"[[[189, 167, 142], [189, 167, 142], [189, 167,..."
6,EHXbhtL,.,"[[[165, 151, 123], [164, 150, 122], [162, 148,..."
7,EHXbhtL,.,"[[[194, 170, 140], [192, 168, 138], [191, 167,..."
46,O91MhJ1,=>,"[[[135, 146, 148], [136, 148, 149], [137, 148,..."
68,O91MhJ1,+,"[[[86, 97, 143], [86, 96, 143], [83, 93, 139],..."
69,O91MhJ1,-,"[[[175, 173, 173], [173, 171, 171], [172, 170,..."
72,O91MhJ1,+,"[[[65, 77, 124], [96, 105, 148], [128, 137, 17..."
73,O91MhJ1,-,"[[[173, 167, 169], [173, 167, 168], [175, 170,..."
74,O91MhJ1,.,"[[[62, 90, 16], [69, 94, 21], [71, 96, 23], [7..."
75,O91MhJ1,.,"[[[65, 87, 14], [69, 92, 18], [67, 92, 16], [5..."


In [23]:
only_period_rows= loaded_dfwords[loaded_dfwords['text'] == '.']

In [24]:
loaded_dfwords = loaded_dfwords[loaded_dfwords['text'] != '.']

In [25]:
print("Words with only one period:", len(only_period_rows), ", Percentage: ", len(only_period_rows)/len(loaded_dfwords))
print("Images with special charatcer:", len(only_period_rows['id'].unique()), ", Percentage: ", len(only_period_rows['id'].unique())/len(loaded_dfwords['id'].unique()))

Words with only one period: 19567 , Percentage:  0.09522395904303985
Images with special charatcer: 3762 , Percentage:  0.48058252427184467


In [26]:
# check other rows that have ony characters
pattern = r'^[^a-zA-Z0-9]+$'  # Matches strings with no alphanumeric chars at all
non_alnum_rows2 = loaded_dfwords[loaded_dfwords['text'].str.contains(pattern, regex=True, na=False)]

In [27]:
non_alnum_rows2.head()

Unnamed: 0,id,text,image
46,O91MhJ1,=>,"[[[135, 146, 148], [136, 148, 149], [137, 148,..."
68,O91MhJ1,+,"[[[86, 97, 143], [86, 96, 143], [83, 93, 139],..."
69,O91MhJ1,-,"[[[175, 173, 173], [173, 171, 171], [172, 170,..."
72,O91MhJ1,+,"[[[65, 77, 124], [96, 105, 148], [128, 137, 17..."
73,O91MhJ1,-,"[[[173, 167, 169], [173, 167, 168], [175, 170,..."


### 3. Splitting the Data into Training and Testing Subsets

In [28]:
import numpy as np

# Get unique groups
unique_images = loaded_dfwords['id'].unique()


# Randomly select 10% for test 
np.random.seed(42)
test_images = np.random.choice(unique_images, 
                              size=int(len(unique_images)*0.2), 
                              replace=False)

In [29]:
test_df = loaded_dfwords[loaded_dfwords['id'].isin(test_images)]
training_df = loaded_dfwords[~loaded_dfwords['id'].isin(test_images)]

In [30]:
print("Words in Train Dataset:", len(training_df), ", Percentage: ", len(training_df)/len(loaded_dfwords))
print("Images in Train Dataset:", len(training_df['id'].unique()), ", Percentage: ", len(training_df['id'].unique())/len(loaded_dfwords['id'].unique()))

Words in Train Dataset: 165084 , Percentage:  0.8033910182787954
Images in Train Dataset: 6263 , Percentage:  0.8000766479305059


In [31]:
print("Words in Test Dataset:", len(test_df), ", Percentage: ", len(test_df)/len(loaded_dfwords))
print("Images in Test Dataset:", len(test_df['id'].unique()), ", Percentage: ", len(test_df['id'].unique())/len(loaded_dfwords['id'].unique()))

Words in Test Dataset: 40400 , Percentage:  0.19660898172120458
Images in Test Dataset: 1565 , Percentage:  0.19992335206949413


In [32]:
print("total number of words", len(loaded_dfwords))
print("total number of images", len(loaded_dfwords['id'].unique()))

total number of words 205484
total number of images 7828


### 4. Save Dataset

In [36]:
df_train_info = training_df.drop(columns=['image']).reset_index().rename(columns={'index': 'word_idx'})

In [37]:
df_test_info = test_df.drop(columns=['image']).reset_index().rename(columns={'index': 'word_idx'})

In [38]:
df_train_info.head()

Unnamed: 0,word_idx,id,text
0,2,EHXbhtL,co
1,5,EHXbhtL,-bey
2,8,O91MhJ1,NORTON'S
3,9,O91MhJ1,THEORY
4,10,O91MhJ1,A


In [39]:
df_test_info.head()

Unnamed: 0,word_idx,id,text
0,0,7u2pNft,4
1,274,FXwDU08,Funk
2,275,FXwDU08,adelic
3,276,FXwDU08,or
4,277,FXwDU08,Brain


In [41]:
df_train_info.to_csv('dataset_info/df_train_info.csv', index=False)  

In [42]:
df_test_info.to_csv('dataset_info/df_test_info.csv', index=False) 