In [30]:
import spacy 
import nltk
import re
import cv2
import pathlib
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
stopwords_path = '/home/rikathipal/Rikathi/Indo-ML/IndoML_Datathon_2022/stopwords.txt'
extracted_text_folder_path = '/home/rikathipal/Rikathi/Indo-ML/IndoML_Datathon_2022/text_data_validation'
tokenizer_path = '/home/rikathipal/Desktop/models/tokenizer.pickle'

In [38]:
def get_text_from_path(path):
    with open(path) as f:
        lines = f.readlines()
        lines  = ' '.join(lines)
        f.close()
    return lines

def get_corresponding_txtpath(img_path):
    '''path str is the path of the folder containing txt file'''
    path_str = extracted_text_folder_path
    path_str = path_str + '/' + img_path.split('/')[-1].split('.')[0] + '.txt'    
    return path_str

def get_file_paths_and_labels(img_data_root):
    img_paths = [str(path) for path in img_data_root.glob('**/*.tif')]
    text_paths = [get_corresponding_txtpath(str(path)) for path in img_paths]
    return img_paths, text_paths   

In [39]:
#Tokenize, Lemmatize, stopwords removal

def preprocess(text_string):
    preprocessed_string = re.sub(r'[^\w\s]','',text_string)
    preprocessed_string = preprocessed_string.replace('\n',' ')
    preprocessed_string = preprocessed_string.replace('_',' ')
    preprocessed_string = re.sub(' +', ' ', preprocessed_string)
    return preprocessed_string

nlp = spacy.load("en_core_web_sm" )
def get_stopwords(file_path):
    with open(file_path, "r") as fp:
        content = fp.read()
        stops = content.split("\n")
        stops = stops[:-1]
        fp.close()
        return stops

stops = get_stopwords(stopwords_path)

def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)

normalize("counting playing the Home", lowercase=True, remove_stopwords=True)

'count play home'

In [40]:
batch_size = 128
auto = tf.data.experimental.AUTOTUNE

def dataframe_to_dataset(dataframe):
    d = {}
    embed_tensor = []
    for i in dataframe['texts_embedding']:
        embed_tensor.append(tf.convert_to_tensor(i))

    img_data_array = []
    for this_path in dataframe['img_paths']:
        image= cv2.imread(this_path, 0)
        image=cv2.resize(image, (224, 224))
        image=np.array(image)
        image = image.astype(float)
        image = np.stack((image,)*3, axis=-1)
        image = tf.image.per_image_standardization(image)
        img_data_array.append(tf.convert_to_tensor(image))

    d['texts_embedding'] = embed_tensor
    d['img_paths'] = img_data_array

    ds = tf.data.Dataset.from_tensor_slices(d)
    return ds

def preprocess_text_and_image(sample):
    image = sample["img_paths"]
    text = sample['texts_embedding']
    return {"image_inputs": image,  "text_inputs": text}
  

def prepare_dataset(dataframe, training = True):
    ds = dataframe_to_dataset(dataframe)
    if training:
        ds = ds.shuffle(len(df))
    ds = ds.map(lambda x: preprocess_text_and_image(x)).cache()
    ds = ds.batch(batch_size).prefetch(auto)
    return ds 

In [41]:
def test(model_path, data_path):
    # generates CSV contains id and labels  
    img_data_root = pathlib.Path(data_path)
    img_paths , text_paths =  get_file_paths_and_labels(img_data_root); 
    text_data = [get_text_from_path(path) for path in text_paths]
    
    df = pd.DataFrame(list(zip(text_paths, text_data, img_paths)),
               columns =['text_paths','texts', 'img_paths'])
    
    df['texts'] = [preprocess(str(this_text)) for this_text in df['texts']]
    df['texts'] = [normalize(this_text, lowercase=True, remove_stopwords=True) for this_text in df['texts']]
    
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
        
    max_len = 500 
    
    test_sequence = tokenizer.texts_to_sequences(df['texts'])
    test_padded = pad_sequences(test_sequence, maxlen = max_len, truncating = "post", padding = "post" )

    test_tensor = [tf.convert_to_tensor(test_padded[i]) for i in range(test_padded.shape[0])]
    df['texts_embedding'] = test_tensor
    
    test_ds = prepare_dataset(df, False)
    
    loaded_model = keras.models.load_model(model_path)
    
    y_pred = loaded_model.predict(test_ds)
    y_pred = [np.argmax(i) for i in y_pred]
    
    img_ids = [s.split("/")[-1].split('.')[0] for s in df['img_paths']]
    
    submission_df = pd.DataFrame(list(zip(img_ids,y_pred)),columns = ['id','label'])
    return submission_df


In [42]:
model_path = "/home/rikathipal/Desktop/models/our_model.h5"
img_data_path = "/home/rikathipal/Rikathi/Indo-ML/IndoML_Datathon_2022/validation"
submission_df = test(model_path, img_data_path)


save_path = '/home/rikathipal/Desktop/models/submission.csv'
submission_df.to_csv(save_path)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [43]:
submission_df

Unnamed: 0,id,label
0,17888,10
1,17801,4
2,17891,12
3,17802,6
4,17889,12
...,...,...
895,18655,4
896,18656,11
897,18657,6
898,18658,15


# Testing


In [24]:
t_df = pd.read_csv('/home/rikathipal/Desktop/test_df.csv')
t_df = t_df.drop(['Unnamed: 0', 'texts_embedding'], axis = 1)
t_df.head()

Unnamed: 0,text_paths,texts,img_paths
0,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,hemere wearn er 1 haren 24100 nanas enh ae le ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...
1,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,department health human service pilblic heath ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...
2,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,betsy imginnttnen sara 20x8 st aig 20x ad 0,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...
3,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,bagel new yory turspay janc ry ms front page 1...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...
4,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,I 4 eben 2 tes ta wah photocopy complie us cop...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...


In [25]:
model_path = '/home/rikathipal/Desktop/models/our_model.h5'
out = my_test(model_path, t_df)

In [26]:
out

Unnamed: 0,id,label
0,6883,6
1,14094,0
2,9192,8
3,12529,9
4,14662,6
...,...,...
1595,13230,9
1596,13321,15
1597,2646,2
1598,6452,6


In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(test_ds_result, list(out['label']))

0.72875

In [12]:
df.head()

Unnamed: 0,text_paths,texts,img_paths,texts_embedding
0,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,2500023668 6961 le 930 nal shimlzan shit vinid...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(9630, shape=(), dtype=int32), tf.Te..."
1,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,780000347 780000317 site produce bw web,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(9679, shape=(), dtype=int32), tf.Te..."
2,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,persone seeger tee ss 200s 2000 rew atime hoa ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(4622, shape=(), dtype=int32), tf.Te..."
3,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,1820 inhalation sidestream cigarette smoke acc...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(9694, shape=(), dtype=int32), tf.Te..."
4,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,medium release court find nhmrc act improperly...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(426, shape=(), dtype=int32), tf.Ten..."


In [184]:
for i in test_ds:
    tt = i['image_inputs']
    break

2022-10-06 20:01:27.250280: W tensorflow/core/kernels/data/cache_dataset_ops.cc:757] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [185]:
tt

<tf.Tensor: shape=(128, 224, 224, 3), dtype=float64, numpy=
array([[[[ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         ...,
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714]],

        [[ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         ...,
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714]],

        [[ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         ...,
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608714],
         [ 0.17608714,  0.17608714,  0.17608

In [112]:
import keras
from tensorflow.keras.optimizers import Adamax
loaded_model = keras.models.load_model('/home/rikathipal/Desktop/models/g_v_model.h5')
loaded_model.compile(Adamax(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [114]:
y_pred = loaded_model.predict(test_ds)
y_pred = [np.argmax(i) for i in y_pred]
print(y_pred)

[4, 4, 1, 6, 5, 4, 3, 10, 3, 8, 12, 8, 3, 3, 4, 12, 3, 3, 1, 8, 1, 5, 4, 5, 12, 3, 1, 6, 12, 1, 1, 1, 6, 8, 5, 4, 6, 3, 4, 1, 0, 3, 8, 1, 11, 13, 5, 5, 8, 5, 5, 12, 12, 14, 10, 8, 9, 6, 3, 1, 4, 12, 1, 12, 5, 3, 8, 0, 5, 5, 3, 6, 6, 4, 11, 4, 1, 4, 0, 14, 12, 9, 0, 8, 5, 5, 8, 15, 5, 0, 1, 4, 8, 8, 1, 1, 8, 8, 8, 8, 8, 3, 5, 3, 3, 3, 4, 12, 4, 15, 11, 3, 8, 3, 5, 1, 11, 6, 8, 7, 8, 5, 1, 5, 5, 1, 12, 1, 3, 6, 4, 5, 10, 8, 1, 3, 4, 3, 6, 11, 1, 9, 5, 5, 1, 6, 0, 9, 0, 5, 5, 1, 6, 13, 8, 5, 8, 10, 4, 12, 3, 1, 5, 2, 1, 3, 8, 1, 9, 8, 5, 1, 4, 5, 4, 6, 3, 1, 0, 5, 0, 12, 10, 1, 1, 4, 3, 9, 0, 10, 11, 12, 0, 12, 8, 1, 6, 1, 8, 3, 11, 3, 12, 3, 1, 8, 5, 8, 5, 9, 8, 4, 6, 5, 3, 0, 6, 0, 3, 10, 6, 8, 8, 8, 1, 1, 3, 5, 0, 12, 1, 10, 9, 11, 5, 3, 5, 4, 4, 8, 1, 9, 8, 6, 3, 3, 4, 3, 3, 1, 3, 11, 1, 8, 1, 3, 11, 6, 3, 8, 8, 3, 5, 14, 4, 4, 8, 3, 1, 6, 12, 5, 4, 1, 14, 5, 9, 6, 3, 11, 12, 9, 8, 4, 12, 4, 3, 4, 1, 1, 1, 3, 1, 4, 1, 3, 1, 1, 6, 4, 1, 5, 1, 12, 12, 10, 1, 8, 12, 8, 13, 11, 5, 10, 8, 

In [24]:
img_ids = [s.split("/")[-1].split('.')[0] for s in df['img_paths']]
img_ids[0]

'17888'

In [25]:
submission_df = pd.DataFrame(list(zip(img_ids,y_pred)),columns = ['id','label'])
submission_df

Unnamed: 0,id,label
0,17888,12
1,17801,4
2,17891,1
3,17802,6
4,17889,12
...,...,...
895,18655,13
896,18656,12
897,18657,6
898,18658,1


In [26]:
submission_df.to_csv('/home/rikathipal/Desktop/models/submission.csv')

In [None]:
# testings

In [186]:
t_df = pd.read_csv('/home/rikathipal/Rikathi/merged_df.csv')
t_df.head()

Unnamed: 0.1,Unnamed: 0,text_paths,texts,img_paths,data_label
0,0,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,r harritan company inc 2541 space road po box ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,1
1,1,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,81180353,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,8
2,2,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,sgent k group ltd 209e4e9230 052200 844 job 82...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,13
3,3,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,ctrl 43987 1994 congress cell tis9ue research ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,8
4,4,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,gasiasy gyihl lsvoahod tvionvnis 66e vsn slo 3...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,10


In [189]:
t_df['texts']

0        r harritan company inc 2541 space road po box ...
1                                                 81180353
2        sgent k group ltd 209e4e9230 052200 844 job 82...
3        ctrl 43987 1994 congress cell tis9ue research ...
4        gasiasy gyihl lsvoahod tvionvnis 66e vsn slo 3...
                               ...                        
15995    environment international vol 12 9939 1986 pri...
15996    e tabac reunie sa moaernestabasesi abrique di ...
15997                                           f99zelpz0z
15998    einean toon billing instruction wail vour invo...
15999    principal investigatoriprogram director charle...
Name: texts, Length: 16000, dtype: object

In [190]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(t_df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [191]:
test_df.texts = test_df.texts.astype(str)

In [192]:
max_len = 500 
# Fit the tokenizer
tokenizer = Tokenizer(num_words = 65000)
tokenizer.fit_on_texts(test_df['texts'])

# sequence the input corpus and add zero padding upto 500 word
test_sequence = tokenizer.texts_to_sequences(test_df['texts'])
test_padded = pad_sequences(test_sequence, maxlen = max_len, truncating = "post", padding = "post" )


test_tensor = [tf.convert_to_tensor(test_padded[i]) for i in range(test_padded.shape[0])]
test_df['texts_embedding'] = test_tensor 

In [193]:
test_df

Unnamed: 0.1,Unnamed: 0,text_paths,texts,img_paths,data_label,texts_embedding
12507,12507,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,hemere wearn er 1 haren 24100 nanas enh ae le ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,6,"(tf.Tensor(14805, shape=(), dtype=int32), tf.T..."
4564,4564,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,department health human service pilblic heath ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,0,"(tf.Tensor(57, shape=(), dtype=int32), tf.Tens..."
15043,15043,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,betsy imginnttnen sara 20x8 st aig 20x ad 0,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,8,"(tf.Tensor(6079, shape=(), dtype=int32), tf.Te..."
2714,2714,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,bagel new yory turspay janc ry ms front page 1...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,9,"(tf.Tensor(9694, shape=(), dtype=int32), tf.Te..."
5206,5206,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,I 4 eben 2 tes ta wah photocopy complie us cop...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,6,"(tf.Tensor(1, shape=(), dtype=int32), tf.Tenso..."
...,...,...,...,...,...,...
3545,3545,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,jstonauburn I journal 014256 ewtow metropolita...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,9,"(tf.Tensor(54766, shape=(), dtype=int32), tf.T..."
3733,3733,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,philip morris management corp interoffice corr...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,15,"(tf.Tensor(55, shape=(), dtype=int32), tf.Tens..."
7770,7770,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,accord analysis page 1of 1 jaalelelxl el ea pl...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,2,"(tf.Tensor(1303, shape=(), dtype=int32), tf.Te..."
11972,11972,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,1a orkid eben nn ae vggrath ibe lat bc beresio...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,6,"(tf.Tensor(1740, shape=(), dtype=int32), tf.Te..."


In [194]:
test_ds_labels = test_df['data_label']

In [195]:
t_df_wo_label = test_df.drop(['data_label', r'Unnamed: 0'], axis =1 )
t_df_wo_label.head()

Unnamed: 0,text_paths,texts,img_paths,texts_embedding
12507,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,hemere wearn er 1 haren 24100 nanas enh ae le ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(14805, shape=(), dtype=int32), tf.T..."
4564,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,department health human service pilblic heath ...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(57, shape=(), dtype=int32), tf.Tens..."
15043,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,betsy imginnttnen sara 20x8 st aig 20x ad 0,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(6079, shape=(), dtype=int32), tf.Te..."
2714,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,bagel new yory turspay janc ry ms front page 1...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(9694, shape=(), dtype=int32), tf.Te..."
5206,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,I 4 eben 2 tes ta wah photocopy complie us cop...,/home/rikathipal/Rikathi/Indo-ML/IndoML_Datath...,"(tf.Tensor(1, shape=(), dtype=int32), tf.Tenso..."


In [196]:
test_ds = prepare_dataset(t_df_wo_label, False)

In [198]:
for i in test_ds:
    print(i); break

{'image_inputs': <tf.Tensor: shape=(128, 224, 224, 3), dtype=float64, numpy=
array([[[[0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         ...,
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028]],

        [[0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         ...,
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028]],

        [[0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         ...,
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028],
         [0.44550028, 0.44550028, 0.44550028]],

        ...,

        [[0.445

2022-10-06 20:02:23.911680: W tensorflow/core/kernels/data/cache_dataset_ops.cc:757] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [199]:
import keras
from tensorflow.keras.optimizers import Adamax
loaded_model = keras.models.load_model('/home/rikathipal/Desktop/models/g_v_model.h5')
loaded_model.compile(Adamax(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [200]:
y_pred = loaded_model.predict(test_ds)

In [201]:
y_pred = [np.argmax(i) for i in y_pred]
print(y_pred)

[2, 2, 8, 2, 6, 2, 2, 8, 14, 14, 2, 5, 8, 2, 2, 2, 5, 2, 8, 8, 5, 12, 8, 8, 2, 2, 2, 2, 8, 8, 2, 2, 2, 12, 2, 2, 14, 2, 2, 6, 2, 2, 8, 2, 8, 2, 8, 8, 6, 2, 2, 12, 2, 2, 8, 2, 5, 2, 2, 2, 5, 2, 2, 4, 2, 4, 2, 8, 2, 2, 8, 8, 2, 8, 2, 5, 8, 5, 5, 2, 8, 2, 2, 2, 8, 2, 11, 2, 2, 2, 8, 8, 2, 2, 8, 2, 2, 2, 8, 8, 2, 2, 8, 5, 2, 2, 2, 2, 14, 2, 2, 10, 8, 2, 2, 2, 10, 2, 5, 2, 12, 6, 8, 5, 14, 2, 6, 5, 2, 2, 2, 2, 5, 5, 2, 5, 2, 2, 8, 2, 8, 0, 2, 13, 2, 8, 2, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 2, 2, 14, 11, 2, 2, 2, 6, 5, 2, 2, 8, 2, 6, 8, 2, 2, 8, 2, 2, 2, 2, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 5, 2, 8, 2, 5, 14, 2, 2, 8, 5, 2, 2, 2, 2, 8, 2, 5, 8, 2, 2, 2, 2, 2, 2, 14, 6, 2, 5, 5, 2, 8, 14, 5, 5, 8, 8, 2, 8, 8, 2, 2, 9, 2, 8, 5, 2, 8, 12, 5, 2, 13, 13, 2, 2, 12, 2, 2, 6, 2, 14, 14, 8, 2, 2, 8, 10, 5, 2, 2, 2, 2, 5, 5, 5, 2, 5, 8, 8, 5, 2, 6, 12, 2, 8, 2, 10, 2, 2, 2, 2, 2, 14, 14, 8, 2, 8, 2, 2, 5, 14, 12, 2, 2, 2, 4, 2, 2, 14, 12, 2, 2, 5, 2, 11, 5, 2, 13, 8, 8, 2, 8, 2, 13, 2, 2, 2, 0, 2, 2, 2,

In [203]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
accuracy_score(test_ds_labels, y_pred)

0.13625