# **INSTALLING THE REQUIRED LIBRARIES**

In [3]:
!nvidia-smi

Sun Apr 21 19:47:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0              26W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.29.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [8]:
import torch

if torch.cuda.is_available():
    # Print the current device
    device = torch.device("cuda")
    print("Current Device:", device)

    num_gpus = torch.cuda.device_count()

    for i in range(num_gpus):
        print("Device {}: {}".format(i, torch.cuda.get_device_name(i)))
else:
    print("CUDA is not available. Using CPU.")

Current Device: cuda
Device 0: Tesla P100-PCIE-16GB


# **IMPORTING THE REQUIRED LIBRARIES**

In [9]:
import os
import re
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, KFold
import neattext.functions as nfx
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, InputExample, losses, SentencesDataset

In [10]:
dir(nfx)

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextFrame',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__generate_text',
 '__loader__',
 '__name__',
 '__numbers_dict',
 '__package__',
 '__spec__',
 '_lex_richness_herdan',
 '_lex_richness_maas_ttr',
 'clean_text',
 'defaultdict',
 'digit2words',
 'extract_btc_address',
 'extract_currencies',
 'extract_currency_symbols',
 'extract_dates',
 'extract_emails',
 'extract_emojis',
 'extract_hashtags',
 'extract_html_tags',
 'extract_mastercard_addr',
 'extract_md5sha',
 'extract_numbers',
 'extr

In [11]:
EPOCHS = 10
BATCH_SIZE = 32
MAX_LEN = 280

# **DATA LOADING**

In [12]:
train_data = pd.read_csv('/kaggle/input/smm4h-2024-task5-classification-of-tweets/SMM4H-2024-Task5-Training.tsv', sep="\t")
train_data

Unnamed: 0,tweet_id,text,label
0,1364778927105933315,"Psych Med Twitter, wondering if you can help. ...",0
1,1130719434526511104,@dmx_biographer @HellaChillAF I wouldn’t fuck ...,0
2,1184221813733253120,Harley is autistic.... she has the mental capa...,1
3,1435958387020341248,I just made my son cry before going to school ...,1
4,1112559025927999488,Yooooooo my dad really just said “I’d rather h...,0
...,...,...,...
7393,1232778727186223108,Just got all kinds of dirty looks for ordering...,1
7394,1174782065532780544,"anyways- if they did cause autism, i’d rather ...",0
7395,1052612940438728704,2/4 trouble at work and I could have lost my j...,0
7396,1243556662964862976,@LawyerChamber $LgiLder22 I have severe asthma...,1


In [13]:
print("The Shape of the Train Data : ",train_data.shape)

The Shape of the Train Data :  (7398, 3)


In [14]:
train_data['label'].value_counts()

label
0    5118
1    2280
Name: count, dtype: int64

In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7398 entries, 0 to 7397
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  7398 non-null   int64 
 1   text      7398 non-null   object
 2   label     7398 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 173.5+ KB


In [16]:
valid_data = pd.read_csv('/kaggle/input/smm4h-2024-task5-classification-of-tweets/SMM4H-2024-Task5-Validation.tsv', sep='\t')
valid_data

Unnamed: 0,tweet_id,text,label
0,1265323726570225669,Mom Finds Secret to Homeschooling her Autistic...,0
1,1254180961622900738,I hope our child has my calmness and NOT Andre...,0
2,800336385399885824,Blind Girl With Autism Mimics Whitney Houston ...,0
3,1107948318150152192,"@amyschumer watching #growing, my 2yo son has ...",1
4,1438320313511473154,@robbystarbuck @AmericanAir would rather a mot...,0
...,...,...,...
384,1479160193808273413,my 7yo is being tested for adhd and struggles ...,0
385,1084086161667252225,@attachedmrsL the first reply on the tweet tho...,0
386,1404612356819374080,There's a mom who recorded her autistic son &a...,0
387,1458807608052174856,@itsadollthxng Lmfao girl !!! I’m upset fr cau...,1


In [17]:
print("The Shape of the Validation Data is :",valid_data.shape)

The Shape of the Validation Data is : (389, 3)


In [18]:
valid_data['label'].value_counts()

label
0    254
1    135
Name: count, dtype: int64

In [19]:
valid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389 entries, 0 to 388
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  389 non-null    int64 
 1   text      389 non-null    object
 2   label     389 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 9.2+ KB


# **DATA PREPROCESSING**

We see the following anomolies in the tweets
1) There are emojis used in the tweets. </br>
2) The label count is not equal. </br>
3) Use of Hastags. </br>
4) Use of attherate symbol (Tag other users). </br>
5) Use of contractions (like yo-year old, can't - cannot, won't-wont etc...). but roberta is generally robust to contractions so no need of it.</br>

In [20]:
train_data['tweet'] = train_data['text'].apply(nfx.remove_userhandles)
valid_data['tweet'] = valid_data['text'].apply(nfx.remove_userhandles)
print(train_data[:1])
print(valid_data[:1])

              tweet_id                                               text  \
0  1364778927105933315  Psych Med Twitter, wondering if you can help. ...   

   label                                              tweet  
0      0  Psych Med Twitter, wondering if you can help. ...  
              tweet_id                                               text  \
0  1265323726570225669  Mom Finds Secret to Homeschooling her Autistic...   

   label                                              tweet  
0      0  Mom Finds Secret to Homeschooling her Autistic...  


In [21]:
train_data['tweet'] = train_data['tweet'].apply(nfx.remove_hashtags)
valid_data['tweet'] = valid_data['tweet'].apply(nfx.remove_hashtags)
print(train_data[:1])
print(valid_data[:1])

              tweet_id                                               text  \
0  1364778927105933315  Psych Med Twitter, wondering if you can help. ...   

   label                                              tweet  
0      0  Psych Med Twitter, wondering if you can help. ...  
              tweet_id                                               text  \
0  1265323726570225669  Mom Finds Secret to Homeschooling her Autistic...   

   label                                              tweet  
0      0  Mom Finds Secret to Homeschooling her Autistic...  


In [22]:
train_data['tweet'] = train_data['tweet'].apply(nfx.remove_emojis)
valid_data['tweet'] = valid_data['tweet'].apply(nfx.remove_emojis)
print(train_data[:1])
print(valid_data[:1])

              tweet_id                                               text  \
0  1364778927105933315  Psych Med Twitter, wondering if you can help. ...   

   label                                              tweet  
0      0  Psych Med Twitter, wondering if you can help. ...  
              tweet_id                                               text  \
0  1265323726570225669  Mom Finds Secret to Homeschooling her Autistic...   

   label                                              tweet  
0      0  Mom Finds Secret to Homeschooling her Autistic...  


In [23]:
train_data['tweet'] = train_data['tweet'].apply(nfx.remove_urls)
valid_data['tweet'] = valid_data['tweet'].apply(nfx.remove_urls)
print(train_data[:1])
print(valid_data[:1])

              tweet_id                                               text  \
0  1364778927105933315  Psych Med Twitter, wondering if you can help. ...   

   label                                              tweet  
0      0  Psych Med Twitter, wondering if you can help. ...  
              tweet_id                                               text  \
0  1265323726570225669  Mom Finds Secret to Homeschooling her Autistic...   

   label                                              tweet  
0      0  Mom Finds Secret to Homeschooling her Autistic...  


In [24]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    # Remove emojis using the pattern
    return emoji_pattern.sub(r'', text)

In [25]:
train_data['tweet'] = train_data['tweet'].apply(remove_emoji, '')
valid_data['tweet'] = valid_data['tweet'].apply(remove_emoji, '')
print(train_data[:1])
print(valid_data[:1])

              tweet_id                                               text  \
0  1364778927105933315  Psych Med Twitter, wondering if you can help. ...   

   label                                              tweet  
0      0  Psych Med Twitter, wondering if you can help. ...  
              tweet_id                                               text  \
0  1265323726570225669  Mom Finds Secret to Homeschooling her Autistic...   

   label                                              tweet  
0      0  Mom Finds Secret to Homeschooling her Autistic...  


  train_data['tweet'] = train_data['tweet'].apply(remove_emoji, '')
  valid_data['tweet'] = valid_data['tweet'].apply(remove_emoji, '')


In [26]:
url_pattern = r'https:\/\/t\.co\/\w+'

train_data['tweet'] = train_data['tweet'].apply(lambda x: re.sub(url_pattern, '', x))
valid_data['tweet'] = valid_data['tweet'].apply(lambda x: re.sub(url_pattern, '', x))

In [27]:
train_data = train_data.drop(['text'], axis=1)
valid_data = valid_data.drop(['text'], axis=1)
print(train_data[:1])
print(valid_data[:1])

              tweet_id  label  \
0  1364778927105933315      0   

                                               tweet  
0  Psych Med Twitter, wondering if you can help. ...  
              tweet_id  label  \
0  1265323726570225669      0   

                                               tweet  
0  Mom Finds Secret to Homeschooling her Autistic...  


In [31]:
train_data.reset_index(drop=True, inplace=True)
valid_data.reset_index(drop=True, inplace=True)

In [32]:
train_data.head(5)

Unnamed: 0,tweet_id,label,tweet
0,1364778927105933315,0,"Psych Med Twitter, wondering if you can help. ..."
1,1130719434526511104,0,I wouldn’t fuck rob with YOUR dick. He’s a...
2,1184221813733253120,1,Harley is autistic.... she has the mental capa...
3,1435958387020341248,1,I just made my son cry before going to school ...
4,1112559025927999488,0,Yooooooo my dad really just said “I’d rather h...


In [33]:
valid_data.head(5)

Unnamed: 0,tweet_id,label,tweet
0,1265323726570225669,0,Mom Finds Secret to Homeschooling her Autistic...
1,1254180961622900738,0,I hope our child has my calmness and NOT Andre...
2,800336385399885824,0,Blind Girl With Autism Mimics Whitney Houston ...
3,1107948318150152192,1,"watching my 2yo son has ASD, you just made..."
4,1438320313511473154,0,would rather a mother suffocate an asthmat...


In [34]:
new_train_data = pd.concat([train_data, valid_data], axis=0)
new_train_data.head(5)

Unnamed: 0,tweet_id,label,tweet
0,1364778927105933315,0,"Psych Med Twitter, wondering if you can help. ..."
1,1130719434526511104,0,I wouldn’t fuck rob with YOUR dick. He’s a...
2,1184221813733253120,1,Harley is autistic.... she has the mental capa...
3,1435958387020341248,1,I just made my son cry before going to school ...
4,1112559025927999488,0,Yooooooo my dad really just said “I’d rather h...


In [35]:
new_train_data.tail(5)

Unnamed: 0,tweet_id,label,tweet
384,1479160193808273413,0,my 7yo is being tested for adhd and struggles ...
385,1084086161667252225,0,"the first reply on the tweet though yes, I ..."
386,1404612356819374080,0,There's a mom who recorded her autistic son &a...
387,1458807608052174856,1,Lmfao girl !!! I’m upset fr cause jacelyn ha...
388,1299153969009184768,1,I have a child that faces more adver...


In [36]:
new_train_data['label'].value_counts()

label
0    5372
1    2415
Name: count, dtype: int64

In [37]:
print("The shape of the new train data is : ",new_train_data.shape)

The shape of the new train data is :  (7787, 3)


In [38]:
new_train_data.reset_index(drop=True, inplace=True)

In [39]:
new_train_data.head(5)

Unnamed: 0,tweet_id,label,tweet
0,1364778927105933315,0,"Psych Med Twitter, wondering if you can help. ..."
1,1130719434526511104,0,I wouldn’t fuck rob with YOUR dick. He’s a...
2,1184221813733253120,1,Harley is autistic.... she has the mental capa...
3,1435958387020341248,1,I just made my son cry before going to school ...
4,1112559025927999488,0,Yooooooo my dad really just said “I’d rather h...


In [40]:
new_train_data.tail(5)

Unnamed: 0,tweet_id,label,tweet
7782,1479160193808273413,0,my 7yo is being tested for adhd and struggles ...
7783,1084086161667252225,0,"the first reply on the tweet though yes, I ..."
7784,1404612356819374080,0,There's a mom who recorded her autistic son &a...
7785,1458807608052174856,1,Lmfao girl !!! I’m upset fr cause jacelyn ha...
7786,1299153969009184768,1,I have a child that faces more adver...


In [41]:
undersampler = RandomUnderSampler(random_state=42)

X_resampled, Y_resampled = undersampler.fit_resample(new_train_data[['tweet_id','tweet']], new_train_data['label'])
new_train_data = pd.DataFrame(X_resampled, columns=['tweet_id','tweet'])
new_train_data['label'] = Y_resampled

print(new_train_data['label'].value_counts())

label
0    2415
1    2415
Name: count, dtype: int64


In [42]:
new_train_data.reset_index(drop=True, inplace=True)

# **TRAIN THE MODEL**

In [43]:
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [44]:
train_tweets = new_train_data['tweet'].tolist()
train_labels = new_train_data['label'].tolist()

In [45]:
tweet_embeddings = model.encode(train_tweets)
print("The shape of the embeddings ",tweet_embeddings.shape)

Batches:   0%|          | 0/151 [00:00<?, ?it/s]

The shape of the embeddings  (4830, 1024)


In [46]:
tweet_embeddings[0]

array([-0.06761356,  0.018038  ,  0.00926388, ..., -0.02568004,
       -0.00102117, -0.01665532], dtype=float32)

In [47]:
train_embeddings_tensor = torch.tensor(tweet_embeddings, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.float32)

In [48]:
train_labels_tensor = train_labels_tensor.unsqueeze(1)

In [49]:
input_size = 1024

In [149]:
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, 16)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(16, 2)
        self.dropout3 = nn.Dropout(0.2)
        self.fc4 = nn.Linear(2, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc3(x))
        x = self.dropout3(x)
        x = torch.sigmoid(self.fc4(x))
        return x

In [150]:
model_1 = SimpleNN(input_size)
model_1.train()

SimpleNN(
  (fc1): Linear(in_features=1024, out_features=128, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=128, out_features=16, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=16, out_features=2, bias=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (fc4): Linear(in_features=2, out_features=1, bias=True)
)

In [151]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model_1.parameters(), lr=0.01)

In [152]:
for epoch in range(EPOCHS*30):
    outputs = model_1(train_embeddings_tensor)
    loss = criterion(outputs, train_labels_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{EPOCHS*30}], Loss: {loss.item():.4f}')

Epoch [10/300], Loss: 0.6814
Epoch [20/300], Loss: 0.6061
Epoch [30/300], Loss: 0.5762
Epoch [40/300], Loss: 0.5199
Epoch [50/300], Loss: 0.4708
Epoch [60/300], Loss: 0.4103
Epoch [70/300], Loss: 0.3424
Epoch [80/300], Loss: 0.2951
Epoch [90/300], Loss: 0.2445
Epoch [100/300], Loss: 0.2061
Epoch [110/300], Loss: 0.1914
Epoch [120/300], Loss: 0.1728
Epoch [130/300], Loss: 0.1744
Epoch [140/300], Loss: 0.1585
Epoch [150/300], Loss: 0.1589
Epoch [160/300], Loss: 0.1504
Epoch [170/300], Loss: 0.1501
Epoch [180/300], Loss: 0.1625
Epoch [190/300], Loss: 0.1481
Epoch [200/300], Loss: 0.1554
Epoch [210/300], Loss: 0.1493
Epoch [220/300], Loss: 0.1527
Epoch [230/300], Loss: 0.1431
Epoch [240/300], Loss: 0.1474
Epoch [250/300], Loss: 0.1420
Epoch [260/300], Loss: 0.1552
Epoch [270/300], Loss: 0.1458
Epoch [280/300], Loss: 0.1386
Epoch [290/300], Loss: 0.1501
Epoch [300/300], Loss: 0.1543


In [71]:
val_tweets = valid_data['tweet'].tolist()
val_labels = valid_data['label'].tolist()

In [59]:
tweet_embeddings_valid = model.encode(val_tweets)
print("The shape of the embeddings ",tweet_embeddings_valid.shape)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

The shape of the embeddings  (389, 1024)


In [60]:
tweet_embeddings_valid[0]

array([ 0.00342326,  0.01814297, -0.03914887, ..., -0.04085455,
       -0.07115684,  0.01646657], dtype=float32)

In [62]:
valid_embeddings_tensor = torch.tensor(tweet_embeddings_valid, dtype=torch.float32)
valid_labels_tensor = torch.tensor(val_labels, dtype=torch.float32)

In [63]:
valid_labels_tensor = valid_labels_tensor.unsqueeze(1)

In [64]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [78]:
def evaluate_model(model, inputs, targets):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        outputs = model(inputs)
        predictions = torch.round(outputs).squeeze()
    
    f1 = f1_score(targets, predictions)
    precision = precision_score(targets, predictions)
    recall = recall_score(targets, predictions)
    
    return f1, precision, recall

In [153]:
f1, precision, recall = evaluate_model(model_1, valid_embeddings_tensor, valid_labels_tensor)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

F1 Score: 0.9375
Precision: 0.8823529411764706
Recall: 1.0


# **TEST DATA**

In [155]:
data = pd.read_csv("/kaggle/input/testdatasame/SMM4H-2024-Task5-Test-Unlabeled.tsv", sep="\t")
data

Unnamed: 0,tweet_id,text
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...
1,1319141585666400257,"""Is He / She Distracted? Considerations When D..."
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...
3,841289449124294656,It ain't easy but I will always be there for m...
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...
...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...
9996,1473120270118445064,"Husband works from home, my son has asthma and..."
9997,917699673929986048,Should we give different media guidance for yo...
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar..."


In [156]:
data['tweet'] = data['text'].apply(nfx.remove_userhandles)
data['tweet'] = data['tweet'].apply(nfx.remove_hashtags)
data['tweet'] = data['tweet'].apply(nfx.remove_emojis)
data['tweet'] = data['tweet'].apply(nfx.remove_urls)
data['tweet'] = data['tweet'].apply(remove_emoji,'')
data = data.drop(['text'], axis=1)
data.reset_index(drop=True, inplace=True)

  data['tweet'] = data['tweet'].apply(remove_emoji,'')


In [157]:
url_pattern = r'https:\/\/t\.co\/\w+'

data['tweet'] = data['tweet'].apply(lambda x: re.sub(url_pattern, '', x))

In [158]:
data.reset_index(drop=True, inplace=True)

In [159]:
data.head(5)

Unnamed: 0,tweet_id,tweet
0,1266009978743160832,My 4 year old daughter has autism and I woul...
1,1319141585666400257,"""Is He / She Distracted? Considerations When D..."
2,1321605464644296705,Apparently it wasn’t about politics at all. ...
3,841289449124294656,It ain't easy but I will always be there for m...
4,1174713585135734784,Way to go !!! Big congratulations to you. ...


In [160]:
data.tail(5)

Unnamed: 0,tweet_id,tweet
9995,1350119143333355521,love a takeaway this evening after a day h...
9996,1473120270118445064,"Husband works from home, my son has asthma and..."
9997,917699673929986048,Should we give different media guidance for yo...
9998,1289241342921474049,"Honestly, we are so happy with this thread..."
9999,1187237163382886400,Teach your children to respect others cos one ...


In [161]:
tweets = data.tweet

In [162]:
tweet_embeddings_test = model.encode(tweets)
print("The shape of the embeddings ",tweet_embeddings_test.shape)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

The shape of the embeddings  (10000, 1024)


In [163]:
tweet_embeddings_test[0]

array([-0.01299513,  0.03189591, -0.00862608, ..., -0.04800828,
       -0.02486485,  0.04094813], dtype=float32)

In [164]:
test_embeddings_tensor = torch.tensor(tweet_embeddings_test, dtype=torch.float32)

In [165]:
def evaluate_model_2(model, inputs):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        outputs = model(inputs)
        predictions = torch.round(outputs).squeeze()
    return predictions

In [166]:
predictions = evaluate_model_2(model_1, test_embeddings_tensor)

In [167]:
predictions

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [170]:
df = pd.DataFrame()

In [171]:
df = pd.DataFrame({'tweet_id': data['tweet_id'], 'label': predictions})
df.to_csv("prediction_task5.tsv", sep="\t", index=False)