In [None]:
!pip install transformers
!pip install torch

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.9 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.nn import BCEWithLogitsLoss
from transformers import AdamW
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split



In [None]:
data = pd.read_csv('train_val.csv')

# Clean the tweet data: drop every character that is neither a word character
# nor whitespace, then lowercase.
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
data['tweet'] = data['tweet'].str.replace(r'[^\w\s]', '', regex=True)
data['tweet'] = data['tweet'].str.lower()

# Label vocabulary. The order of this list is the order of the indicator columns
# built below (and therefore of `data[classes]`).
classes = ['unnecessary', 'mandatory', 'pharma', 'conspiracy', 'political', 'country', 'rushed', 'ingredients',
           'side-effect', 'ineffective', 'religious', 'none']

# `labels` holds whitespace-separated class names. Split each row once, then
# derive one 0/1 column per class with a vectorised membership test instead of
# writing cell by cell inside an `iterrows()` loop. Tokens that are not in
# `classes` are ignored, exactly as before.
label_sets = data['labels'].str.split().map(set)
for cls in classes:
    data[cls] = label_sets.map(lambda present, cls=cls: int(cls in present))

# Features are the cleaned tweet texts; targets are the per-class 0/1 columns.
X, y = data['tweet'], data[classes]

# First hold out 30 % of the rows, then cut that hold-out in half so that the
# final proportions are 70 % train / 15 % validation / 15 % test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=42
)



  data['tweet'] = data['tweet'].str.replace('[^\w\s]', '')


In [None]:
# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# One output logit per class. `problem_type` is set explicitly so the model's
# built-in loss is always the multi-label (BCE-with-logits) one, instead of
# being inferred from the dtype of the first batch of labels it sees.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(classes),
    problem_type='multi_label_classification',
)



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the input tweets
def tokenize_tweets(text_list):
    """Encode a list of tweet strings with the notebook-level `tokenizer`.

    Args:
        text_list: list of raw tweet strings.

    Returns:
        The tokenizer's batch encoding as PyTorch tensors, including
        `input_ids` and `attention_mask`. Sequences are padded
        (`padding=True`) and truncated (`truncation=True`) so the batch is
        rectangular.
    """
    encode_options = dict(
        add_special_tokens=True,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.batch_encode_plus(text_list, **encode_options)



In [None]:
# Encode each split separately (train, validation, held-out test).
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    tokenize_tweets(split.tolist()) for split in (X_train, X_val, X_test)
)

# Targets as float32 tensors, one row of 0/1 indicators per tweet.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(targets.values, dtype=torch.float32)
    for targets in (y_train, y_val, y_test)
)

batch_size = 16


def _as_dataset(encodings, targets):
    """Bundle token ids, attention mask and targets into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], targets)


train_data = _as_dataset(X_train_tokenized, y_train_tensor)
val_data = _as_dataset(X_val_tokenized, y_val_tensor)
test_data = _as_dataset(X_test_tokenized, y_test_tensor)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)





In [None]:
# Number of full passes over the training set.
epochs = 20

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5)
# NOTE: the training loop uses the loss returned by the model itself
# (`outputs.loss`); this object is not referenced there.
loss_function = BCEWithLogitsLoss()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)





RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
for epoch in range(epochs):
    model.train()
    train_loss = 0.0  # sum of per-batch losses for this epoch

    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # Passing `labels` makes the model compute and return the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean loss per batch over the epoch.
    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")



Epoch 1:  15%|█▍        | 64/434 [00:47<04:24,  1.40it/s]

In [None]:
# Collect per-class probabilities and true labels for the validation split.
model.eval()
val_preds, val_labels = [], []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validation"):
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Sigmoid turns each logit into an independent per-class probability.
        # Extending with a 2-D array appends one 1-D array per sample.
        probabilities = torch.sigmoid(logits)
        val_preds.extend(probabilities.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())



In [None]:
# Threshold predictions to binary values (0 or 1).
# `val_preds` is a list of per-sample numpy arrays; stack them into one tensor
# first, because `torch.tensor()` on a list of ndarrays converts element by
# element and is very slow (PyTorch warns about exactly this).
val_probs = torch.stack([torch.as_tensor(sample) for sample in val_preds])
val_preds = (val_probs >= 0.5).int().numpy()

print("Classification Report (Validation Set):")
print(classification_report(val_labels, val_preds, target_names=classes))


In [None]:
# Read the test.csv file
test_data = pd.read_csv('test.csv')
# Preview only the first rows instead of rendering the whole frame.
test_data.head()

In [None]:
# Clean the tweet data: drop every character that is neither a word character
# nor whitespace, then lowercase.
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
test_data['tweet'] = test_data['tweet'].str.replace(r'[^\w\s]', '', regex=True)
test_data['tweet'] = test_data['tweet'].str.lower()

In [None]:
# Encode the cleaned tweets currently held in the `test_data` frame.
test_tweets = test_data['tweet'].tolist()
X_test_tokenized = tokenize_tweets(test_tweets)

# Unlabelled dataset: token ids and attention mask only.
# NOTE: this rebinds `test_data` from a DataFrame to a TensorDataset, so the
# frame must be re-read from test.csv before this cell can be run again.
test_data = TensorDataset(
    X_test_tokenized['input_ids'],
    X_test_tokenized['attention_mask'],
)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size)

In [None]:
# Predict the classes
model.eval()
predicted_classes = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Use the same `>= 0.5` cut-off as the validation report. `.round()`
        # rounds half to even, so a probability of exactly 0.5 would be mapped
        # to 0 here but to 1 during validation. The result stays a float 0./1.
        # array per sample, as before.
        probabilities = torch.sigmoid(logits)
        predicted_classes.extend((probabilities >= 0.5).float().cpu().numpy())

In [None]:
# Turn every 0/1 indicator row into the list of class names whose indicator is 1.
# Position i in a row corresponds to classes[i].
predicted_classes_text = [
    [cls for i, cls in enumerate(classes) if row[i] == 1]
    for row in predicted_classes
]


In [None]:

# Rows for which no class was predicted get the explicit 'none' label.
for row_labels in predicted_classes_text:
    if not row_labels:
        row_labels.append('none')

In [None]:
# One space-separated label string per tweet. `" ".join` already returns the
# bare label for single-element lists, so no special case is needed. An empty
# list falls back to 'none' instead of raising IndexError.
processed_data = [
    " ".join(sublist) if sublist else "none"
    for sublist in predicted_classes_text
]


In [None]:
# Single-column frame of predicted label strings, one row per test tweet.
# (The leftover debugging expression `type(test_data)` was removed.)
df = pd.DataFrame({"pred_classes": processed_data})

In [None]:
# Re-read the original test file and attach the predictions by row index.
test = pd.read_csv('test.csv')
merged_df = test.merge(df, left_index=True, right_index=True)

In [None]:

# Drop the tweet text from the output. `errors='ignore'` keeps the cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
merged_df = merged_df.drop(columns=['tweet'], errors='ignore')

In [None]:
# Write the predictions as a comma-separated file (the default separator)
# without the index column.
merged_df.to_csv('predictions.csv', index=False)

In [None]:
# Preview the first five rows of the merged predictions frame.
merged_df.head(5)