In [1]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK resources this notebook relies on: the English stop-word list
# and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Stored as a set so the per-token `in stop_words` check in preprocess() is a
# constant-time lookup instead of a linear scan over the whole list.
stop_words = set(stopwords.words('english'))
# Porter stemmer shared by every preprocess() call.
prt = nltk.stem.PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vahan.yeghoyan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vahan.yeghoyan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
# Dry traversal of the dataset folder: every file is visited but nothing is
# read, collected or printed, so this cell has no effect besides binding the
# loop variables.
for dirname, _, filenames in os.walk("C:\\Users\\vahan.yeghoyan\\Desktop\\projects\\roBERTa\\data\\archive"):
    for filename in filenames:
        pass

In [3]:
def preprocess(document_path):
    """Read a UTF-8 text file and return its normalised, stemmed content.

    The file is tokenised with ``nltk.word_tokenize``. Tokens that are not
    purely alphanumeric are discarded, the rest are lower-cased, tokens found
    in the module-level ``stop_words`` are removed, and the survivors are
    stemmed with the module-level ``prt`` stemmer.

    Args:
        document_path: Path of the text file to read.

    Returns:
        A single string with the stemmed terms joined by one space.
    """
    with open(document_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    stems = []
    for token in nltk.word_tokenize(raw_text):
        # The alphanumeric test is applied to the token as it appears in the text.
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        stems.append(prt.stem(lowered))

    return " ".join(stems)

In [5]:
# One row per file found under the archive folder:
#   [file name, preprocessed text, class label].
# The class label is the part of the file name before the first underscore,
# lower-cased.
Data = [
    [filename, preprocess(os.path.join(dirname, filename)), filename.split('_')[0].lower()]
    for dirname, _, filenames in os.walk("C:\\Users\\vahan.yeghoyan\\Desktop\\projects\\roBERTa\\data\\archive")
    for filename in filenames
]

df = pd.DataFrame(Data, columns=['Title', 'Document', 'Class'])

In [6]:
# Encode class names as integer ids. sort=True numbers the classes in sorted
# (alphabetical) order of their names, so the id -> name mapping no longer
# depends on the order in which os.walk returned the files and lines up with
# the alphabetical label list hard-coded in predict_class(). Re-running the
# cell on already-encoded ids leaves them unchanged.
df['Class'] = pd.factorize(df['Class'], sort=True)[0]

# errors="ignore" keeps the cell re-runnable once "Title" is already gone;
# a plain drop would raise KeyError on the second execution.
df = df.drop(columns="Title", errors="ignore")

# 80/20 train/validation split with a fixed seed for reproducibility.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


In [8]:
# Repeat of the encode/drop/split cell above. Written to be idempotent so that
# running it after that cell neither fails nor changes the label ids:
#   * sort=True maps integer ids 0..k-1 onto themselves (and class-name
#     strings onto their alphabetical rank);
#   * errors="ignore" tolerates "Title" having been dropped already.
df['Class'] = pd.factorize(df['Class'], sort=True)[0]
df = df.drop(columns="Title", errors="ignore")
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
def tokenize_text(texts, tokenizer, max_length):
    """Encode an iterable of strings into stacked id and attention-mask tensors.

    Each text is encoded on its own with ``tokenizer.encode_plus``, padded and
    truncated to ``max_length``; the resulting one-row tensors are concatenated
    along dimension 0.

    Args:
        texts: Iterable of strings (a list or a NumPy array of strings).
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Sequence length every text is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``. For empty ``texts`` both are
        empty ``torch.long`` tensors of shape ``(0, max_length)``.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    # torch.cat refuses an empty list, so hand back well-formed empty tensors.
    # The check happens after the loop because `texts` may be a NumPy array,
    # whose truth value is ambiguous.
    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [10]:
# Hyper-parameters.
max_length = 128
batch_size = 8
# Used to size the learning-rate schedule below; must equal the number of
# epochs the training loop actually runs, otherwise the linear decay reaches
# zero too early or too late.
num_epochs = 5

# One output unit per distinct class id present in the data, instead of a
# hard-coded count that silently drifts when the dataset changes.
num_labels = int(df['Class'].nunique())

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# Encode both splits to fixed-length id / attention-mask tensors.
train_input_ids, train_attention_masks = tokenize_text(train_df['Document'].values, tokenizer, max_length)
val_input_ids, val_attention_masks = tokenize_text(val_df['Document'].values, tokenizer, max_length)

# Class ids are made int64 explicitly so the tensor dtype does not depend on
# the platform's default integer width.
train_labels = torch.tensor(train_df['Class'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['Class'].values, dtype=torch.long)

# Only the training loader is shuffled; validation keeps a fixed order.
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_loader = DataLoader(val_data, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# Linear decay without warm-up across every optimizer step of the whole run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
num_epochs = 5

# Batches are moved to whatever device the model already lives on, so the loop
# works unchanged on CPU and on an accelerator.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # The schedule is stepped once per optimizer step, not once per epoch.
        scheduler.step()

        train_loss += loss.item()

    # ---- validation pass ----
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            # Count individual samples: averaging per-batch accuracies would
            # over-weight a smaller final batch.
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    val_accuracy = correct / total if total else 0.0

    print(f"Epoch {epoch + 1}:")
    print(f"  Training Loss: {train_loss / len(train_loader)}")
    print(f"  Validation Accuracy: {val_accuracy}")


# Persist only the weights (state dict), not the full model object.
torch.save(model.state_dict(), "C:\\Users\\vahan.yeghoyan\\Desktop\\projects\\roBERTa\\data\\multi_label.pth")
print("Training complete!")





Epoch 1: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [04:53<00:00,  2.94s/it]
Validation: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:21<00:00,  1.14it/s]


Epoch 1:
  Training Loss: 1.279212231785059
  Validation Accuracy: 0.895


Epoch 2: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [04:54<00:00,  2.95s/it]
Validation: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:21<00:00,  1.19it/s]


Epoch 2:
  Training Loss: 0.2778268164396286
  Validation Accuracy: 0.95


Epoch 3: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [04:40<00:00,  2.81s/it]
Validation: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:20<00:00,  1.24it/s]


Epoch 3:
  Training Loss: 0.1282384243234992
  Validation Accuracy: 0.965


Epoch 4: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [04:55<00:00,  2.95s/it]
Validation: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:20<00:00,  1.23it/s]


Epoch 4:
  Training Loss: 0.0779253009893
  Validation Accuracy: 0.965


Epoch 5: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [04:48<00:00,  2.88s/it]
Validation: 100%|██████████████████████████████████████████████████████████████████████| 25/25 [00:21<00:00,  1.18it/s]


Epoch 5:
  Training Loss: 0.043973140716552735
  Validation Accuracy: 0.96
Training complete!


In [14]:
def predict_class(text):
    """Predict the topic label of a raw text with the fine-tuned classifier.

    The text is first normalised exactly as preprocess() normalises the
    training documents (alphanumeric tokens only, lower-cased, stop words
    removed, Porter-stemmed), so the model sees input of the same form it was
    trained on. It is then tokenised with the module-level ``tokenizer`` and
    scored by the module-level ``model`` in evaluation mode.

    Args:
        text: Raw document text.

    Returns:
        The predicted class name as a string.
    """
    # NOTE(review): assumes class ids follow the alphabetical order of the
    # class names -- verify against the factorize step that encodes labels.
    class_names = ['business', 'entertainment','food', 'graphics','historical','medical', 'politics','space','sport','technologie']

    # Same normalisation as preprocess(), applied to an in-memory string.
    normalised = " ".join(
        prt.stem(token.lower())
        for token in nltk.word_tokenize(text)
        if token.isalnum() and token.lower() not in stop_words
    )

    inputs = tokenizer(normalised, return_tensors='pt', max_length=128, truncation=True, padding=True)

    # Keep the inputs on the same device as the model's parameters.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Evaluation mode disables dropout so predictions are deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    return class_names[predicted_label]


# Sample essay about space exploration, used as a quick manual check of
# predict_class().
text_to_analyze = """
Space, the vast expanse that extends beyond our planet, has been a source of wonder and curiosity for as long as humans have gazed up at the night sky. It is a realm that both inspires and challenges us, pushing the boundaries of our knowledge and understanding.

The cosmos is a place of immense scale and grandeur. It encompasses not only the countless stars that adorn the night sky but also galaxies, nebulae, black holes, and other celestial wonders. These celestial bodies and phenomena have captivated astronomers and space enthusiasts for centuries, driving scientific exploration and discovery.

The exploration of space has been one of humanity's greatest achievements. The journey began with early astronomers charting the movements of the stars and planets. It reached a milestone in the mid-20th century with the launch of the first artificial satellites and the Space Age, marked by the iconic moment when humans set foot on the Moon during the Apollo missions. Since then, space agencies and private companies have continued to push the boundaries of space exploration, launching probes to study distant planets, rovers to explore the surfaces of other worlds, and telescopes that unveil the mysteries of the cosmos.

Space exploration is not only about satisfying our curiosity; it also holds practical benefits for Earth and humanity. Satellites orbiting the planet provide critical data for weather forecasting, communications, navigation, and monitoring environmental changes. Moreover, the study of asteroids and comets is essential for understanding potential threats to our planet and developing strategies for planetary defense.

The search for extraterrestrial life is a compelling aspect of space exploration. Scientists are scouring distant planets and moons for signs of habitability and evidence of life. The discovery of microbial life on Mars or the existence of water on distant moons could revolutionize our understanding of life's potential beyond Earth.

Space also serves as a platform for international cooperation. The International Space Station (ISS) is a testament to the collaboration between nations, where astronauts from different countries live and work together in the vacuum of space. This peaceful cooperation in space stands as a symbol of what can be achieved when countries come together to address common challenges.

As we look to the future, space exploration holds immense promise. Missions to return to the Moon and establish a sustainable presence there are on the horizon, paving the way for future missions to Mars and beyond. Private companies are driving innovation in space travel, making the dream of space tourism and colonization of other planets closer to reality.

In conclusion, space is a realm of boundless wonder and opportunity. It fuels our imagination, drives scientific progress, and unites nations in a shared endeavor. Our exploration of the cosmos is a testament to human curiosity and determination, and it continues to inspire generations to reach for the stars.
"""


# Classify the sample text and print the predicted label.
predicted_class = predict_class(text_to_analyze)
print("Predicted class:", predicted_class)

Predicted class: space
