In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/global-politics-new-4000/global political data new (1).csv
/kaggle/input/global-politics/global political data new.csv


In [2]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel, AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk


In [3]:
# Download the NLTK resources used by preprocess_text.
# 'punkt' and 'stopwords' are what the original notebook fetched; recent NLTK
# releases look up the tokenizer data for word_tokenize under 'punkt_tab'
# instead, so fetch that as well to keep tokenization working across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Data Preprocessing
_ENGLISH_STOP_WORDS = None  # filled on first use so the corpus is read once, not once per tweet


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on the first call only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize one piece of text for the classifier.

    Steps, in order: lowercase, strip URLs (anything starting with ``http``),
    replace every non-alphabetic character with a space, tokenize with
    ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw text. Non-string values are tolerated: ``None`` and
            float NaN (what pandas yields for an empty CSV cell) become the
            empty string, anything else is converted with ``str``.

    Returns:
        The remaining tokens joined by single spaces (possibly ``""``).
    """
    if not isinstance(text, str):
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = '' if is_missing else str(text)

    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters

    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [5]:
# Read the tweet dataset, then replace the raw 'tweets' column with its cleaned text
data = pd.read_csv("/kaggle/input/global-politics-new-4000/global political data new (1).csv")

data['tweets'] = [preprocess_text(raw_tweet) for raw_tweet in data['tweets']]


In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)


In [7]:
#!pip install --upgrade transformers



In [8]:
# Load the pretrained RoBERTa tokenizer that matches the 'roberta-base' encoders

tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path='roberta-base')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [9]:
# Define CustomDataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one row of a tweets dataframe per item.

    The dataframe must expose a ``tweets`` column (the text) and a ``target``
    column (the label). Each item is a dict of tensors with the keys
    ``input_ids``, ``attention_mask``, ``token_type_ids`` (all ``long``) and
    ``targets`` (``float``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Pull the columns out once as arrays so item lookup is plain indexing.
        self.text = dataframe.tweets.values
        self.targets = dataframe.target.values
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Every text is padded/truncated to exactly max_len tokens.
        encoding = self.tokenizer.encode_plus(
            str(self.text[index]),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        item = {
            key: torch.tensor(encoding[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item

# Define Encoder-Decoder model
class EncoderDecoder(nn.Module):
    """Several RoBERTa encoders whose pooled outputs are concatenated and decoded.

    Every encoder sees the same input. Their pooled outputs are concatenated
    along the feature axis, passed through dropout, and fed to each decoder;
    the decoder outputs are then averaged.

    Args:
        num_layers: Number of encoders and of decoders to create.
        pretrained_model_name: Checkpoint name passed to
            ``RobertaModel.from_pretrained``.
        num_classes: Output size of every decoder.
        hidden_size: Hidden size of every decoder.
    """

    def __init__(self, num_layers, pretrained_model_name, num_classes, hidden_size):
        super(EncoderDecoder, self).__init__()
        self.encoders = nn.ModuleList(
            [RobertaModel.from_pretrained(pretrained_model_name) for _ in range(num_layers)]
        )
        # forward() hands each decoder the concatenation of ALL pooled outputs,
        # so the decoder input width is the sum of the encoder hidden sizes.
        # Sizing it from a single encoder only works when num_layers == 1.
        concat_size = sum(encoder.config.hidden_size for encoder in self.encoders)
        self.decoders = nn.ModuleList(
            [SentimentDecoder(concat_size, hidden_size, num_classes) for _ in range(num_layers)]
        )
        self.drop = nn.Dropout(p=0.3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the mean of the decoder outputs for one batch."""
        pooled_outputs = [
            encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            ).pooler_output
            for encoder in self.encoders
        ]
        # Concatenate pooled outputs from all encoders, then apply dropout
        pooled_output = self.drop(torch.cat(pooled_outputs, dim=1))
        # The decoders take a leading axis of length 1 in front of the batch axis
        decoder_input = pooled_output.unsqueeze(0)
        decoder_outputs = [decoder(decoder_input, None)[0] for decoder in self.decoders]
        # Average the decoder outputs from all layers
        return torch.stack(decoder_outputs).mean(dim=0)


In [10]:
class SentimentDecoder(nn.Module):
    """GRU followed by a linear layer that emits log-probabilities.

    Args:
        input_size: Feature width of the GRU input.
        hidden_size: Hidden width of the GRU (and input width of the linear layer).
        output_size: Number of scores produced per example.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run the GRU and score its first output step.

        Returns:
            A ``(log_probs, hidden)`` tuple: ``log_probs`` is the log-softmax
            (over dim 1) of the linear layer applied to the first element
            along the GRU output's leading axis; ``hidden`` is the GRU's
            final hidden state.
        """
        gru_out, next_hidden = self.gru(input, hidden)
        first_step = gru_out[0]  # only the first element along the leading axis is scored
        scores = self.out(first_step)
        return torch.log_softmax(scores, dim=1), next_hidden

class StackedTransformer(nn.Module):
    """Ensemble of independent RoBERTa encoders, each paired with its own decoder.

    Every encoder receives the same batch. Encoder ``i``'s pooled output goes
    through dropout and into decoder ``i``; the decoder outputs are averaged
    across the pairs.

    Args:
        num_layers: Number of encoder/decoder pairs.
        pretrained_model_name: Checkpoint name passed to
            ``RobertaModel.from_pretrained``.
        num_classes: Output size of every decoder.
        hidden_size: Hidden size of every decoder.
    """

    def __init__(self, num_layers, pretrained_model_name, num_classes, hidden_size):
        super(StackedTransformer, self).__init__()
        self.num_layers = num_layers
        self.encoders = nn.ModuleList([RobertaModel.from_pretrained(pretrained_model_name) for _ in range(num_layers)])
        self.decoders = nn.ModuleList([SentimentDecoder(self.encoders[0].config.hidden_size, hidden_size, num_classes) for _ in range(num_layers)])
        self.drop = nn.Dropout(p=0.3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the averaged decoder output, always keeping the batch axis.

        The result has one row per input example, including when the batch
        holds a single example. The previous trailing ``squeeze(0)`` dropped
        the batch axis in that case, which breaks ``CrossEntropyLoss`` on a
        final batch of size 1.
        """
        # Run every encoder first (same order as before), collecting pooled outputs.
        pooled_per_encoder = [
            encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            ).pooler_output
            for encoder in self.encoders
        ]

        decoder_outputs = []
        for decoder, pooled in zip(self.decoders, pooled_per_encoder):
            # The decoders take a leading axis of length 1 in front of the batch axis.
            decoder_output, _ = decoder(self.drop(pooled).unsqueeze(0), None)
            decoder_outputs.append(decoder_output)

        # Stack the per-pair outputs on a new axis 1 and average that axis away.
        return torch.stack(decoder_outputs, dim=1).mean(dim=1)


In [11]:
# Training parameters
SEED = 42  # same value as the random_state used for the train/test split
MAX_LEN = 128  # every tweet is padded/truncated to this many tokens
BATCH_SIZE = 16
NUM_CLASSES = 2
LEARNING_RATE = 2e-5
EPOCHS = 30
NUM_LAYERS = 4  # Number of encoder-decoder pairs
HIDDEN_SIZE = 512  # Hidden size of each decoder

# Seed PyTorch before the model and data loaders are created so that weight
# initialization, dropout and batch shuffling are repeatable between runs.
torch.manual_seed(SEED)


In [12]:
# Usage: build the model from the shared constants (rather than repeating the
# literals 4 and 512) so it always matches the architecture rebuilt for loading.
model = StackedTransformer(num_layers=NUM_LAYERS, pretrained_model_name='roberta-base', num_classes=NUM_CLASSES, hidden_size=HIDDEN_SIZE)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

In [13]:
# Pick the GPU when one is available, otherwise fall back to the CPU, and move the model there
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

In [14]:

# Initialize optimizer and criterion.
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so use torch.optim.AdamW. eps and weight_decay are
# set explicitly to transformers.AdamW's defaults (PyTorch's own defaults are
# 1e-8 and 0.01), which keeps the optimization settings unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()



In [15]:

# Wrap the train/test frames as tokenizing datasets and batch them;
# only the training batches are shuffled.
train_dataset = CustomDataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = CustomDataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [16]:
# Training loop: one optimizer step per batch, with the mean batch loss and
# the training accuracy reported at the end of every epoch.
for epoch in range(EPOCHS):
    model.train()
    epoch_loss, correct, total = 0, 0, 0
    for batch_idx, batch in enumerate(train_loader):
        input_ids, attention_mask, token_type_ids = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )
        # The dataset stores labels as float; the loss needs integer class indices.
        targets = batch['targets'].to(device, dtype=torch.long)

        # Forward pass, backward pass, parameter update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Running statistics for this epoch
        epoch_loss += loss.item()
        predicted = outputs.max(dim=1).indices
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = correct / total
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {epoch_loss / len(train_loader)}, Accuracy: {accuracy}')


Epoch 1/30, Loss: 0.6570495637220757, Accuracy: 0.5971896955503513
Epoch 2/30, Loss: 0.5191864074669151, Accuracy: 0.7555620608899297
Epoch 3/30, Loss: 0.390770684057307, Accuracy: 0.8348946135831382
Epoch 4/30, Loss: 0.3294785905838291, Accuracy: 0.8665105386416861
Epoch 5/30, Loss: 0.27304428218750754, Accuracy: 0.900175644028103
Epoch 6/30, Loss: 0.21445917289390742, Accuracy: 0.9247658079625293
Epoch 7/30, Loss: 0.16892693020298938, Accuracy: 0.9417447306791569
Epoch 8/30, Loss: 0.14579015245191126, Accuracy: 0.9487704918032787
Epoch 9/30, Loss: 0.10696816187195271, Accuracy: 0.9625292740046838
Epoch 10/30, Loss: 0.08286511351650354, Accuracy: 0.969847775175644
Epoch 11/30, Loss: 0.062353569989214506, Accuracy: 0.9795081967213115
Epoch 12/30, Loss: 0.051820495944887954, Accuracy: 0.9815573770491803
Epoch 13/30, Loss: 0.043238260986709916, Accuracy: 0.9879976580796253
Epoch 14/30, Loss: 0.031742346910411647, Accuracy: 0.9909250585480094
Epoch 15/30, Loss: 0.030438396984506746, Accur

In [None]:
# Location of the saved weights inside the Kaggle working directory
model_save_path = "/kaggle/working/stacked_transformer_model.pth"

# Persist only the state dict (the weights), not the whole model object
torch.save(obj=model.state_dict(), f=model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
# Define the path where the model is saved
model_load_path = "/kaggle/working/stacked_transformer_model.pth"

# Recreate the model architecture
loaded_model = StackedTransformer(num_layers=NUM_LAYERS, pretrained_model_name='roberta-base', num_classes=NUM_CLASSES, hidden_size=HIDDEN_SIZE)

# Load the state dictionary into the model. map_location remaps the stored
# tensors onto the current device, so a checkpoint written from a GPU session
# can still be loaded when only a CPU is available.
loaded_model.load_state_dict(torch.load(model_load_path, map_location=device))
loaded_model = loaded_model.to(device)
print("Model loaded successfully")

In [None]:
# Measure the reloaded model's accuracy on the held-out test batches
loaded_model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, token_type_ids = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = loaded_model(input_ids, attention_mask, token_type_ids)

        # Restore the batch axis if the model returned a 1-D tensor
        if outputs.dim() == 1:
            outputs = outputs.unsqueeze(0)
        if outputs.dim() != 2:
            raise ValueError(f'Unexpected output shape: {outputs.shape}')

        predicted = outputs.max(dim=1).indices
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy}')


In [None]:
# Prediction function remains the same
def predict_sentiment(text, model, tokenizer, max_len):
    """Classify one piece of text as "Positive" or "Negative".

    The tweets used for training and testing were cleaned with
    ``preprocess_text`` before tokenization, so the same cleaning is applied
    here; feeding raw text would give the model input unlike what it was
    trained on.

    Args:
        text: The raw text to classify.
        model: Model called as ``model(input_ids, attention_mask,
            token_type_ids)`` and returning one score per class.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every input is padded/truncated to.

    Returns:
        "Positive" when the highest-scoring class index is 1, otherwise
        "Negative" (assumes label 1 marks positive tweets in the training
        data - TODO confirm against the dataset).
    """
    # Use the device the model actually lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    cleaned_text = preprocess_text(str(text))

    model.eval()
    with torch.no_grad():
        inputs = tokenizer.encode_plus(
            cleaned_text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0).to(target_device)
        attention_mask = torch.tensor(inputs['attention_mask']).unsqueeze(0).to(target_device)
        token_type_ids = torch.tensor(inputs["token_type_ids"]).unsqueeze(0).to(target_device)
        outputs = model(input_ids, attention_mask, token_type_ids)

        # Accept both (num_classes,) and (1, num_classes) results for the single example.
        outputs = outputs.view(1, -1)
        # Softmax is monotonic, so the arg-max of the raw scores is the predicted class.
        predicted_index = outputs.argmax(dim=1)
    return "Positive" if predicted_index.item() == 1 else "Negative"

# Example texts to classify; each entry is passed to predict_sentiment below.
new_text = [
    "You have to go down to the hamlets without your entourage, hey. We'll show you... Many people who are in poverty... Not from now on, but for a long time... People have a vote bank. Come up with a rule that those above the age of 50 should not be made ministers. There are many knowledgeable and educated young people in Kerala who are capable of holding the wheel of governance... There is no need for any political legacy... We'll rule and show you. A poverty-free, waste-free Kerala. It's like it's got alfaham in Koren's hand now...A multi fungal virus in Kerala Poli- Trics...",
    "What is that Plan B, sir? Let the surprise come out, sir. Are you going to shave off everything you've taken away and leave the country? Or is it to be declared a bankrupt? Or is it that al-Kerala will establish the kingdom? With all this in mind, the NIA is building a massive office in Kochi. If there is a raid that started in Kochi today, it is the first step to demolish this Plan B... Beware...",
    "The people who witnessed the incident told the truth, so others understand that he was innocent."]

# Classify every sample text with the reloaded model, then print text/label pairs
predicted_sentiments = [
    predict_sentiment(tweet, loaded_model, tokenizer, MAX_LEN) for tweet in new_text
]

for tweet, sentiment in zip(new_text, predicted_sentiments):
    print(f"{tweet}  :  {sentiment}\n")

In [17]:
# Evaluation: accuracy of the in-memory model on the held-out test batches
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, token_type_ids = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids, attention_mask, token_type_ids)

        # A 1-D result is given a batch axis; anything other than 2-D after
        # that is rejected.
        if outputs.dim() == 1:
            outputs = outputs.unsqueeze(0)
        if outputs.dim() != 2:
            raise ValueError(f'Unexpected output shape: {outputs.shape}')

        predicted = outputs.max(dim=1).indices
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy}')

Accuracy: 0.8395784543325527


In [24]:
# Prediction function remains the same
def predict_sentiment(text, model, tokenizer, max_len):
    """Classify one piece of text as "Positive" or "Negative".

    The tweets used for training and testing were cleaned with
    ``preprocess_text`` before tokenization, so the same cleaning is applied
    here; feeding raw text would give the model input unlike what it was
    trained on.

    Args:
        text: The raw text to classify.
        model: Model called as ``model(input_ids, attention_mask,
            token_type_ids)`` and returning one score per class.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every input is padded/truncated to.

    Returns:
        "Positive" when the highest-scoring class index is 1, otherwise
        "Negative" (assumes label 1 marks positive tweets in the training
        data - TODO confirm against the dataset).
    """
    # Use the device the model actually lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    cleaned_text = preprocess_text(str(text))

    model.eval()
    with torch.no_grad():
        inputs = tokenizer.encode_plus(
            cleaned_text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0).to(target_device)
        attention_mask = torch.tensor(inputs['attention_mask']).unsqueeze(0).to(target_device)
        token_type_ids = torch.tensor(inputs["token_type_ids"]).unsqueeze(0).to(target_device)
        outputs = model(input_ids, attention_mask, token_type_ids)

        # Accept both (num_classes,) and (1, num_classes) results for the single example.
        outputs = outputs.view(1, -1)
        # Softmax is monotonic, so the arg-max of the raw scores is the predicted class.
        predicted_index = outputs.argmax(dim=1)
    return "Positive" if predicted_index.item() == 1 else "Negative"


# Sample comments to classify; several long passages appear alongside their
# individual sentences. Each entry is passed to predict_sentiment below.
new_text = [
    "* I'm not the smartest guy and i hardly know everything, but i have always been a keen student history and a follower of politics, particularly US politics for 30 years. What is chilling, and believe me i did get the chills while watching this video, is that I 100% agree with his analysis. He vocalized succinctly what has been causing me great anxiety deep down for a long time about US and global politics. An anxiety that has been affecting me in a way it has never affected me before. Too many people underestimate the very real potential threat Trump and his ilk pose to not only US, but also global stability. Quite frankly it is downright frightening. No joke. And all of it is the result of what the state of the U.S. political system, among other things, has been allowed to degrade to over the last few decades. Of course it was never perfect, and there have been many tense global events in the decades since WW2, but now we really are on a real precipice looking over the edge into the abyss like never before. Yes, some might call it 'Trump delusion syndrome', or that i am exaggerating, but honestly, deep down, after much self reflection....i don't think i am. Let's just hope the proverbial coin flip lands right side up, however that needs to play out.",
    "* Indian policy consider leader nation representative. Right Biden president represents country. Personal friendships room politics.",
    "* This needs to be a criminal offence. As does lying to the public if an MP or a news outlet.",
    "* Quite frankly it is downright frightening. No joke.",
    "* The people who witnessed the incident told the truth, so others understand that he was innocent.",
    "* It result policies party followed powerful party country become nothing Indian politics today. Only party good country people survive, whichever party is.",
    "* The very objective of the Sangh Parivar is to create communal riots across the country.......yet Satheesan and his gang don't understand!Babri Masjid has disappeared as a result of these people's silence! It is true that minorities should vote carefully when the election is near. Because if the Sunkis come to power, the country itself will be brought to Kavi! It will be dangerous! What the Chief Minister said is correct. What is the doubt about that.....",
    "* Yes the former president of the Philippines at one time jokingly said that he wanted the Philippines to become one of the provinces of china. Many Filipinos believe that if the dutertes will have a chance to regain power the Philippines will become province of china.",
    "* Too many people underestimate the very real potential threat Trump and his ilk pose to not only US, but also global stability. Quite frankly it is downright frightening. No joke. And all of it is the result of what the state of the U.S. political system, among other things, has been allowed to degrade to over the last few decades. Of course it was never perfect, and there have been many tense global events in the decades since WW2, but now we really are on a real precipice looking over the edge into the abyss like never before. Yes, some might call it 'Trump delusion syndrome', or that i am exaggerating, but honestly, deep down, after much self reflection....i don't think i am. Let's just hope the proverbial coin flip lands right side up, however that needs to play out.",
    "* The coordinated naval exercises in the South China Sea demonstrate unity and commitment to international norms, which is commendable. They uphold freedom of navigation, a vital principle for global trade and security. However, while these actions aim to deter unilateral aggression and promote stability, there's a risk of escalating tensions in the region. The involvement of multiple naval forces increases the potential for accidents and unintended conflicts. Despite the importance of regional partnerships for enhancing peace and security in the Asia-Pacific, the militarization of the area could hinder diplomatic efforts for peaceful resolutions. Therefore, while military readiness is essential, it must be balanced with diplomatic dialogue to prevent further escalation and ensure a peaceful resolution to the disputes in the South China Sea",
    "* The coordinated naval exercises in the South China Sea demonstrate unity and commitment to international norms, which is commendable.",
    "* They uphold freedom of navigation, a vital principle for global trade and security.",
    "* The actions aim to deter unilateral aggression and promote stability.", 
    "* There's a risk of escalating tensions in the region.",
    "* The potential for accidents and unintended conflicts are high due to involvement of multiple naval forces.",
    "* Concern about the hindrance of diplomatic efforts by the militarization of the area enhancing peace and security in the Asia-Pacific",
    "* The militarization of the area could hinder diplomatic efforts for peaceful resolutions ", 
    "* Military readiness is essential, it must be balanced with diplomatic dialogue to prevent further escalation and ensure a peaceful resolution to the disputes in the South China Sea",
    "* The government's new climate change policy demonstrates a willingness to address environmental issues, which is necessary.",
    "* Investing in renewable energy and setting emissions targets are positive moves.",
    "* The policy lacks the aggressive measures needed to mitigate the rapid effects of climate change.",
    "* The intention is commendable, the policy falls short in enforcing stringent regulations and providing clear pathways for significant, immediate reductions in greenhouse gases." ,
    "* The reliance on voluntary compliance from industries raises doubts about its effectiveness.",
    "* This could lead to continued environmental degradation and missed opportunities to avert more severe climate impacts.",
    "* The government's new climate change policy demonstrates a willingness to address environmental issues, which is necessary. Investing in renewable energy and setting emissions targets are positive moves. Nevertheless, the policy lacks the aggressive measures needed to mitigate the rapid effects of climate change. The reliance on voluntary compliance from industries raises doubts about its effectiveness. While the intention is commendable, the policy falls short in enforcing stringent regulations and providing clear pathways for significant, immediate reductions in greenhouse gases. This could lead to continued environmental degradation and missed opportunities to avert more severe climate impacts.",
    "* The unpredictability of Trump?  Really?  Simple.  He's gonna do what he says  he's gonna do, it will be in the best interest of the USA,  and whatever he does, it will have the focus of making America great!  Low gas prices, thriving economy, NO NEW WARS, peace treaties, fight the deep set special interests, finished boarder wall.. I WISH we had a politician with his fortitude here in Canada.  TRUMP 2024!!!"]
# Classify every sample text with the in-memory model, then print text/label pairs
predicted_sentiments = [
    predict_sentiment(tweet, model, tokenizer, MAX_LEN) for tweet in new_text
]

for tweet, sentiment in zip(new_text, predicted_sentiments):
    print(f"{tweet}  :  {sentiment}\n")

* I'm not the smartest guy and i hardly know everything, but i have always been a keen student history and a follower of politics, particularly US politics for 30 years. What is chilling, and believe me i did get the chills while watching this video, is that I 100% agree with his analysis. He vocalized succinctly what has been causing me great anxiety deep down for a long time about US and global politics. An anxiety that has been affecting me in a way it has never affected me before. Too many people underestimate the very real potential threat Trump and his ilk pose to not only US, but also global stability. Quite frankly it is downright frightening. No joke. And all of it is the result of what the state of the U.S. political system, among other things, has been allowed to degrade to over the last few decades. Of course it was never perfect, and there have been many tense global events in the decades since WW2, but now we really are on a real precipice looking over the edge into the a

In [20]:
# Show the current value of the notebook-level `accuracy` variable, i.e. whatever
# the most recently executed training or evaluation cell assigned to it.
print(accuracy)

0.8395784543325527
