In [1]:
import pandas as pd
import numpy as np


In [2]:
# Base names (without the .csv extension) of the exports to stack together.
file2import = ['Dec-Feb_23', 'Mar-May_23', 'Jun-Aug_23', 'Sep-Nov_23', 'Dec-Feb_24', 'Mar-May_24']

# Directory prefix of the exports. The trailing slash is required because the
# file paths below are built by plain string concatenation.
path = '/content/drive/MyDrive/CRP/data/'

# Read every export in list order.
dataframes = [pd.read_csv(f'{path}{file_name}.csv') for file_name in file2import]

# Stack all exports into one frame and renumber the rows 0..n-1.
data = pd.concat(dataframes, ignore_index=True)

In [3]:
# Engagement / reach columns kept for the analysis.
col4indicator = ['X Likes', 'X reposts', 'X followers', 'Impressions', 'Estimated reach']
# Columns needed for the sentiment work (timestamp, raw text, existing tone label).
col4sentiment = ['Date', 'Text','Tone']
# Keep all rows, but only the selected columns (sentiment columns first).
data_f = data.loc[:, [*col4sentiment, *col4indicator]]

In [4]:
# Display the column labels of the reduced frame.
data_f.columns

Index(['Date', 'Text', 'Tone', 'X Likes', 'X reposts', 'X followers',
       'Impressions', 'Estimated reach'],
      dtype='object')

In [5]:
# Display (rows, columns) of the reduced frame.
data_f.shape

(57936, 8)

In [6]:
# Exact-match de-duplication: rows whose 'Text' value repeats an earlier row
# are dropped (pandas keeps the first occurrence by default).
data_f = data_f.drop_duplicates(subset=['Text'])

In [7]:
# Display (rows, columns) after dropping rows with duplicated 'Text'.
data_f.shape

(56454, 8)

In [8]:
!pip install urlextract

Collecting urlextract
  Downloading urlextract-1.9.0-py3-none-any.whl (21 kB)
Collecting uritools (from urlextract)
  Downloading uritools-4.0.3-py3-none-any.whl (10 kB)
Installing collected packages: uritools, urlextract
Successfully installed uritools-4.0.3 urlextract-1.9.0


In [9]:
from urlextract import URLExtract
extractor = URLExtract()

def remove_urls(text):
    """Strip every URL that the module-level `extractor` detects from `text`.

    Non-string inputs (e.g. missing values) are returned unchanged so the
    function can be applied to a whole column without raising.

    Args:
        text: Value to clean; expected to be a str.

    Returns:
        `text` with each detected URL replaced by the empty string, or the
        original object when it is not a str.
    """
    if not isinstance(text, str):
        return text  # Return the original input if it's not a string
    # Remove longer URLs first: if one detected URL is a substring of another,
    # deleting the shorter one first would leave the tail of the longer one
    # behind in the text. set() avoids re-processing repeated URLs.
    for url in sorted(set(extractor.find_urls(text)), key=len, reverse=True):
        text = text.replace(url, '')
    return text

# Clean the 'Text' column element by element, overwriting it in place.
data_f['Text'] = data_f['Text'].apply(remove_urls)

In [10]:
!pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: datasketch
Successfully installed datasketch-1.6.5


In [11]:
from datasketch import MinHash, MinHashLSH
import pandas as pd
import re

# Number of permutations per MinHash signature; the same value is given to the
# LSH index below and to every MinHash built in create_minhash.
num_perm = 128
# Similarity threshold handed to MinHashLSH for near-duplicate lookups.
threshold = 0.90

# Index used to look up near-duplicate documents by their MinHash signature.
lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

def remove_hashtags_and_mentions(text):
    """Delete every `@mention` and `#hashtag` token from `text`.

    A token starts at '@' or '#' and runs up to the next whitespace character.
    Surrounding whitespace is left untouched.

    Args:
        text: Text to clean. Non-string values (e.g. NaN for missing text)
            are treated as empty text instead of raising a TypeError.

    Returns:
        The cleaned string, or '' when `text` is not a str.
    """
    if not isinstance(text, str):
        return ''
    return re.sub(r'[@#]\S+', '', text)

def create_minhash(text):
    """Build a MinHash signature from the whitespace-separated tokens of `text`.

    Args:
        text: String to fingerprint; it is split on whitespace.

    Returns:
        A MinHash created with the module-level `num_perm`, updated once per
        token with the token's UTF-8 bytes.
    """
    signature = MinHash(num_perm=num_perm)
    encoded_tokens = (token.encode('utf8') for token in text.split())
    for token_bytes in encoded_tokens:
        signature.update(token_bytes)
    return signature

# Build one MinHash signature per row and register it in the LSH index under
# the key "doc_<index label>".
minhashes = {}
# Only the 'Text' column is needed, so iterate over it directly instead of
# building a full row Series for every record with iterrows().
for idx, text in data_f['Text'].items():
    # Non-string text (e.g. NaN) would make the regex cleanup raise; treat it
    # as empty text so the loop can finish.
    clean_text = remove_hashtags_and_mentions(text if isinstance(text, str) else '')
    minhash = create_minhash(clean_text)
    lsh.insert(f"doc_{idx}", minhash)
    minhashes[idx] = minhash

# Walk the documents in insertion order. A document already marked as a
# duplicate is skipped; otherwise every other candidate the index returns
# for it is marked for removal.
to_remove = set()
for idx, minhash in minhashes.items():
    key = f"doc_{idx}"
    if key in to_remove:
        continue
    to_remove.update(dup for dup in lsh.query(minhash) if dup != key)

# Recover the integer index labels from the "doc_<label>" keys.
indices_to_remove = {int(x.split('_')[1]) for x in to_remove}

data_f = data_f.drop(indices_to_remove)


In [12]:
# Display (rows, columns) after removing near-duplicate texts.
data_f.shape

(43711, 8)

In [13]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

# Hugging Face checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Place the model on the GPU when one is available; otherwise leave it on the
# CPU instead of raising here on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def tokenize_texts(texts, max_length=512):
    """Tokenize `texts` with the module-level `tokenizer`.

    Args:
        texts: Text or batch of texts passed straight to the tokenizer.
        max_length: Maximum sequence length given to the tokenizer; with
            truncation enabled, longer inputs are cut to this length.

    Returns:
        The tokenizer output, produced with padding and truncation enabled and
        PyTorch tensors requested (`return_tensors="pt"`).
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenizer(texts, **tokenizer_options)

def custom_sentiment_analysis(texts, batch_size=32):
    """Run the module-level `model` over `texts` in batches.

    Args:
        texts: Sliceable sequence (e.g. a list) of input strings.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        A list with one NumPy array per input text, holding the softmax of the
        model's logits over the last dimension. Empty when `texts` is empty.
    """
    model.eval()
    # Send inputs to whichever device the model's weights are on rather than
    # assuming CUDA, so inputs and weights cannot end up on different devices.
    target_device = next(model.parameters()).device
    results = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[start:start + batch_size]
            inputs = tokenize_texts(batch_texts)
            inputs = {key: value.to(target_device) for key, value in inputs.items()}
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            # Move to host memory; extend() appends one row per text.
            results.extend(predictions.cpu().numpy())
    return results

# Score every remaining text; `results` holds one probability vector per row.
results = custom_sentiment_analysis(data_f['Text'].tolist())

# Class order assumed for the positions of each probability vector, and the
# signed weight each class contributes to the compound score.
labels = ['negative', 'neutral', 'positive']
weights = {'negative': -1, 'neutral': 0, 'positive': 1}

# Compound score: weighted sum of the class probabilities
# (with these weights, P(positive) - P(negative)).
compound_scores = [
    sum(pred[i] * weights[label] for i, label in enumerate(labels))
    for pred in results
]
# Predicted label: the class with the highest probability.
sentiment_labels = [labels[pred.argmax()] for pred in results]

data_f['Compound_Sentiment_Score'] = compound_scores
data_f['Sentiment'] = sentiment_labels

data_f.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1366/1366 [05:48<00:00,  3.92it/s]


Unnamed: 0,Date,Text,Tone,X Likes,X reposts,X followers,Impressions,Estimated reach,Compound_Sentiment_Score,Sentiment
0,28/02/2023 23:38:05,Europe is increasingly burning coal and wood b...,neutral,0,0,3435,3435,274,-0.776049,negative
1,28/02/2023 23:34:50,A case to keep an eye on.,neutral,0,0,8043,8043,643,0.157533,neutral
2,28/02/2023 23:07:48,Relentless public mobilization is making the b...,negative,0,0,107,107,8,-0.626536,negative
3,28/02/2023 23:05:21,"We shall NOT say silent, we shall NOT just loo...",neutral,0,0,693,693,55,-0.611185,negative
4,28/02/2023 22:58:01,"Court in France has described as ""inadmissible...",negative,0,1,650,951,76,-0.599086,negative


In [14]:
# Percentage share of each value in the model-derived 'Sentiment' column.
sentiment_share = data_f['Sentiment'].value_counts(normalize=True)
senti_counts = sentiment_share.mul(100)
print(senti_counts)

Sentiment
neutral     54.560637
negative    24.986845
positive    20.452518
Name: proportion, dtype: float64


In [15]:
# Percentage share of each value in the pre-existing 'Tone' column, for
# comparison with the model-derived sentiment distribution.
tone_share = data_f['Tone'].value_counts(normalize=True)
tone_counts = tone_share.mul(100)
print(tone_counts)

Tone
neutral     56.802635
negative    22.104276
positive    21.093089
Name: proportion, dtype: float64


In [16]:
# Write the scored frame to the current working directory. The row index is
# written as the first, unnamed CSV column because index=False is not passed.
data_f.to_csv('data_senti.csv')

In [17]:
# Display the final column labels, including the two added sentiment columns.
data_f.columns

Index(['Date', 'Text', 'Tone', 'X Likes', 'X reposts', 'X followers',
       'Impressions', 'Estimated reach', 'Compound_Sentiment_Score',
       'Sentiment'],
      dtype='object')