This file performs TextBlob sentiment analysis on the tweets; it was run in Google Colab.

All tweets were stored as Parquet files on Google Drive (https://drive.google.com/drive/folders/18R9D_GcczG0cFTXnDdo-Ejy2ZuyhobKA?usp=sharing)

Files from the original dataset were un-gzipped, and the CSV files were read and then converted to Parquet files for better performance.
I processed the data in bulks of 10 dataframes to maintain good performance and avoid overloading the memory.

Neutral values were skipped (the code drops rows whose polarity or subjectivity is 0), because at this early stage they served no purpose. There were plenty of them; I suspect this is due to the characteristics of the TextBlob model.

In [None]:
import zipfile
import os
import gzip
import shutil
import pandas as pd
import ast
import numpy as np
import seaborn as sns
import json
import matplotlib.pyplot as plt
import textblob
from textblob import TextBlob
import nltk
import re
from tqdm import tqdm

# Fetch every NLTK resource requested by this notebook, in a fixed order.
for _resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'brown',
    'wordnet',
    'movie_reviews',
    'stopwords',
):
    nltk.download(_resource)

In [None]:
# Compiled once at import time; the function is called once per tweet.
_NOISE_PATTERN = re.compile(
    r'#\S+'                    # hashtags: '#' followed by non-whitespace
    r'|http\S+'                # links starting with http / https
    r'|www\.\S+'               # links starting with a literal "www."
    '|['                       # runs of emoji / pictographic symbols:
    '\U0001F600-\U0001F64F'    # emoticons
    '\U0001F300-\U0001F5FF'    # symbols & pictographs
    '\U0001F680-\U0001F6FF'    # transport & map symbols
    '\U0001F1E0-\U0001F1FF'    # regional indicators (flags)
    '\U0001F900-\U0001F9FF'    # supplemental symbols & pictographs
    '\U0001F170-\U0001F251'    # enclosed alphanumeric / ideographic supplement
    '\u2600-\u26FF'            # miscellaneous symbols
    '\u2702-\u27B0'            # dingbats
    '\u2B00-\u2BFF'            # miscellaneous symbols and arrows
    '\u24C2'                   # circled M
    '\uFE0F'                   # emoji variation selector
    ']+'
)


def remove_hashtags_links_and_emojis(text):
    """Return *text* with hashtags, links and emoji removed.

    Each match is replaced by an empty string; surrounding whitespace is
    left untouched, so removing a token between two spaces leaves both
    spaces in place.

    Compared with the earlier pattern:

    * the dot in ``www.`` is matched literally, so ordinary words that
      merely contain "www" (e.g. "awww so cute") are no longer eaten;
    * the emoji class lists explicit symbol blocks instead of one range
      from U+24C2 to U+1F251, which also covered CJK, Hangul and other
      regular text;
    * the supplemental pictograph block (U+1F900-U+1F9FF) is included.

    The emoji list is curated, not exhaustive.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _NOISE_PATTERN.sub('', text)

In [None]:
# Folder holding the per-file parquet datasets.
directory_to_dataset = './drive/MyDrive/Sentimental_Analysis/Ukraine_War_Data/UkraineWar'

# Names of all parquet files in that folder, in sorted order.
list_of_data_sets = sorted(
    entry
    for entry in os.listdir(directory_to_dataset)
    if entry.endswith('.parquet')
)

# Total number of datasets; used for progress reporting further down.
n = len(list_of_data_sets)
print(list_of_data_sets)
print(n)

In [None]:
# Score the tweets with TextBlob in bulks of `k` parquet files and write one
# result file per bulk, so only one bulk is held in memory at a time.
df = pd.DataFrame()
tqdm.pandas()  # registers `progress_apply` on pandas objects for cells that use it

k = 10  # number of datasets in one bulk
pending_frames = []  # cleaned frames collected since the last bulk was written

for i, dataset in enumerate(list_of_data_sets, start=1):
    data_set_dir = os.path.join(directory_to_dataset, dataset)

    # Choosing only what we need from the dataset: English tweets with both
    # a text and a timestamp.
    df_to_add = pd.read_parquet(data_set_dir, columns=["text", "extractedts", "language"])
    df_to_add = df_to_add.loc[df_to_add["language"] == "en", ["text", "extractedts"]].dropna()
    # Non-mutating assignment: avoids writing into a filtered slice.
    df_to_add = df_to_add.assign(
        text=df_to_add["text"].map(remove_hashtags_links_and_emojis)
    )

    # Collect now, concatenate once per bulk; concatenating inside the loop
    # re-copies the growing frame for every file.
    pending_frames.append(df_to_add)
    print(f"Read {i} of {n} datasets, {round(i/n*100, 2)}%")

    if i % k == 0 or i == n:
        df = pd.concat(pending_frames)
        pending_frames = []

        # Run TextBlob once per tweet and read both scores from the same
        # result instead of analysing every text twice.
        sentiments = [TextBlob(text).sentiment for text in tqdm(df["text"])]
        df["polarity"] = [s.polarity for s in sentiments]
        df["subjectivity"] = [s.subjectivity for s in sentiments]

        # remove neutral values
        df = df[(df["polarity"] != 0) & (df["subjectivity"] != 0)]
        df = df[["polarity", "subjectivity", "extractedts"]]

        df.to_parquet(f"./drive/MyDrive/Sentimental_Analysis/biggerdata_processed_{i}_iteration.parquet")
        # Clear the dataframe so the finished bulk can be garbage-collected
        df = pd.DataFrame()
        print(f"Processed {i} of {n} datasets")

print("DONEDONEDONEDONEDONE")
