# Pandoro Gate Exposed: Shedding Light on Sentiments

In [None]:
#Importing dependencies
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [None]:
#Read one CSV export per source and preview the first two rows of each
data_list=["Copertina_Espresso", "Facebook", "Instagram", "Quora", "Reddit","Twitter", "Youtube"]
dataframes=[]
for source_name in data_list:
    source_frame = pd.read_csv(f"/data/{source_name}.csv")
    dataframes.append(source_frame)
    display(source_frame.head(2))

In [None]:
#Stack the per-source frames into one table and keep only the columns used by the analysis
selected_columns = ["Source", "Username", "Title", "Text", "Date", "URL", "Post ID","Likes"]
df = pd.concat(dataframes, ignore_index=True)[selected_columns]

# EDA

In [None]:
#Preview five randomly sampled rows (no seed is given, so the rows change between runs)
display(df.sample(5))

In [None]:
#Summary of the frame: column dtypes and non-null counts
df.info()

In [None]:
#Number of missing values in each column
for column_name in df.columns:
    missing_count = df[column_name].isnull().sum()
    print(column_name, missing_count)

In [None]:
#Number of distinct values in each column
for column_name in df.columns:
    distinct_count = df[column_name].nunique()
    print(column_name, distinct_count)

In [None]:
#Hex colour used for each source when plotting the per-source counts
colours = dict(
    Instagram="#FF2084",
    Youtube="#FF0000",
    Reddit="#FF3600",
    Facebook="#006EFF",
    Quora="#A80000",
    Twitter="#08003B",
)

In [12]:
#Bar chart with the number of rows collected from each source
source_counts = df['Source'].value_counts()
#Sources that have no entry in `colours` fall back to dark grey
colors = [colours.get(source, '#333333') for source in source_counts.index]

bar_trace = go.Bar(
    x=source_counts.index,
    y=source_counts.values,
    marker_color=colors
)
fig = go.Figure(bar_trace)

layout_options = dict(
    title='Data per Source',
    xaxis_title='Source',
    yaxis_title='Count',
    template='plotly_white'
)
fig.update_layout(**layout_options)

fig.show()

# Preprocessing

In [None]:
#Reload the dataset from a CSV checkpoint, replacing the frame built in the previous section
#NOTE(review): no cell shown here writes "Data.csv" -- presumably it is an export of the concatenated frame; confirm before a fresh Restart & Run All
df=pd.read_csv("Data.csv")

In [None]:
#According to the authors, a missing "Text" comes from emojis that the scraping algorithm
#could not recognise, so those rows are dropped from the dataset
df = df.dropna(subset=["Text"])

df.info()

In [None]:
#Number of fully duplicated rows, followed by the frame's (rows, columns) shape
print(df.duplicated().sum(),df.shape)


In [None]:
#Remove fully duplicated rows. The result is rebound instead of using inplace=True: `df` is a
#filtered selection of the loaded frame at this point, and rebinding avoids mutating it in place
#while keeping the cell safe to re-run
df = df.drop_duplicates()
df.shape

In [None]:
import re
import emoji

In [None]:
#This function detects and extracts the emojis found in the "Text" column
def detect_emoji(text):
    """Return the sorted, de-duplicated names of every emoji found in ``text``.

    Parameters
    ----------
    text : str or iterable of str
        A single string, or an iterable of strings such as a pandas Series.
        Items that are not strings (for example NaN) are skipped.

    Returns
    -------
    list of str
        Emoji names produced by ``emoji.demojize(..., delimiters=("", " "))``;
        each name therefore keeps the trailing space used as closing delimiter.
        The list is also printed.
    """
    #A bare string is treated as a one-item collection so both call styles share one loop
    texts = [text] if isinstance(text, str) else text
    emoji_set = set()

    for item in texts:
        if not isinstance(item, str):
            continue
        #emoji_list() yields whole emojis, so multi-codepoint ones (skin tones, ZWJ sequences)
        #are reported as a single name instead of being split character by character
        for match in emoji.emoji_list(item):
            emoji_text = emoji.demojize(match["emoji"], delimiters=("", " "))
            emoji_set.add(emoji_text)

    #Sorted so that the output is stable between runs
    emoji_names = sorted(emoji_set)
    print(emoji_names)
    return emoji_names

detect_emoji(df["Text"])

In [None]:
#Dictionary of Italian chat abbreviations and their expansion, applied later by preprocess_text.
#Every key appears exactly once: the repeated entries of the earlier version (all of which
#carried the same value) were removed, so the mapping itself is unchanged.
abbreviations_italian = {
    "x": "per",
    "cmq": "comunque",
    "xk": "perché",
    "xké": "perché",
    "xkè": "perché",
    "xke": "perché",
    "xò": "però",
    "xo": "però",
    "ki": "chi",
    "nn": "non",
    "sl": "solo",
    "dv": "dove",
    "qnd": "quando",
    "grz": "grazie",
    "pke": "perché",
    "pké": "perché",
    "pkè": "perché",
    "anke": "anche",
    "qlc": "qualcosa",
    "qlcn": "qualcuno",
    "qlcs": "qualcosa",
    "dvt": "devi",
    "kz": "casa",
    "msg": "messaggio",
    "xche": "perché",
    "tvtb": "ti voglio tanto bene",
    "tvb": "ti voglio bene",
    "sn": "sono",
    "cm": "come",
    "pt": "più tardi",
    "tst": "testo",
    "bll": "bello",
    "brv": "bravo",
    "cos": "cosa",
    "d+": "di più",
    "+": "più",
    "dtt": "detto",
    "fll": "figlio",
    "frt": "forte",
    "ftt": "fatto",
    "grd": "guarda",
    "int": "intanto",
    "mlt": "molto",
    "mntr": "mentre",
    "pnt": "punto",
    "pr": "per",
    "prr": "parere",
    "ps": "posto",
    "pss": "passo",
    "ptt": "purtroppo",
    "qll": "quello",
    "scs": "scusa",
    "spt": "sopratutto",
    "stss": "stesso",
    "vbb": "va bene",
    "vdm": "vediamo",
    "vst": "vestito",
    "xsemp": "per sempre",
    "ok": "va bene",
    "asl": "età, sesso, località",
    "cmnq": "comunque",
    "nnn": "non",
    "cnn": "con",
    "xqst": "per questo",
    "pz": "pezzo",
    "prq": "perché",
    "tnt": "tanto",
    "cmpl": "completo",
    "app": "appena",
    "fz": "faccia",
    "gg": "giorno",
    "slv": "salva",
    "cn": "con",
    "smp": "sempre",
    "già": "già",
    "gt": "giusto",
    "prpr": "proprio",
    "tb": "ti bacio",
    "tbr": "ti bacio, ti amo",
    "tlm": "ti lascio un messaggio",
    "ttp": "ti telefono più tardi",
    "xch": "perché",
    "adf": "a domani forse",
    "adp": "a dopo",
    "cs": "cosa",
    "cvd": "ci vediamo dopo",
    "dnq": "dunque",
    "gl": "giocare",
    "mgl": "meglio",
    "np": "nessun problema",
    "omg": "Oh mio Dio",
    "pb": "probabilmente",
    "pq": "perché",
    "psr": "pensare",
    "qls": "qualsiasi",
    "qq": "qualche",
    "stf": "sul fatto",
    "tt": "tutto",
    "tvttb": "ti voglio tanto tanto bene",
    "vb": "va bene",
    "vds": "vedi",
    "wtf": "ma che cavolo",
    "xd": "per dire",
    "xm": "per me",
    "vrmt": "veramente",
    "xsm": "scusami",
    "xv": "per voi",
    "cmplmnt": "complimenti",
    "fv": "favore",
    "xfv": "per favore",
    "perfa": "per favore",
    "llm": "lasciami un messaggio",
    "mnd": "manda",
    "nt": "notte",
    "pls": "per favore",
    "pvt": "privato",
    "qdm": "quando mi",
    "rfl": "riflettere",
    "snt": "sento",
    "tnx": "grazie",
    "ztl": "zona traffico limitato",
    "snc": "se non ci",
    "svt": "salvato",
    "mtr": "meglio",
    "nmq": "non mi quitterò",
    "rqd": "richiedo",
    "slm": "salutami",
    "stfm": "sul fatto mio",
    "ztt": "zitto"
}

In [None]:
#Dictionary mapping demojized emoji names to an Italian description, applied later by preprocess_text.
#Every key appears exactly once. Where the earlier version listed a key twice with different
#values, the value that Python actually kept (the last one) is the one retained here.
#"worried_face" replaces the former "worriedface" key, which could not match the underscore-separated
#names produced by emoji.demojize.
emoji_italian = {
    "thumbs_down_medium-light_skin_tone": "emoji pollice verso il basso tonalità pelle media chiara",
    "middle_finger": "emoji dito medio",
    "partying_face": "faccina festaiola",
    "yawning_face": "faccina che sbadiglia",
    "woman_facepalming": "emoji donna che si prende la testa",
    "heart_hands_light_skin_tone": "emoji mani a cuore",
    "chains": "emoji catene",
    "thumbs_down_medium-dark_skin_tone": "emoji pollice verso il basso",
    "heart_exclamation": "emoji cuore esclamativo",
    "orange_heart": "emoji cuore arancione",
    "rolling_on_the_floor_laughing": "faccina che ride a crepapelle",
    "see-no-evil_monkey": "emoji scimmia che non vede",
    "goat": "emoji capra",
    "flexed_biceps": "emoji bicipite flesso",
    "clapper_board": "emoji ciak",
    "face_holding_back_tears": "faccina che trattiene le lacrime",
    "zipper-mouth_face": "faccina con bocca chiusa da zip",
    "face_vomiting": "faccina che vomita",
    "ewe": "emoji pecora",
    "pensive_face": "faccina pensierosa",
    "mending_heart": "emoji cuore che guarisce",
    "face_with_rolling_eyes": "faccina con occhi al cielo",
    "mechanical_arm": "emoji braccio meccanico",
    "pile_of_poo": "emoji cacca",
    "lying_face": "faccina che mente",
    "cactus": "emoji cactus",
    "face_with_steam_from_nose": "faccina che sbuffa",
    "person_facepalming_light_skin_tone": "emoji persona che si prende la testa",
    "heart_suit": "emoji seme di cuori",
    "face_with_symbols_on_mouth": "faccina arrabbiata che lancia insulti",
    "green_heart": "emoji cuore verde",
    "clown_face": "faccina di clown",
    "sun": "emoji sole",
    "face_with_hand_over_mouth": "faccina con mano sulla bocca",
    "purple_heart": "emoji cuore viola",
    "enraged_face": "faccina infuriata",
    "grinning_face_with_sweat": "faccina sorridente imbarazzata",
    "face_with_tears_of_joy": "faccina che piange dalle risate",
    "middle_finger_light_skin_tone": "emoji dito medio",
    "white_heart": "emoji cuore bianco",
    "pink_heart": "emoji cuore rosa",
    "skull": "emoji teschio",
    "face_with_crossed-out_eyes": "faccina con occhi barrati",
    "cross_mark": "emoji croce di negazione",
    "fire": "emoji fuoco",
    "Italy": "emoji Italia",
    "crying_face": "faccina piangente",
    "woman_white_hair": "emoji donna con capelli bianchi",
    "smiling_face_with_open_hands": "faccina sorridente con mani aperte",
    "face_with_raised_eyebrow": "faccina irritata con sopracciglio alzato",
    "smiling_face_with_hearts": "faccina sorridente con cuori",
    "yellow_heart": "emoji cuore giallo",
    "TOP_arrow": "emoji freccia in alto",
    "thumbs_down": "emoji pollice verso il basso",
    "loudly_crying_face": "faccina che piange forte",
    "brown_heart": "emoji cuore marrone",
    "folded_hands_medium-light_skin_tone": "emoji mani giunte",
    "folded_hands_light_skin_tone": "emoji mani giunte",
    "thumbs_up": "emoji pollice in su",
    "shushing_face": "faccina che fa shh",
    "money-mouth_face": "faccina con bocca a forma di soldi",
    "blue_heart": "emoji cuore blu",
    "nauseated_face": "faccina nauseata",
    "face_savoring_food": "faccina che gusta il cibo",
    "bottle_with_popping_cork": "emoji bottiglia con tappo che salta",
    "heart_hands_medium_skin_tone": "emoji mani a cuore",
    "person_facepalming": "emoji persona che si prende la testa",
    "heart_decoration": "emoji decorazione a cuore",
    "airplane": "emoji aereo",
    "water_closet": "emoji WC",
    "flexed_biceps_medium-light_skin_tone": "emoji bicipite flesso",
    "raising_hands": "emoji mani alzate",
    "smiling_face_with_heart-eyes": "faccina sorridente con occhi a cuore",
    "toilet": "emoji toilette",
    "face_with_open_mouth": "faccina sorpresa con bocca aperta",
    "beaming_face_with_smiling_eyes": "faccina raggiante con occhi sorridenti",
    "airplane_departure": "emoji partenza aereo",
    "black_heart": "emoji cuore nero",
    "circus_tent": "emoji tenda da circo",
    "sparkling_heart": "emoji cuore scintillante",
    "END_arrow": "emoji freccia finale",
    "clapping_hands": "emoji mani che applaudono",
    "smiling_face": "faccina sorridente",
    "broken_heart": "emoji cuore spezzato",
    "folded_hands": "emoji mani giunte",
    "star-struck": "faccina con stelle negli occhi",
    "trophy": "emoji trofeo",
    "woman_facepalming_light_skin_tone": "emoji donna che si prende la testa tonalità pelle chiara",
    "red_heart": "emoji cuore rosso",
    "worried_face": "faccina preoccupata",
    "clapping_hands_light_skin_tone": "emoji applauso",
    "clapping_hands_medium-light_skin_tone": "emoji applauso",
    "pinched_fingers_light_skin_tone": "emoji mani che fanno ma che cavolo",
    "heart_with_arrow": "emoji cuore con la freccia",
    "raised_fist_light_skin_tone": "emoji pugno alzato",
    "sweat_droplets": "emoji gocce di sudore",
    "crossed_fingers": "emoji dita incrociate",
    "raised_back_of_hand_medium-light_skin_tone": "emoji mano che fa stop",
    "drooling_face": "faccina che sbava",
    "thumbs_down_light_skin_tone": "emoji pollice verso il basso tonalità pelle chiara"
}

In [None]:
#Function that tokenizes and cleans the data
def preprocess_text(text):
    """Normalise one raw post into lowercase text made of letters, digits and single spaces.

    Steps, in order: apostrophes become spaces; emojis are replaced by their demojized
    name; each whitespace-separated token is looked up in ``emoji_italian`` and then in
    ``abbreviations_italian``; the text is lowercased; URLs, HTML tags and any character
    outside the allowed set are removed; runs of whitespace are collapsed.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    text = text.replace("'"," ")
    #Demojize before tokenizing to preserve the integrity of compound emojis
    text = emoji.demojize(text, delimiters=(" ", " "))

    processed_tokens = []
    for token in text.split():
        if token in emoji_italian:
            #Emoji names are matched as-is because some keys contain capitals (e.g. "Italy")
            processed_tokens.append(emoji_italian[token])
        else:
            #Abbreviation keys are lowercase, so compare the lowercased token:
            #"Cmq" and "NN" are expanded exactly like "cmq" and "nn"
            processed_tokens.append(abbreviations_italian.get(token.lower(), token))

    #Reconstruct the text from processed tokens
    text = ' '.join(processed_tokens).lower()
    text = re.sub(r'http\S+', '', text)  #URLs removal
    text = re.sub(r'<.*?>', '', text)  #HTML tags removal
    text = re.sub(r'[^a-z0-9\sàèìòùáéíóúâêîôûäëïöüãõñç]', ' ', text)  #Undesired character removal
    text = re.sub(r'\s+', ' ', text)  #Multiple spaces removal

    #The substitutions above can leave a space at either end
    return text.strip()

df['Cleaned text'] = df['Text'].apply(preprocess_text)

In [None]:
from transformers import pipeline
import deepl

import os

#Initialize the language detector
language_detector = pipeline('text-classification', model='papluca/xlm-roberta-base-language-detection')

#The DeepL key is read from the DEEPL_AUTH_KEY environment variable so that the real secret
#never has to be written into the notebook; the placeholder is only a fallback
auth_key = os.environ.get("DEEPL_AUTH_KEY", "your_auth_key")
translator = deepl.Translator(auth_key)

In [None]:
#This function detects the language of the input text using the language detector pipeline
def detect_language(text):
    """Return the language label predicted for ``text``.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        The label of the top prediction; ``"undefined"`` when the text is empty or
        whitespace only; ``""`` when detection raises (the error is printed).
    """
    if not text.strip():
        return "undefined"
    try:
        text = text[:512] #Keeps the input short; the language is clear from the first characters
        #A character limit does not bound the number of tokens, so the tokenizer is also asked
        #to truncate to the model input size
        lang = language_detector(text, truncation=True, max_length=512)[0]['label']
        return lang
    except Exception as e:
        print(f"Error while detecting language: {e}")
        return ""
        
#Detect the language of every cleaned text and store it in a "Language" column
language = df['Cleaned text'].apply(detect_language)
#insert() raises when the column already exists, so overwrite it when the cell is re-run
if "Language" in df.columns:
    df["Language"] = language
else:
    df.insert(3, "Language", language)

display(df.sample(5, random_state=50))

In [None]:
#Distinct values stored in the "Language" column
df["Language"].unique()

In [None]:
#Bar chart with the number of rows per detected language
language_counts = df["Language"].value_counts()

language_bars = go.Bar(
    x=language_counts.index,
    y=language_counts.values,
)
fig = go.Figure(language_bars)

layout_options = dict(
    title='Data per Language',
    xaxis_title='Language',
    yaxis_title='Count',
    template='plotly_white'
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
#Creating a function to translate the non-Italian input text into Italian
def translate_text(text, lang):
    """Translate ``text`` into Italian unless it is already Italian.

    Parameters
    ----------
    text : str
        Text to translate.
    lang : str
        Language label of ``text``; ``'it'`` means no translation is needed.

    Returns
    -------
    str
        The translated text. ``text`` is returned unchanged when ``lang`` is ``'it'``,
        when there is nothing to translate (blank or non-string input), or when the
        translation raises (the error is printed).
    """
    import textwrap

    if lang == 'it' or not isinstance(text, str) or not text.strip():
        return text

    try:
        #Split into chunks of at most 500 characters, cutting at whitespace so that no word is
        #broken in two and then glued back with a space in the middle
        segments = textwrap.wrap(text, width=500, break_on_hyphens=False)
        translated_segments = [translator.translate_text(segment, target_lang='IT').text for segment in segments]
        return ' '.join(translated_segments)
    except Exception as e:
        print(f"Error while translating text: {e}")
        return text

#Translate every row whose detected language is not Italian
def _translate_row(row):
    """Translate the cleaned text of one row according to that row's language."""
    return translate_text(row['Cleaned text'], row['Language'])

df["Cleaned text"] = df.apply(_translate_row, axis=1)

In [None]:
#Function to remove the leading "@" from a username
def remove_at(text):
    """Return ``text`` without its first character when that character is ``"@"``.

    Values that are not strings (for example NaN for a missing username) are returned
    unchanged instead of raising.
    """
    if isinstance(text, str) and text.startswith("@"):
        return text[1:]
    return text

#Apply remove_at to every value of the "Username" column
df["Username"] = df["Username"].apply(remove_at)


In [None]:
#Preview five rows; the fixed random_state makes the same rows appear on every run
display(df.sample(5, random_state=50))

In [None]:
#This function calculates the average date from a series of dates
def average_date(dates, threshold='2023-09-19'):
    """Return the mean calendar day of ``dates`` as a ``'YYYY-MM-DD'`` string.

    Parameters
    ----------
    dates : pandas.Series
        Dates in any form accepted by ``pd.to_datetime``; missing and unparseable
        values are ignored.
    threshold : str or pandas.Timestamp, optional
        Earliest date that may be returned; an earlier mean is replaced by it.

    Returns
    -------
    str or float
        The formatted date, or ``np.nan`` when no valid date is available.
    """
    parsed = pd.to_datetime(dates.dropna(), errors='coerce').dropna()
    if parsed.empty:
        #Nothing usable: either the group had no dates or none of them could be parsed
        return np.nan

    #Times of day are discarded so that the mean is computed over whole days
    parsed = parsed.dt.normalize()
    min_date = parsed.min()
    mean_offset_days = (parsed - min_date).dt.days.mean()
    mean_date = min_date + pd.Timedelta(days=mean_offset_days)

    threshold_date = pd.Timestamp(threshold)
    return max(mean_date, threshold_date).strftime('%Y-%m-%d')

#The following helpers define how the rows of one (Source, Username) pair are merged into a single row

def _first_non_null(values):
    """First non-missing value of the group, or NaN when there is none."""
    present = values.dropna()
    return np.nan if present.empty else present.iloc[0]

def _join_all(values):
    """Every non-missing value, as text, joined by '.\\n ' and closed by a final '.'."""
    present = values.dropna()
    return '.\n '.join(present.astype(str)) + ('' if present.empty else '.')

def _join_unique(values):
    """Distinct non-missing values joined by '.\\n ' and closed by a final '.'."""
    present = values.dropna()
    return '.\n '.join(present.unique()) + ('' if present.empty else '.')

def _total_likes(values):
    """Sum of the likes, counting a missing value as 0."""
    return values.fillna(0).sum()

aggregations = {
    "Language": _first_non_null,
    'Text': _join_all,
    'Cleaned text': _join_all,
    'Title': _join_unique,
    'Date': average_date,
    'URL': _join_unique,
    'Likes': _total_likes,
}

#NOTE(review): groupby leaves out rows whose Source or Username is missing (pandas default dropna=True) -- confirm this is intended
grouped = df.groupby(['Source', 'Username'])
df = grouped.agg(aggregations).reset_index()

display(df.sample(5, random_state=50))

In [None]:
#Keep only the rows that have a cleaned text
#NOTE(review): the aggregation cell builds "Cleaned text" by joining strings, so a group without text presumably holds '' rather than NaN and passes this filter -- confirm
df=df[df["Cleaned text"].notnull()]
#Export disabled because the file was already written in an earlier session. The next section
#reloads 'df_preprocessed.csv', so re-enable the line below when starting from scratch
#df.to_csv('df_preprocessed.csv', index=False) --already saved

# Model

In [None]:
#Reload the preprocessed dataset from its CSV checkpoint (the export is commented out in the previous section)
df=pd.read_csv('df_preprocessed.csv')

In [None]:
#From here on "Text" holds the cleaned text, while the original posts move to "Raw Text"
column_renames = {"Text": "Raw Text", "Cleaned text": "Text"}
df = df.rename(columns=column_renames)
display(df.sample(5))

In [None]:
#Importing dependencies
import nltk
from nltk.corpus import stopwords
from sklearn.cluster import KMeans

In [None]:
#Fetch the NLTK stopword lists and start from the Italian one
nltk.download('stopwords')
italian_stopwords = set(stopwords.words('italian'))
#Extend the default list with our own entries: prepositions, articles, conjunctions, pronouns,
#interjections, punctuation marks, and the words "emoji"/"faccina" that open the descriptions in emoji_italian.
#The set is only ever compared with single whitespace-separated words, so the multi-word entries
#(e.g. "per il") can never match; repeated entries are harmless because this is a set.
italian_stopwords.update(["a", "ad", "al", "allo", "ai", "agli", "all'","all", "alla", "alle", "con", "col", "coi", "da", "dal", "dallo", "dai", "dagli", "dall'","dall", "dalla", "dalle", "di", "del", "dello", "dei", "degli", "dell'","dell", "della", "delle", "in", "nel", "nello", "nei", "negli", "nell'", "nell", "nella", "nelle", "per", "per il", "per lo", "per i", "per gli", "per l'", "per l", "per la", "per le", "su", "sul", "sullo", "sui", "sugli", "sull'", "sull", "sulla", "sulle", "tra", "fra", "e", "o", "ed", "ma", "se", "perché", "perche", "perchè", "anche", "come", "dove", "quando", "mentre", "dopo", "prima", "poiché", "mentre", "anche", "anzi", "cioè", "cioe", "infatti", "dunque", "allora", "quindi", "perciò", "percio", "pure", "oppure", "né", "ne", "nè", "o", "che", "quanto", "quasi", "senza", "via", "verso", "il", "lo", "la", "i", "gli", "le", "l'","l", "un", "uno", "una", "alcuni", "alcuno", "qualche", "ciascuno", "nessuno", "ogni", "tutto", "tutta", "tutti", "tutte", "questo", "questa", "questi", "queste", "quello", "quella", "quelli", "quelle", "chi", "cui", "qual", "quale", "quali", "quanta", "quante", "così", "ecc", "etc", "oh", "eh", "beh", "mah", "sì", "no", "ne", "ci", "vi", "li", "si", "alcunché", ",", ".", ";", "!", "?", ":", "se", "emoji", "faccina"])

## Keyword Extraction

In [None]:
#importing dependencies
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
#Loading the pre-trained model and its tokenizer, used for keyword extraction
#NOTE(review): the names `tokenizer` and `model` are reassigned in later sections of this notebook
#(topic modeling, NER, sentiment), so this cell has to be re-run before extracting keywords again
model_name = 'mrm8488/bert-italian-finedtuned-squadv1-it-alfa'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated words whose lowercase form is in ``italian_stopwords``."""
    kept_words = []
    for word in text.split():
        if word.lower() in italian_stopwords:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

#This function tokenizes the text and encodes it to obtain the input tensors fed to the model
def get_word_embeddings(text):
    """Return the last hidden state of the model for ``text``.

    The text is truncated to 512 tokens. The result has one row per tokenizer token,
    special tokens included (``add_special_tokens=True``), not one row per
    whitespace-separated word; the batch dimension is squeezed away.
    """
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True, truncation=True, max_length=512)
    #Inference only: skip gradient tracking to save memory and time
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states.squeeze(0)

def split_text_into_segments(text, max_length=510): #max fixed to not exceed the limit of the model
    """Split ``text`` into consecutive pieces of at most ``max_length`` tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer``; each window of tokens is
    converted back to a string. Returns the list of those strings (empty for no tokens).
    """
    tokens = tokenizer.tokenize(text)
    segments = []
    for start in range(0, len(tokens), max_length):
        window = tokens[start:start + max_length]
        segments.append(tokenizer.convert_tokens_to_string(window))
    return segments

def extract_keywords(embeddings, words, num_clusters=5):
    """Pick up to ``num_clusters`` representative words by clustering their embeddings.

    Parameters
    ----------
    embeddings : torch.Tensor
        One row per item to cluster. Row ``i`` is reported as ``words[i]``, so the rows
        must be in the same order as ``words``; rows beyond ``len(words)`` are ignored.
    words : list of str
        Candidate keywords.
    num_clusters : int, optional
        Requested number of clusters; reduced to the number of words or rows when larger.

    Returns
    -------
    list of str
        For every cluster, the word whose row is closest to the cluster centre.
        Empty when there are no words.
    """
    num_clusters = min(num_clusters, len(words))
    if num_clusters == 0:
        return []

    #Convert once and reuse for both fitting and measuring distances
    features = embeddings.detach().numpy()
    #KMeans cannot build more clusters than it has rows
    num_clusters = min(num_clusters, features.shape[0])
    if num_clusters == 0:
        return []

    #Apply KMeans clustering to the embeddings and extract keywords based on the clusters.
    #n_init is set explicitly because its default depends on the scikit-learn version
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(features)
    distances = kmeans.transform(features)
    #argmin over axis 0: for each cluster, the index of the row nearest to its centre
    keywords_indices = distances.argmin(axis=0)
    keywords = [words[i] for i in keywords_indices if i < len(words)]
    return keywords

def _embed_words(words):
    """Embed a list of words: one row per word, the mean of the vectors of its sub-tokens.

    Returns ``(embeddings, kept_words)``. Words lost to the 512-token truncation are left
    out of both, so row ``i`` of ``embeddings`` always belongs to ``kept_words[i]``.
    ``embeddings`` is ``None`` when no word is kept. Relies on ``word_ids``, which
    needs a fast tokenizer.
    """
    inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", add_special_tokens=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state.squeeze(0)

    rows_by_word = {}
    for row, word_index in enumerate(inputs.word_ids(0)):
        if word_index is not None: #None marks special tokens, which belong to no word
            rows_by_word.setdefault(word_index, []).append(row)

    kept = sorted(rows_by_word)
    if not kept:
        return None, []
    embeddings = torch.stack([hidden_states[rows_by_word[index]].mean(dim=0) for index in kept])
    return embeddings, [words[index] for index in kept]

#To extract keywords from the given text; this ties together the other functions
def extract_keywords_from_text(text):
    """Return the keywords of ``text``, without repetitions and in order of first appearance.

    The text is split into segments that fit the model, stopwords are removed from each
    segment, and the remaining words are clustered on word-level embeddings. Embedding
    the words themselves (instead of the raw tokenizer tokens) keeps every embedding row
    aligned with the word it is reported as.
    """
    segments = split_text_into_segments(text)
    all_keywords = []
    for segment in segments:
        words = remove_stopwords(segment).split()
        if not words:
            continue
        embeddings, words = _embed_words(words)
        all_keywords.extend(extract_keywords(embeddings, words))

    #dict.fromkeys drops duplicates while preserving order
    return list(dict.fromkeys(all_keywords))

In [None]:
#Extract the keywords of every text and store them in a "Keywords" column
df["Keywords"]= df['Text'].apply(extract_keywords_from_text)

display(df.head(5))

In [None]:
#Preview the keywords of ten rows (fixed random_state, so always the same rows)
display(df["Keywords"].sample(10, random_state=50))
#Export disabled because the file was already written in an earlier session; the Topic Modeling
#section reloads 'df_with_keywords.csv', so re-enable the line below when starting from scratch
#df.to_csv('df_with_keywords.csv', index=False) --already saved

In [None]:
from collections import Counter

#To print the frequencies of the words in descending order
def print_word_frequencies(df, column_name):
    """Print ``word: count`` for every word in a column of word lists, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    column_name : str
        Name of a column whose values are lists of words; missing values are skipped.
        When the column does not exist an error message is printed instead.

    Raises
    ------
    TypeError
        If a value is a plain string (for example a list that was read back from CSV
        as text) rather than a list of words.
    """
    if column_name not in df.columns:
        print(f"Errore: la colonna '{column_name}' non esiste nel DataFrame.")
        return

    #Counter.update counts each list in linear time, unlike repeated list concatenation
    word_counts = Counter()
    for keywords in df[column_name].dropna():
        if isinstance(keywords, str):
            raise TypeError(f"column '{column_name}' must contain lists of words, found the string {keywords!r}")
        word_counts.update(keywords)

    for word, count in word_counts.most_common():
        print(f"{word}: {count}")

#Print how often each extracted keyword occurs across the dataset
print_word_frequencies(df,"Keywords")

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#Extra words to leave out of the word cloud. They are stored under their own name so that the
#nltk `stopwords` corpus imported earlier in the notebook is not overwritten
wordcloud_stopwords = set(['fa', 'solo', 'fatto', 'fare', 'UNK', 'unk','[UNK].', 'va', 'sempre','poi','é','mai','davvero','me', 'cosa','dire', 'due'])
all_words = [word for sublist in df['Keywords'] for word in sublist if word not in wordcloud_stopwords]

text = ' '.join(all_words)

wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=wordcloud_stopwords, max_words=200).generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


## Topic Modeling

In [None]:
#Reload the dataset with keywords from its CSV checkpoint
#NOTE(review): after a CSV round-trip the list-valued "Keywords" column is presumably read back as plain text -- confirm before reusing it as lists
df=pd.read_csv("df_with_keywords.csv")

In [None]:
#Importing dependencies
from bertopic import BERTopic
import stanza


In [None]:
#Download the Italian Stanza resources and build the pipeline stored in `nlp`
#Processors requested: tokenization, multi-word token expansion, and lemmatization
stanza.download('it')
nlp = stanza.Pipeline(lang='it', processors='tokenize,mwt,lemma')

In [None]:
#Removing stopwords from the text. Words are compared in lowercase because the stopword list is
#lowercase, while texts that went through translation may contain capital letters
text_without_stopwords = [' '.join([word for word in text.split() if word.lower() not in italian_stopwords]) for text in df["Text"]]
#insert() raises when the column already exists, so overwrite it when the cell is re-run
if "Text without stopwords" in df.columns:
    df["Text without stopwords"] = text_without_stopwords
else:
    df.insert(5, "Text without stopwords", text_without_stopwords)

In [None]:
#Thanks to Stanza, lemmatizing the text without stopwords
def _lemmatize_without_stopwords(text):
    """Remove the stopwords from ``text``, run the Stanza pipeline and join the lemmas with spaces."""
    filtered_text = ' '.join([word for word in text.split() if word not in italian_stopwords])
    document = nlp(filtered_text)
    lemmas = []
    for sentence in document.sentences:
        for word in sentence.words:
            lemmas.append(word.lemma)
    return ' '.join(lemmas)

text_lemmatized_no_stpw = [_lemmatize_without_stopwords(text) for text in df["Text"]]

df.insert(6, "Lemmatized text without stopwords", text_lemmatized_no_stpw)

In [None]:
#Preview five rows (fixed random_state, so always the same rows)
display(df.sample(5, random_state=50))
#Export disabled because the file was already written in an earlier session; a later cell
#reloads 'df_lemma_txt.csv', so re-enable the line below when starting from scratch
#df.to_csv('df_lemma_txt.csv', index=False) --already saved

Despite our efforts, the topic modeling conducted on the lemmatized text without stopwords yielded unsatisfactory results. However, we have retained the lemmatized text within the dataset for potential future applications.

```
df_topic_modeling=df[df['Lemmatized text without stopwords'].notnull()]
model = BERTopic()
topics, probabilities = model.fit_transform(df_topic_modeling['Lemmatized text without stopwords'])
topic_info=model.get_topic_info()
display(topic_info)  
```

In [None]:
#Reload the dataset with the stopword-free and lemmatized columns from its CSV checkpoint
df=pd.read_csv("df_lemma_txt.csv")

In [None]:
#Rows that have a stopword-free text are the input of topic modeling. The explicit copy makes
#this an independent frame, so adding columns to it later does not act on a slice of `df`
df_topic_modeling=df[df["Text without stopwords"].notnull()].copy()

In [None]:
#BERTopic
#NOTE(review): the model is created with default settings and no random seed, so topic ids may
#differ between runs -- confirm whether the defaults are appropriate for Italian text
model = BERTopic()
#Pass a plain list of documents: the filtered Series carries a non-contiguous index
documents = df_topic_modeling["Text without stopwords"].tolist()
topics, probabilities = model.fit_transform(documents)

In [None]:
#Attach the topic of every document. assign() builds a new frame, which avoids writing a column
#into what is a filtered selection of `df`
df_topic_modeling = df_topic_modeling.assign(Topic=topics)

In [None]:
#Summary table of the fitted topics, as returned by the topic model
topic_info=model.get_topic_info()
display(topic_info)

In [None]:
#Topic id together with what get_topic returns for it (keywords and their relevance score)
topics_data = []

for topic_number in topic_info['Topic']:
    topic_words = model.get_topic(topic_number)
    print(f"Topic {topic_number}: {topic_words}")
    #One record per topic; the list becomes a DataFrame below
    topics_data.append({'Topic': topic_number, 'Keywords': topic_words})

topics_df = pd.DataFrame(topics_data)

#Export disabled because the file was already written in an earlier session; the sentiment
#section reloads 'generated_topics.csv'
#topics_df.to_csv('generated_topics.csv', index=False) --already saved

In [None]:
#Show the topic visualisation produced by the fitted topic model
model.visualize_topics()

In [None]:
#Function that displays the topics related to a query, which we'll ask next
def display_top_topics(query, n=10):
    """Print the keywords of the ``n`` topics most similar to ``query``.

    Parameters
    ----------
    query : object
        Search term; converted to ``str`` before searching.
    n : int, optional
        Number of topics to show.
    """
    #find_topics returns two parallel lists: the topic ids and their similarity scores
    similar_topics, similarities = model.find_topics(str(query), top_n=n)
    print(f"\n\"{query}\" Related Topic:")
    for topic_id in similar_topics:
        print(f"{model.get_topic(topic_id)}")

In [None]:
#Most important keywords from the keyword extraction, used as search terms
queries = ["chiara", "ferragni", "beneficenza", "vergogna", "soldi", "truffa"]
for search_term in queries:
    display_top_topics(search_term)

In [None]:
#Ids of the topics judged unrelated to the case study; the list was compiled manually
#NOTE(review): these ids presumably refer to one specific fit of the topic model; no seed is set
#where the model is created, so re-check the list after re-fitting
unrelated_topics=[15, 43, 47, 125, 130, 290, 334, 352, 443, 489]

In [None]:
#Bring the topic of every document back into the full dataset
merge_keys = ['Source','Username', 'Date', 'Text']
df = df.merge(df_topic_modeling[merge_keys + ['Topic']], on=merge_keys, how='left')
#fillna runs on the whole frame: a missing value in ANY column becomes -1, including "Topic"
#for the rows that took no part in topic modeling
df = df.fillna(-1)

In [None]:
#Discard the rows assigned to a topic listed as unrelated, then renumber the index
is_unrelated = df['Topic'].isin(unrelated_topics)
df = df[~is_unrelated].reset_index(drop=True)

#Export disabled because the file was already written in an earlier session; the Entity
#Recognition section reloads 'df_case-related_topics.csv'
#df.to_csv('df_case-related_topics.csv', index=False) --already saved

## Entity Recognition

In [None]:
#Reload the case-related dataset from its CSV checkpoint
df=pd.read_csv("df_case-related_topics.csv")

In [None]:
#Importing dependencies
from transformers import BertTokenizerFast, BertForTokenClassification, pipeline

In [None]:
#Initialize tokenizer and model
#The model is for token classification, it uses a pre-trained Italian BERT model fine-tuned for NER
#Note: this rebinds the module-level `tokenizer` and `model`; split_text_into_segments reads the
#global `tokenizer`, so from here on it segments text with this tokenizer
tokenizer = BertTokenizerFast.from_pretrained("nickprock/bert-italian-finetuned-ner")
model = BertForTokenClassification.from_pretrained("nickprock/bert-italian-finetuned-ner")

In [None]:
#Now we can initialize the Named Entity Recognition pipeline from the model and tokenizer loaded above
#aggregation_strategy="simple" is passed so that the pipeline aggregates its token-level output
ner_pipeline = pipeline("ner", model = model, tokenizer = tokenizer, aggregation_strategy="simple")

In [None]:
#Function to perform NER
def ner(text):
    """Run the NER pipeline on ``text`` and return the entities of all its segments in one list.

    The text is first cut into segments with ``split_text_into_segments`` so that each
    call to the pipeline stays within the model input limit.
    """
    all_entities = []
    for segment in split_text_into_segments(text): #to accommodate limits
        all_entities += ner_pipeline(segment)
    return all_entities

### Named Entity Recognition (NER) is performed without removing stopwords as their removal could potentially compromise its performance.

In [None]:
#Run NER on every text and keep the list of entities in a "NER" column
df["NER"]=df["Text"].apply(ner) 

In [None]:
#Preview five randomly sampled rows
display(df.sample(5))
#Export disabled because the file was already written in an earlier session; the sentiment
#section reloads 'df_with_NER.csv'
#df.to_csv('df_with_NER.csv', index=False) --already saved

In [None]:
def _has_entities(value):
    """Tell whether one "NER" cell holds at least one entity.

    Works both for the lists produced in memory and for their text form read back from
    CSV, where an empty result is the string "[]".
    """
    if isinstance(value, str):
        return value.strip() not in ("", "[]")
    try:
        return len(value) > 0
    except TypeError:
        #Not a collection (e.g. a missing value): nothing was recognised
        return False

#Total number of rows, then the number of rows with at least one recognised entity
print(len(df["NER"]))
a = sum(_has_entities(entities) for entities in df["NER"])
print(a)

The suboptimal performance of the BERT-IT NER model can be attributed to intrinsic factors such as:

- Input text quality: The quality of the input text can significantly influence the model's performance. If the text contains many abbreviations, spelling errors, unclear sentences, or automatically generated text, the model may struggle to correctly recognize entities.

- Text length: The length of the text can affect the model's performance. If the text is too short or too long, it might be challenging for the model to accurately detect entities, especially if there are many padding tokens [PAD] in the BERT model.

- Linguistic variations: Although the model is trained on Italian, it may not cover all linguistic variations or dialects. If the text contains words or phrases in regional dialects or specialized languages, the model may not be able to recognize them as entities.

- Pre-trained NER model: The BERT model used may not have been trained on a dataset representative of all types of input text. If the text belongs to a specific domain or context not covered by the model's training data, the model may struggle to correctly recognize entities.

# CONTEXT SENSITIVE SENTIMENT ANALYSIS

### The osiria/bert-tweet-italian-uncased-sentiment model already considers context in its predictions because it is based on a version of BERT, which is sensitive to the contextual meaning of words in the text.

In [None]:
#Reload the dataset with NER results and the table of generated topics from their CSV checkpoints
#Note: `topics` previously held the output of fit_transform; it now names the topics table
df=pd.read_csv('df_with_NER.csv')
topics=pd.read_csv('generated_topics.csv')

In [None]:
#Importing dependencies
from transformers import BertTokenizerFast, BertForSequenceClassification, pipeline
import torch

In [None]:
def transform_keywords(keywords):
    """Turn the text form of a list of ``(word, score)`` pairs into the words separated by spaces.

    Example: ``"[('chiara', 0.05), ('ferragni', 0.04)]"`` becomes ``"chiara ferragni"``.
    """
    #Drop the list brackets and the quotes around the words
    stripped = keywords
    for symbol in ("[", "]", "'"):
        stripped = stripped.replace(symbol, "")

    #Each chunk is one "word, score" pair; keep only what precedes the first ", "
    words_only = []
    for chunk in stripped.split("), ("):
        fields = chunk.strip("()").split(", ")
        words_only.append(fields[0])

    return ' '.join(words_only)

In [None]:
#Replace the keyword/score pairs of every topic with the keywords alone, as one string
topics["Keywords"]=topics["Keywords"].apply(transform_keywords)

In [None]:
#Initialize tokenizer and model for sentiment classification
#Note: this rebinds the module-level `tokenizer` and `model` once more
tokenizer = BertTokenizerFast.from_pretrained("osiria/bert-tweet-italian-uncased-sentiment")
model = BertForSequenceClassification.from_pretrained("osiria/bert-tweet-italian-uncased-sentiment")
#NOTE(review): `classifier` is not used by any of the cells shown; the prediction functions call the model directly
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
#Finally, the function to predict the sentiment
def predict_sentiment_topic(topic):
    """Return ``'negative'`` or ``'positive'`` for the given topic keywords.

    The text is tokenized (truncated to 512 tokens), passed through the model without
    gradient tracking, and the class with the highest probability is returned.
    Index 0 is read as 'negative' and index 1 as 'positive'.
    """
    class_names = ['negative', 'positive']
    encoded = tokenizer(topic, return_tensors="pt", truncation=True, padding=True, max_length=512) #always for the input limit

    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return class_names[probabilities.argmax()]

In [None]:
#Predict a sentiment for every topic from its keywords
topics["Topic Sentiment"]=topics["Keywords"].apply(predict_sentiment_topic)

In [None]:
#Attach the sentiment of its topic to every row; the left join keeps rows whose topic has no match
df = pd.merge(df, topics[['Topic', 'Topic Sentiment']], on=['Topic'], how='left')

In [None]:
#This function predicts the sentiment of a text and adjusts it using the title and the topic sentiment
def predict_sentiment_with_context(text, title, topic_sentiment):
    """Predict the sentiment of ``text``, corrected by the context given by its title.

    Parameters
    ----------
    text : str
        Text to classify (truncated to 512 tokens).
    title : str
        Title associated with the text. Anything that is not a string is treated as an
        empty title.
    topic_sentiment : str
        Sentiment previously predicted for the topic of the text.

    Returns
    -------
    tuple
        ``(predicted_class, probability_dict)``: the final label, ``'negative'`` or
        ``'positive'``, and the model probabilities of both classes as plain floats.
        The probabilities are those of the model, before any context correction.
    """
    import re

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    class_names = ['negative', 'positive']
    predicted_class = class_names[predictions.argmax()]
    probabilities = predictions.numpy().flatten()
    probability_dict = {class_names[i]: float(probabilities[i]) for i in range(len(class_names))}

    title_text = title.lower() if isinstance(title, str) else ''

    def mentions(words):
        #A term must start at a word boundary, so "cesso" no longer matches inside
        #"successo" or "processo", while inflected forms such as "truffati" still match
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words) + ')'
        return re.search(pattern, title_text) is not None

    #This part is to specify the context in which we are; apologies take precedence
    if mentions(['scusa', 'chiede scusa', 'chiede perdono', 'perdono']):
        context = 'neutral or positive'
    elif mentions(['scandalo', 'truffa', 'condanna', 'espresso', 'cesso', 'brutta', 'oscuro', 'multa', 'multata', 'maximulta']):
        context = 'negative'
    else:
        context = 'neutral or positive'

    #Rule of the context-based correction: a positive prediction on a positive topic
    #is turned negative when the title sets a negative context
    if topic_sentiment == 'positive' and predicted_class == 'positive':
        if context == 'negative':
            predicted_class = 'negative'
    return predicted_class, probability_dict

In [None]:
#Score every row. The function returns (label, probabilities): they go into two separate columns
#so that "Sentiment" holds the plain 'positive'/'negative' label that the following cells count and relabel
sentiment_results = df.apply(lambda row: predict_sentiment_with_context(row['Text'], row['Title'], row['Topic Sentiment']), axis=1)
df['Sentiment'] = [label for label, _ in sentiment_results]
df['Sentiment probabilities'] = [probabilities for _, probabilities in sentiment_results]
#Export disabled because the file was already written in an earlier session; the next section
#reloads 'df_with_sentiment.csv'
#df.to_csv('df_with_sentiment.csv', index=False) --already saved

In [None]:
#Preview five randomly sampled rows
display(df.sample(5))

## Study of sentiments

In [None]:
#Reload the dataset with sentiment from its CSV checkpoint
df=pd.read_csv('df_with_sentiment.csv')

In [None]:
#Group data by 'Source' and 'Sentiment', and count the number of posts
sentiment_by_source = df.groupby('Source')['Sentiment'].value_counts().unstack()

#Plotting the results. The axes are created first and handed to pandas: without `ax`,
#DataFrame.plot opens a figure of its own and the 14x7 one stays empty
fig, ax = plt.subplots(figsize=(14, 7))
sentiment_by_source.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Sentiment by Source')
ax.set_xlabel('Source')
ax.set_ylabel('Number of Comments')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Sentiment')
plt.show()


In [None]:
#Change sentiment labels
df['Sentiment 2'] = df['Sentiment'].replace({'positive': 'with Chiara Ferragni', 'negative': 'against Chiara Ferragni'})

#Keep the rows dated 2024-03-10 or later. Dates are parsed before comparing, so the filter does
#not depend on how they are written; values that cannot be parsed are left out
first_day = pd.Timestamp("2024-03-10")
is_recent = pd.to_datetime(df["Date"], errors="coerce") >= first_day

#Group data by 'Source' and 'Sentiment 2', and count the number of posts
sentiment_by_source= df[is_recent].groupby('Source')['Sentiment 2'].value_counts().unstack()

#The axes are created first and handed to pandas: without `ax`, DataFrame.plot opens a
#figure of its own and the 14x7 one stays empty
fig, ax = plt.subplots(figsize=(14, 7))
sentiment_by_source.plot(kind='bar', stacked=True, ax=ax)
ax.set_title("Sentiment after Espresso, Ferragni S.p.a.")
ax.set_xlabel('Social Media')
ax.set_ylabel('Number of Comments')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Sentiment')
plt.show()