## Imports

In [1]:
# general libraries
import os
import pathlib
import posixpath
import json
import re

In [None]:
# !pip install bertopic 
# in powershell under book env for local (no need to install again)

In [None]:
# BERTopic libraries
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

#in powershell, pip install tf-keras (no need to install again)




## Data loading

In [4]:
# import and Create new folder
import os
# from google.colab import files (i am using local computing)
import shutil

# new_folder = "Scenario_1" 

# if os.path.isdir(new_folder):
#   shutil.rmtree(new_folder)

# os.mkdir(new_folder)

# Upload files (I am running locally, so the code below is unnecessary)
# uploaded = files.upload()
# for filename in uploaded.keys():
#   dst_path = os.path.join(new_folder, filename)
#   print(f'move {filename} to {dst_path}')
#   shutil.move(filename, dst_path)

In [5]:
def divide_year_month(data_path, month):
    """Return the sorted Senate transcript file names in ``data_path`` for one month.

    Parameters
    ----------
    data_path : str or path-like
        Directory containing the ``ParlaMint-IT_YYYY-MM-DD-LEG<n>-Senato-sed-<n>.txt`` files.
    month : str
        Two-digit month, e.g. ``"01"``. It is compared as a string with the
        month captured from the file name.

    Returns
    -------
    list of str
        Matching file names (not full paths) in sorted order. Names that do
        not follow the pattern are ignored.
    """
    pattern = re.compile(r"ParlaMint-IT_\d{4}-(0[1-9]|1[0-2])-\d{2}-LEG\d+-Senato-sed-\d+\.txt")
    month_list = []
    for file in sorted(os.listdir(data_path)):
        # fullmatch (not match) so that names with extra trailing text such as
        # "....txt.bak" are not mistaken for transcripts.
        res = pattern.fullmatch(file)
        if res is not None and res.group(1) == month:
            month_list.append(file)

    return month_list

def divide_year_quarters(data_path):
    """Split the transcript file names in ``data_path`` into calendar quarters.

    Parameters
    ----------
    data_path : str or path-like
        Directory containing ``ParlaMint-IT_YYYY-MM-DD-...`` ``.txt`` files.

    Returns
    -------
    tuple of four lists of str
        ``(Q1, Q2, Q3, Q4)``; each list holds the sorted file names whose month
        falls in Jan-Mar, Apr-Jun, Jul-Sep and Oct-Dec respectively. Files whose
        name does not follow the pattern are skipped rather than raising an
        error or being placed in Q4.
    """
    pattern = re.compile(r"ParlaMint-IT_\d{4}-(0[1-9]|1[0-2])-\d{2}-[A-Z]{3,}\d{1,}-[A-Za-z]+-[a-z]{3}-\d{1,}\.txt")
    quarters = ([], [], [], [])
    for file in sorted(os.listdir(data_path)):
        res = pattern.fullmatch(file)
        if res is None:
            # Unrelated file (hidden file, README, backup, ...): ignore it.
            continue
        # Months 1-3 -> index 0, 4-6 -> 1, 7-9 -> 2, 10-12 -> 3.
        quarters[(int(res.group(1)) - 1) // 3].append(file)

    Q1, Q2, Q3, Q4 = quarters
    return Q1, Q2, Q3, Q4

def read_data(path, month):
    """Collect every raw line of the transcript files for one month.

    The file names come from ``divide_year_month(path, month)``. Each file is
    opened as UTF-8 and its lines are added in file order.

    Returns a flat list of strings, one per line. Lines are kept verbatim:
    nothing is stripped here, so trailing newlines and blank lines remain.
    """
    documents = []
    for file_name in divide_year_month(path, month):
        with open(posixpath.join(path, file_name), 'r', encoding='utf-8') as handle:
            # Iterating the file object yields one line at a time.
            documents.extend(handle)
    return documents

In [6]:
# Directory holding the translated 2021 transcripts, and the month to analyse.
path = './Scenario_1/Italy_Translated_2021/'
MONTH = "01"  # single source of truth so both calls below always agree

# File names for the selected month, and every raw line read from those files.
data = divide_year_month(path, MONTH)
lines = read_data(path, MONTH)

In [None]:
data 
# Sanity check: show the file names that matched the selected month

['ParlaMint-IT_2021-01-12-LEG18-Senato-sed-290.txt',
 'ParlaMint-IT_2021-01-13-LEG18-Senato-sed-291.txt',
 'ParlaMint-IT_2021-01-14-LEG18-Senato-sed-292.txt',
 'ParlaMint-IT_2021-01-19-LEG18-Senato-sed-293.txt',
 'ParlaMint-IT_2021-01-20-LEG18-Senato-sed-294.txt',
 'ParlaMint-IT_2021-01-26-LEG18-Senato-sed-295.txt',
 'ParlaMint-IT_2021-01-27-LEG18-Senato-sed-296.txt']

In [None]:
lines
# Check the content read from the files (raw lines, newlines still attached)

['The sitting was opened at 4.33 p.m. The Minutes were read. The minutes of the sitting of 30 December 2020 shall be read. TOSATO, Secretary, read the Minutes of the sitting of 30 December 2020. THE PRESIDENT. - As there are no observations, the Minutes are approved. In the absence of any comments, the Minutes shall be approved.\n',
 "The list of Senators on leave and absenteeism received by the Senate, as well as further communications to the Assembly, will be published in Annex B to the minutes of today's sitting.\n",
 'I would like to inform the House that at the beginning of the sitting the Chairman of the Five Star Movement sent a request for an electronic vote under Rule 113 (2) for all the votes to be taken during the sitting.\n',
 '\n',
 '                  \n',
 'Honourable senators, I would like to express my heartfelt thoughts of closeness and condolences to the family, colleagues and friends of Senator Emilia Grazia De Biasi. A woman of brilliant intelligence and inexhaustib

## Process Data

In [9]:
# Step 1. Strip newline characters from both ends of every line.
processed_lines = [line.strip("\n") for line in lines]

# Step 2. Drop lines with fewer than MIN_WORDS whitespace-separated words
# (this also removes blank and whitespace-only lines).
MIN_WORDS = 15
processed_lines_test = [
    line for line in processed_lines if len(line.split()) >= MIN_WORDS
]
len(processed_lines_test)

1547

In [13]:
# Inspect the filtered lines (this displays the whole list; slice it for a shorter preview)
processed_lines_test

['The sitting was opened at 4.33 p.m. The Minutes were read. The minutes of the sitting of 30 December 2020 shall be read. TOSATO, Secretary, read the Minutes of the sitting of 30 December 2020. THE PRESIDENT. - As there are no observations, the Minutes are approved. In the absence of any comments, the Minutes shall be approved.',
 "The list of Senators on leave and absenteeism received by the Senate, as well as further communications to the Assembly, will be published in Annex B to the minutes of today's sitting.",
 'I would like to inform the House that at the beginning of the sitting the Chairman of the Five Star Movement sent a request for an electronic vote under Rule 113 (2) for all the votes to be taken during the sitting.',
 'Honourable senators, I would like to express my heartfelt thoughts of closeness and condolences to the family, colleagues and friends of Senator Emilia Grazia De Biasi. A woman of brilliant intelligence and inexhaustible energy, Emilia De Biasi has been a 

## BERTopic

In [14]:
# Fixed seed: UMAP is stochastic, so without it the clusters (and therefore the
# topics) change on every run and the notebook is not reproducible.
SEED = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=SEED)
# Step 3 - Cluster reduced embeddings
# min_cluster_size is the effective minimum topic size for this model.
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together.
# NOTE: `language` and `min_topic_size` are intentionally not passed. Per the
# BERTopic documentation they are ignored when `embedding_model` and
# `hdbscan_model` are supplied, so the previous `language="multilingual"` and
# `min_topic_size=20` had no effect. To change the embedding language or the
# minimum topic size, edit `embedding_model` / `min_cluster_size` above.
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

In [15]:
# Fit the topic model on the filtered lines. Per the BERTopic docs, fit_transform
# returns one topic id per input document plus the associated probabilities.
topics, probs = topic_model.fit_transform(processed_lines_test)

In [16]:
# Summary table: one row per topic with its count, name, keywords and representative docs
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,179,-1_senate_senator_committee_parliament,"[senate, senator, committee, parliament, presi...",[PRESIDENT. - The next item is the report (Doc...
1,0,1073,0_political_parliament_politics_minister,"[political, parliament, politics, minister, re...",[It is true that Italy's role has also helped ...
2,1,69,1_politician_political_politics_napolitano,"[politician, political, politics, napolitano, ...","[GRASSI (L-SP-PSd'Az). - (IT) Mr President, la..."
3,2,67,2_committee_representative_senate_proposal,"[committee, representative, senate, proposal, ...","[We shall now turn to Amendment No 1-quater.1,..."
4,3,50,3_parliament_legislative_representatives_amend...,"[parliament, legislative, representatives, ame...","[VITALI (FIBP-UDC). Mr. President, ladies and ..."
5,4,41,4_parliamentary_senate_delegation_council,"[parliamentary, senate, delegation, council, p...",[The sitting was opened at 4.33 p.m. The Minut...
6,5,29,5_amendments_directive_parliament_policy,"[amendments, directive, parliament, policy, co...",[I would like to ask the Commissioner whether ...
7,6,22,6_ratification_colleagues_perpetuates_culture,"[ratification, colleagues, perpetuates, cultur...","[Then, to talk about the revolution of the 5 S..."
8,7,17,7_harassment_discrimination_ratification_black...,"[harassment, discrimination, ratification, bla...","[ROSSOMANDO (PD). - (PT) Mr President, ladies ..."


In [17]:
# Representative documents for each topic, returned as a dict keyed by topic id
topic_model.get_representative_docs()

{-1: ["PRESIDENT. - The next item is the report (Doc. Mr President, Emilia De Biasi was a cultured woman of great intellectual depth. I remember one evening, after working very hard in the Chamber, she dragged me across Rome to a theatre experiment. I remember one evening, after working very hard in the auditorium, she dragged me across Rome to see a theater experiment. It was pouring rain, but she stubbornly dragged me there. It was pouring rain, but she stubbornly dragged me there. We talked about books, movies, theater and travel. This passionate curiosity for life moved Emilia De Biasi into her political commitment. This passionate curiosity about life moved Emilia De Biasi into her political commitment. At times, she expressed unease for our political communities - who doesn't have difficult moments?At times, he expressed discomfort with our political communities - who doesn't have moments of difficulty? - but he always went ahead. She was always purposeful and confident in wantin

In [18]:
# Plot the fitted topics (BERTopic's intertopic distance map)
topic_model.visualize_topics()