## Loading data

In [None]:
import tarfile
import pandas as pd
import os

# Assumes that the samla dataset is stored in the data folder within this project.
rootdir = f'{os.getcwd()}/data/samla'

print("Current working directory:", os.getcwd())  # Debugging statement


def load_corpus_files(root):
    """Read every non-hidden file below the subfolders of ``root`` as raw bytes.

    The tree is walked exactly once, so each file is read exactly once. The
    previous nested ``os.walk`` re-visited a top-level folder whenever a nested
    folder shared its name and appended the file contents to themselves.

    Args:
        root: Directory whose subfolders hold the corpus files.

    Returns:
        dict mapping each file path to the bytes read from that file. Files
        located directly in ``root`` and files whose name starts with ``.``
        are not included.
    """
    contents = {}
    for subdir, _dirs, files in os.walk(root):
        if subdir == root:
            # Only files inside subfolders are loaded; files lying directly in
            # the root folder were never picked up and are still left out.
            continue
        for file in files:
            if file.startswith('.'):
                # Hidden files (e.g. .DS_Store) are not part of the corpus.
                continue
            filePath = os.path.join(subdir, file)
            if not os.path.exists(filePath):
                # Can happen for dangling symlinks listed by os.walk.
                print(f"File not found: {filePath}")  # Debugging statement
                continue
            # Read as bytes: the encoding differs per file type and is
            # resolved in a later step.
            with open(filePath, 'rb') as f:
                contents[filePath] = f.read()
    return contents


# Maps file path -> raw bytes for every file found below rootdir's subfolders.
samla = load_corpus_files(rootdir)

In [None]:
# Number of files that were loaded (samla is keyed by file path).
len(samla)

## Decoding unicode data

In [None]:
# One row per loaded file: its path and, initially, its raw bytes.
df = pd.DataFrame(samla.items(), columns=['File', 'Text'])


def decode_document(file_path, raw):
    """Decode the raw bytes of one file to a unicode string.

    Args:
        file_path: Path of the file; only its ``.txt`` suffix is inspected.
        raw: The file content as bytes.

    Returns:
        The decoded text. ``.txt`` files are decoded as ISO-8859-1, every
        other file as UTF-8 with undecodable bytes replaced.
    """
    if file_path.endswith(".txt"):
        return raw.decode("iso-8859-1", "strict")
    return raw.decode("utf-8", "replace")


# Replace the whole column in one assignment. Element-wise chained assignment
# (df['Text'][i] = ...) may write to a temporary copy and is ignored entirely
# when pandas Copy-on-Write is active.
df['Text'] = [decode_document(path, raw) for path, raw in zip(df['File'], df['Text'])]

# Counts of .txt files versus all other files (the XML documents).
txt = sum(path.endswith(".txt") for path in df['File'])
xml = len(df) - txt

txt, xml

In [None]:
# Show the decoded text of the row labelled 0 as a sanity check.
df.loc[0, 'Text']

## Cleaning text-data

In [None]:
import re

# Try the eventyr/sagn pattern on the first document only.
text = df.iloc[0, :]["Text"]

# Capture groups, in order: <DATO> content, <STAD> content, <TEKST> content.
# Every gap between the elements is matched lazily (.*?); the stray backtick
# that used to sit before the last '?' made that gap greedy, so the match ran
# on to the LAST <TEKST> element in the document.
pattern = r"<DATO>(.*?)<\/DATO>.*?<STAD>(.*?)<\/STAD>.*?<TEKST>(.*?)<\/TEKST>"
matches = re.findall(pattern, text, re.DOTALL)

# First (DATO, STAD, TEKST) tuple, or None when the first document is not in
# this format (instead of failing with an IndexError).
matches[0] if matches else None

In [None]:
# Eventyr/sagn documents: groups are (<DATO>, <STAD>, <TEKST>). All gaps are
# lazy; a stray backtick used to make the gap before <TEKST> greedy, which
# paired the first date/place with the last <TEKST> element of the document.
eventyr_pattern = re.compile(
    r"<DATO>(.*?)<\/DATO>.*?<STAD>(.*?)<\/STAD>.*?<TEKST>(.*?)<\/TEKST>", re.DOTALL
)
# Minneoppgaver documents: groups are (<title>, <date>, <body>).
minneoppgave_pattern = re.compile(
    r"<title>(.*?)<\/title>.*?<date>(.*?)<\/date>.*?<body>(.*?)<\/body>", re.DOTALL
)


def strip_markup(fragment, closing_tag_replacement):
    """Remove tags from ``fragment`` and collapse whitespace runs to one space.

    Args:
        fragment: Text that may still contain opening, self-closing and
            closing tags.
        closing_tag_replacement: String substituted for every closing tag.

    Returns:
        The text without tags, with each whitespace run replaced by a space.
    """
    # Attribute values are matched with [^"]* rather than a greedy .*, which
    # used to swallow the text between two attributed tags on the same line.
    without_opening = re.sub(r'<\w+(\s+\w+="[^"]*")*/?>', '', fragment)
    without_closing = re.sub(r'</\w+>', closing_tag_replacement, without_opening)
    return re.sub(r'\s+', ' ', without_closing)


rows = []
row_index = []  # position in df of every kept document
skip = 0        # number of documents matching neither format
skiplist = []   # their raw text, kept for inspection

for i, text in enumerate(df["Text"]):
    match = eventyr_pattern.search(text)
    if match is not None:
        date, place, body = match.groups()
        # Closing tags become a space for eventyr/sagn.
        body = strip_markup(body, ' ')
        rows.append([date + " " + place + " " + body, body, place, date, "Eventyr/Sagn"])
        row_index.append(i)
        continue

    # Not eventyr/sagn: either a minneoppgave or one of the meta files about
    # minneoppgaver (which are skipped).
    match = minneoppgave_pattern.search(text)
    if match is None:
        skip += 1
        skiplist.append(text)
        continue

    title, date, body = match.groups()
    # Closing tags are dropped without a separator for minneoppgaver.
    body = strip_markup(body, '')
    # For minneoppgaver the title is stored in the "Place" column. Do not
    # remove this special case unless the minneoppgaver can be parsed into
    # the same shape as the eventyr/sagn documents.
    rows.append([date + " " + title + " " + body, body, title, date, "Minneoppgave"])
    row_index.append(i)

# Build the frame once instead of growing it row by row; the index keeps the
# original position of each document in df.
transformed_df = pd.DataFrame(
    rows,
    columns=["Concatenated", "Text", "Place", "Date", "Type"],
    index=row_index,
)

print(skip)
# Displayed with a fresh 0..n-1 index; transformed_df itself keeps the df positions.
transformed_df.reset_index(drop=True)

In [None]:
## Code for inspecting what went wrong during loading - do not remove

# Commented-out helper: re-run the minneoppgave pattern on one skipped document.
#text = skiplist[3]

#pattern = r"<title>(.*?)<\/title>.*?<date>(.*?)<\/date>.*?<body>(.*?)<\/body>"
#matches = re.findall(pattern, text, re.DOTALL)

#print(matches)
# Take the row at position 500 of the transformed frame as a spot check
# (raises IndexError when the frame has fewer than 501 rows).
sample_string = transformed_df.iloc[500, :]

# Commented-out helper: the tag/whitespace clean-up steps applied by hand.
#clean_string = re.sub(r'<\w+(\s+\w+=".*")*/?>', '', sample_string)
#clean_string = re.sub(r'</\w+>', '', clean_string)
#clean_string = re.sub(r'\s+', ' ', clean_string)

#clean_string
# Print the document type, then display the cleaned text of the sampled row.
print(sample_string["Type"])
sample_string["Text"]

## Topic modelling

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Load the NbAiLab/nb-sbert-base sentence-embedding model.
sentence_model = SentenceTransformer("NbAiLab/nb-sbert-base")
# KeyBERT keyword extractor built on the same embedding model.
kw_model = KeyBERT(model=sentence_model)

In [None]:
# Cleaned text of every document, regardless of type.
tekster = transformed_df["Text"].values

# Reuse the SentenceTransformer instance loaded earlier in the notebook
# instead of passing the model name, which loads the same model once more.
topic_model = BERTopic(embedding_model=sentence_model).fit(tekster)

Visualization with all the data

In [None]:
# Visualize the documents using the topic model fitted on all texts.
topic_model.visualize_documents(tekster)

In [None]:
# Pick the "Document" value at row position 1267 of the table returned by
# get_document_info (raises IndexError when there are fewer rows).
topictext = topic_model.get_document_info(tekster).iloc[1267, :]["Document"]

Visualization with eventyr/sagn

In [None]:
# Only the eventyr/sagn documents. NOTE: this rebinds `tekster`, which held
# the texts of all documents until this cell is run.
tekster = transformed_df.loc[transformed_df["Type"] == "Eventyr/Sagn"]["Text"].values

# Reuse the SentenceTransformer instance loaded earlier in the notebook
# instead of passing the model name, which loads the same model once more.
tp_es = BERTopic(embedding_model=sentence_model).fit(tekster)

In [None]:
# Visualize the eventyr/sagn documents using the model fitted on them.
tp_es.visualize_documents(tekster)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Fetch the NLTK stop-word lists before importing the corpus reader.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Disabled experiment: re-represent the eventyr/sagn topics with Norwegian
# stop words removed and n-grams of 1 to 5 words.
#vectorizer_model = CountVectorizer(stop_words=stopwords.words("norwegian"), ngram_range=(1, 5))
#tp_es.update_topics(tekster, vectorizer_model=vectorizer_model)

In [None]:
# Show the topics of the eventyr/sagn model.
tp_es.get_topics()

In [None]:
# Visualize the eventyr/sagn documents again; the topic update above is
# commented out, so the model is the same as in the earlier plot.
tp_es.visualize_documents(tekster)

Visualization with minneoppgaver

In [None]:
# Only the minneoppgave documents.
mo_tekster = transformed_df.loc[transformed_df["Type"] == "Minneoppgave"]["Text"].values

# Reuse the SentenceTransformer instance loaded earlier in the notebook
# instead of passing the model name, which loads the same model once more.
tp_mo = BERTopic(embedding_model=sentence_model).fit(mo_tekster)

In [None]:
# Visualize the minneoppgave documents using the model fitted on them.
tp_mo.visualize_documents(mo_tekster)

Trying to update the topic representations (e.g. with n-grams) to get more informative topic labels than the default single words

In [None]:
# Recompute the minneoppgave topic representations with n-grams of 1 to 3 words.
tp_mo.update_topics(mo_tekster, n_gram_range=(1, 3))

In [None]:
# Show the minneoppgave topics after the n-gram update.
tp_mo.get_topics()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Fetch the NLTK stop-word lists before importing the corpus reader.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Count n-grams of 1 to 5 words with the NLTK Norwegian stop words removed,
# and recompute the minneoppgave topic representations with that vectorizer.
vectorizer_model = CountVectorizer(stop_words=stopwords.words("norwegian"), ngram_range=(1, 5))
tp_mo.update_topics(mo_tekster, vectorizer_model=vectorizer_model)

In [None]:
# Show the minneoppgave topics after the vectorizer update. get_topics() does
# not take the documents; `tekster` (the eventyr/sagn texts) was passed here
# by mistake.
tp_mo.get_topics()

In [None]:
# Visualize the minneoppgave documents again after the topic updates.
tp_mo.visualize_documents(mo_tekster)