In [22]:
import pandas as pd
import re

# Load the Excel file
file_path = './RTVSlo/Podatki - PrometnoPorocilo_2022_2023_2024.xlsx'
data = pd.read_excel(file_path)

# Limit to the first 100 rows
data = data.head(10000)

# Function to remove HTML tags
def remove_html_tags(text):
    if isinstance(text, str):
        return re.sub(r'<[^>]*>', '', text)
    return text

# Process the data
processed_data = []
for index, row in data.iterrows():
    item = {}
    for column in data.columns:
        value = row[column]
        if pd.notna(value):  # Check if the cell is not empty
            item[column] = remove_html_tags(value)
    processed_data.append(item)

# Convert the processed data into a DataFrame
processed_df = pd.DataFrame(processed_data)

# Remove 'LegacyId' and 'Operater' columns if they exist
columns_to_remove = ['LegacyId', 'Operater']
processed_df = processed_df.drop(columns=[col for col in columns_to_remove if col in processed_df.columns])

# Save the processed data to a new Excel file (optional)
# processed_df.to_excel('./RTVSlo/Processed_Data_First_10000.xlsx', index=False)

processed_df.to_csv('./RTVSlo/Processed_Data_First_10000.csv', index=False)


In [23]:
# Importing pandas to check the number of rows in the file
import pandas as pd

# Load the Excel file
file_path = './RTVSlo/Podatki - PrometnoPorocilo_2022_2023_2024.xlsx'
data = pd.read_excel(file_path)

# Get the number of rows in the dataset
num_rows = data.shape[0]
num_rows


55001

In [27]:
import pandas as pd
import json

# Load the CSV file
file_path = './RTVSlo/Processed_Data_First_10000.csv'  # Adjust path if necessary
data = pd.read_csv(file_path)

# Set pandas options to prevent truncation of long text
pd.set_option('display.max_colwidth', None)  # This ensures that columns with long text are fully shown

# Initialize a list to store the processed rows
processed_data = []

# Iterate through each row in the DataFrame
for index, row in data.iterrows():
    row_dict = {}  # Dictionary to store non-NaN columns for the current row
    
    # Iterate through each column and check if it's not NaN
    for column, content in row.items():
        if pd.notna(content):  # Check if the content is not NaN
            row_dict[column] = content
    
    # Add the row dictionary to the processed data if it has any non-NaN values
    if row_dict:
        processed_data.append(row_dict)

# Save the processed data to a JSON file
json_file_path = './RTVSlo/Processed_Data_First_10000_non_nan.json'
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=4)

# Print confirmation message
print(f"Processed data has been saved to {json_file_path}")


Processed data has been saved to ./RTVSlo/Processed_Data_First_10000_non_nan.json


In [26]:

# Count the number of non-NaN values in each column
non_nan_counts = data.notna().sum()

# Print the counts for each column
for column, count in non_nan_counts.items():
    print(f"{column}: {count}  values")

Datum: 10000  values
B1: 9999  values
ContentVremeSLO: 4771  values
TitleDeloNaCestiSLO: 9781  values
ContentDeloNaCestiSLO: 8478  values
TitleOpozorilaSLO: 4011  values
ContentOpozorilaSLO: 1490  values
ContentNesreceSLO: 3028  values
A1: 686  values
ContentPomembnoSLO: 686  values
ContentOvireSLO: 5303  values
ContentMednarodneInformacijeSLO: 6110  values
ContentZastojiSLO: 3520  values
TitleOvireSLO: 765  values
TitleSplosnoSLO: 4824  values
ContentSplosnoSLO: 3007  values
TitleVremeSLO: 3608  values
TitleZastojiSLO: 520  values
A2: 1  values
C2: 162  values
TitleNesreceSLO: 604  values


In [24]:
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from striprtf.striprtf import rtf_to_text

# Function to load and read all rtf files in a directory
def load_rtf_files(directory):
    rtf_files = []
    for filename in os.listdir(directory):
        if filename.endswith(".rtf"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                rtf_content = file.read()
                text = rtf_to_text(rtf_content)  # Convert RTF to plain text
                rtf_files.append((filename, text, rtf_content))  # Store the original RTF content too
    return rtf_files

# Preprocessing function to clean and tokenize text
def preprocess_text(text):
    # Remove special characters and digits, and make all lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = text.split()
    return ' '.join(words)

# Function to compute the cosine similarity between b1_contents and RTF contents
def compute_similarity(b1_contents, rtf_files):
    # Preprocess b1_contents
    b1_contents = preprocess_text(b1_contents)
    
    # Preprocess all RTF file contents
    rtf_texts = [(filename, preprocess_text(text), rtf_content) for filename, text, rtf_content in rtf_files]
    
    # Combine the texts: one for b1_contents and others for RTF files
    all_texts = [b1_contents] + [text for _, text, _ in rtf_texts]
    
    # Use TfidfVectorizer to vectorize the text data
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    
    # Compute cosine similarity between b1_contents (first row) and all RTF file texts
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    
    # Get similarity scores for each RTF file, including the original RTF content
    similarities = [(rtf_files[i][0], cosine_similarities[0][i], rtf_files[i][2]) for i in range(len(rtf_files))]
    
    # Sort by similarity score (highest first)
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    return similarities

# Directory path where the RTF files are located
directory = '/Users/petrakuralt/Desktop/mag 2.letnik/NLP/Project/github_project/ul-fri-nlp-course-project-2024-2025-jazbeci/RTVSlo/Podatki - rtvslo.si/Promet 2022/Marec 2022'

# Example of b1_contents, replace this with actual contents you want to compare
# b1_contents = "VremePonekod po Sloveniji gosta megla v pasovih zmanjšuje vidljivost. Prilagodite hitrost in varnostno razdaljo!Omejitve za tovorna vozilaOd 30. decembra je v veljavi sprememba omejitve za tovorna vozila nad 7,5 ton. Več."
# b1_contents = "ZastojiNa gorenjski avtocesti pred predorom Karavanke proti Avstriji, 1 kilometer. Za Kranjsko Goro uporabite izvoz Jesenice vzhod, Lipce. Popolna zapora cesteDo 14. uro bo na cesti Lipnica - Kropa - Železniki zaradi prireditve zaprt odsek Lajše - Rudno. Obvoz je na relaciji Rudno - Železniki - Selca - Lajše in obratno.Mejni prehodiČakalna doba je na mejnih prehodih Starod, Obrežje, Bistrica ob Sotli, Rogatec, Dobovec, Gruškovje, Zgornji Leskovec, Zavrč, Središče ob Dravi, Ormož in Petišovci.VremePrelaz Vršič je zaprt zaradi snega.Omejitev za tovorni prometDo 22. ure velja omejitev prometa tovornih vozil, katerih največja dovoljena masa presega 7,5 tone."
b1_contents = "DelaŠtajerska avtocesta bo zaprta med priključkoma Blagovica in Vransko proti Mariboru, do nedelje do 8. ure. Obvoz bo po regionalni cesti.Na primorski avtocesti bo promet potekal po enem prometnem pasu med Vrhniko in Brezovico proti Ljubljani predvidoma do nedelje do 16. ure.Do nedelje do 19. ure bo v Družinski vasi zaprta cesta Dolenje Kronovo - Dobrava.Na ljubljanski severni obvoznici sta zaprta uvoz Nove Jarše proti Zadobrovi in servisna cesta med Bratislavsko in Leskoškovo cesto. Obvoz je urejen preko Bratislavske in Letališke ceste do vzhodne obvoznice.PrireditveV nedeljo, med 12. in 17. uro, bodo občasne popolne zapore zaradi kolesarske prireditve: Izola - Sečovlje - Dragonja - Srgaši."
rtf_files = load_rtf_files(directory)

# Compute similarity
similar_files = compute_similarity(b1_contents, rtf_files)

# Display the most similar files with their content
print("Most similar RTF files to b1_contents:")
for filename, similarity, full_rtf in similar_files:
    print(f"Filename: {filename}\nSimilarity: {similarity:.4f}")
    print("RTF File Content (First 500 characters):")
    print(full_rtf[:500])  # Displaying the first 500 characters of the RTF file
    print("\n---\n")


Most similar RTF files to b1_contents:
Filename: TMP-433.rtf
Similarity: 0.4328
RTF File Content (First 500 characters):
{\rtf1\ansi\ansicpg1252\deff0\deflang1060{\fonttbl{\f0\fswiss\fprq2\fcharset238{\*\fname Arial;}Arial CE;}}
{\colortbl ;\red0\green0\blue0;}
\viewkind4\uc1\pard\cf1\f0\fs20 Prometne informacije       15. 03. 2022        05.30               1. in 2. program \par
\par
Podatki o prometu.\par
\par
Zaradi del sta na severni ljubljanski obvoznici zaprta uvoz Nove Jar\'9ae proti Zadobrovi ter servisna cesta med Bratislavsko in Lesko\'9akovo cesto. Obvoz je preko Bratislavske in Letali\'9ake ceste do vz

---

Filename: TMP-415.rtf
Similarity: 0.4275
RTF File Content (First 500 characters):
{\rtf1\ansi\ansicpg1252\deff0\deflang1060{\fonttbl{\f0\fswiss\fprq2\fcharset238{\*\fname Arial;}Arial CE;}}
{\colortbl ;\red0\green0\blue0;}
\viewkind4\uc1\pard\cf1\f0\fs20 Prometne informacije       15. 03. 2022        15.30           1. in  2. program \par
\par
Podatki o prometu. \par
\p