### Noah Oriano, noaori-4@student.ltu.se, 040114-T130 ###
### D7041E Lab 3 ###

Below are general function definitions and global variables related to the gathering of the data

Task 1.1 Import datasets into Jupyter environment

In [3]:
import numpy as np
from matplotlib import pyplot as plt
import glob
import os
import unidecode
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from nltk.util import ngrams
import math
# I have manually downloaded and stored the 21 datasets (30k size) in the news_datasets folder.
# I will load these datasets, preprocess them and store them as numpy files.
# I will store these numpy variables in seperate files. This is just to make this code more flexible if I want to use larger datasets.

# First, I will define some funtions for gathering the data
base_folder = 'news_datasets_30k'
base_path = 'news_datasets_30k/'
processed_path = 'processed_datasets_30k/'
reload_data = True
sample_size = 1000
# Use n=3 (tri-grams). Use length of HD vectors d1=100 and d2=1000.
symbol_size = 3
d1 = 100
d2 = 1000
alphabet = 'abcdefghijklmnopqrstuvwxyz ' # 27 characters
# The name of the dataset is the dataset folder's first word in its name
def get_dataset_name(dataset_folder):
    return dataset_folder.split('_')[0]
# Returns the dataset folder path without base path, base path must be included individually
def get_dataset_folder_paths():
    paths = glob.glob(base_path + '*/')
    paths = [path.replace(base_folder + "\\", '') for path in paths]
    return paths

def process_data(data):
    # Decode the data to remove accents and other special characters and convert them to their closest latin
    data = unidecode.unidecode(data)
    # Randomize order of sentences
    data = data.split('\n')
    np.random.shuffle(data)
    data = '\n'.join(data)
    # Convert the data so that it only contains values in the alphabet
    data = data.lower()
    data = ''.join(e for e in data if e in alphabet)
    # When we have multiple spaces, we will replace them with a single space
    data = ' '.join(data.split())
    # Convert data into numpy array
    data = np.array(list(data))
    return data

# Returns data as a dictionary with dataset names as keys and their data as values
def get_data_from_folder(folder_path):
    paths = glob.glob(base_path + folder_path + '/*.txt') # Only text files, not the sql files
    language_data = {}
    for path in paths:
        with open(path, 'r', encoding = "utf-8") as file:
            name = path.split('-')[-1].split('.')[0] # Example : pol_news_2023_30K\pol_news_2023_30K-inv_so.txt -> inv_so
            # Read data in UTF-8 format
            temp_data = file.read()
            language_data[name] = process_data(temp_data)
    return language_data

# Save the data as numpy files
def save_data_as_numpy(language_data, folder_path):
    # Check if the folder exists, if not create it
    if not os.path.exists(processed_path + folder_path):
        os.makedirs(processed_path + folder_path)
    for key in language_data.keys():
        np.save(processed_path + folder_path + key, language_data[key])
    


The below code uses the above code and is just for preprocessing data into the folders. 
This is seperate so it does not have to be rerun with following code that handles more complex functionality. The cobined data variable can also just be used in later code with the current setup since the data is not memory intensive.

In [4]:
if(reload_data):
    # First, get the dataset folder paths
    dataset_folder_paths = get_dataset_folder_paths()
    print('The dataset folder paths are :', dataset_folder_paths)
    # Get the data from the folders
    combined_data = {}
    for folder_path in dataset_folder_paths:
        language_data = get_data_from_folder(folder_path)
        classifier_name = get_dataset_name(folder_path)
        combined_data[classifier_name] = language_data
        save_data_as_numpy(language_data, folder_path)
    print('All data is saved as numpy files in the processed_datasets folder.')
    print('The data is ready to be used in the next step.')

    # Save the combined data as a numpy file
    np.save('combined_data', combined_data)
    print('The combined data is saved as a numpy file.')
    # Data will be stored as a dictionary within a dictionary. 
    # The first dictionary is classifier names and the second dictionary is the dataset names.
else:
    combined_data = np.load('combined_data.npy', allow_pickle=True).item()

The dataset folder paths are : ['bul_news_2022_30K\\', 'ces_news_2023_30K\\', 'dan_news_2022_30K\\', 'deu_news_2023_30K\\', 'ell_news_2023_30K\\', 'eng_news_2023_30K\\', 'est_news_2022_30K\\', 'fin_news_2022_30K\\', 'fra_news_2023_30K\\', 'hun_news_2023_30K\\', 'ita_news_2023_30K\\', 'lav_news_2020_30K\\', 'lit_news_2020_30K\\', 'nld_news_2023_30K\\', 'pol_news_2023_30K\\', 'por_news_2023_30K\\', 'ron_news_2022_30K\\', 'slk_news_2020_30K\\', 'slv_news_2020_30K\\', 'spa_news_2023_30K\\', 'swe_news_2023_30K\\']
All data is saved as numpy files in the processed_datasets folder.
The data is ready to be used in the next step.
The combined data is saved as a numpy file.


In [5]:
print(combined_data.keys())
print(combined_data['pol'].keys())
print(combined_data['pol']['sentences'].shape)
print(combined_data['pol']['sentences'][2])

dict_keys(['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hun', 'ita', 'lav', 'lit', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'])
dict_keys(['co_n', 'co_s', 'inv_so', 'inv_w', 'meta', 'sentences', 'sources', 'words'])
(2713780,)
i


Seperate the data into samples

In [6]:
# Rather than using sentences, the classifier will use "symbols"
number_of_languages = len(combined_data.keys())
initial_data_sets = []
for key in combined_data.keys():
    data_to_add = combined_data[key]['sentences']
    # Convert the data into n-gram symbols
    data_to_add = [data_to_add[(i*symbol_size):((i+1)*symbol_size)] for i in range(len(data_to_add)//symbol_size)]
    if(key == 'pol'):
        print(data_to_add[0])
    initial_data_sets.append((data_to_add, key))


['d' 'z' 'i']


In [7]:
print(initial_data_sets[0][0][0])

# Now, to get the paired data we need to convert the data into strings instead of nparrays
paired_data_sets = []
for data, key in initial_data_sets:
    data = [''.join(data[i]) for i in range(len(data))]
    paired_data_sets.append((data, key))
print(paired_data_sets[0][0][0])
print(paired_data_sets[0][1])
    

['l' 'i' 'p']
lip
bul


In [8]:
print(len(paired_data_sets))
print(len(paired_data_sets[0]))
print(len(paired_data_sets[0][0]))
print(len(paired_data_sets[0][0][0]))
print(paired_data_sets[0][1])
print(paired_data_sets[0][0][0:5])
print(paired_data_sets[0][0][1])

21
2
1094079
3
bul
['lip', 'sa ', 'na ', 'leg', 'la ']
sa 


Below are the definitions for functions and test data segregation

In [None]:
def norm(v):
    value = np.sqrt((np.dot(v, v)) + 0.001)
    return value if value else 0.001

# Generate high-dimensional vectors
def generate_hd_vector(d):
    return np.random.choice([-1, 1], size=d)

# Encode n_gram using character mappings
def encode_n_gram(n_gram, char_to_hd, dim):
    default_vector = generate_hd_vector(dim)  
    return np.sum([char_to_hd.get(char, default_vector) for char in n_gram], axis=0)

# Compute language centroid
def compute_language_centroid(ngram_vectors):
    centroid = np.sum(ngram_vectors, axis=0)
    return centroid / (norm(centroid) + 0.01)

# Check the cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))

# Classify a query vector against centroids
def classify_text(query_vector, centroids):
    similarities = {language: cosine_similarity(query_vector, centroid) for language, centroid in centroids.items()}
    return max(similarities, key=similarities.get)

# Generate encoded n-grams for a text
def generate_encoded_ngrams(sample, n, d, char_to_hd):
    ngram_vectors = []
    for n_gram in sample:
        if(len(n_gram) < n):
            print('Skipping ngram :', n_gram)
            continue
        encoded = encode_n_gram(n_gram, char_to_hd, d)
        ngram_vectors.append(encoded)
    return ngram_vectors



## Question: what will be the size of the n-gram input vector in conventional (local) representation? ##

The N-gram statisics woud have a size equal to A^n, where n is the number of characters in a symbol and A is the size of the alphabet. In this model, I am using tri-grams with A = 27, as there are 26 letters plus the space character. This gives 27^3 = 19683 statistics.

The HD vector generated that is associated with each symbol is of size equal to the given dimension, in this case 1000.

## Question: Identify difficulties of working with conventional representations of n-grams in the machine learning context.

Data is sparse, some symbols appear much more often. On top of this, the feature space grows exponentially with the size of the alphabet and n. This makes n-grams somewhat un-effecient in terms of memory or computation. N-grams do not handle contextual information, since they have a small window. The cost of computation also grows quickly with dimensionality as well, and a lot of data is needed for effecient handling of larger paramaters (such as larger dimensionality and N).


In [10]:
# Split the paired data sample symbols into groups of 1000
new_data_pairing = []
sample_size = 1000
index = 0
for symbols, key in paired_data_sets:
    language_new_pairings = []
    for i in range(0, len(symbols), sample_size):
        language_new_pairings.append((symbols[i:i+sample_size]))
        paired_data_sets
    # For replace the symbols with their symbol label pairs
    language_new_pairings = [[(symbol, key) for symbol in sample] for sample in language_new_pairings]
    new_data_pairing.append(language_new_pairings)
    index += 1

# Each item is now combined, now we can combine all of the languages
combined_items = []
for language in new_data_pairing:
    for sample in language:
        combined_items.append(sample)
print(len(combined_items))
print(len(combined_items[0]))
print(len(combined_items[0][0]))
print(len(combined_items[0][0][0]))
print(len(combined_items[0][0][1]))

# Now, we can create the combined samples and labels
all_samples = []
all_labels = []
for sample in combined_items:
    for symbol, label in sample:
        all_samples.append(symbol)
        all_labels.append(label)
        

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_samples, all_labels, test_size=0.2, random_state=42)

# Sort the training data into bins based on language
training_data_by_language = {}
for(sample, label) in zip(X_train, y_train):
    if(label not in training_data_by_language):
        training_data_by_language[label] = []
    training_data_by_language[label].append(sample)

testing_data_by_language = {}
for(sample, label) in zip(X_test, y_test):
    if(label not in testing_data_by_language):
        testing_data_by_language[label] = []
    testing_data_by_language[label].append(sample)


22510
1000
2
3
3


In [11]:
# Split the training data into samples of size 1000
new_training_data = {}
for key in training_data_by_language.keys():
    for i in range(0, len(training_data_by_language[key]), sample_size):
        if(key not in new_training_data):
            new_training_data[key] = []
        new_training_data[key].append((training_data_by_language[key][i:i+sample_size]))
new_testing_data = {}
for key in testing_data_by_language.keys():
    for i in range(0, len(testing_data_by_language[key]), sample_size):
        if(key not in new_testing_data):
            new_testing_data[key] = []
        new_testing_data[key].append((testing_data_by_language[key][i:i+sample_size]))

print(len(new_testing_data))
print(len(new_testing_data["slv"]))
print(len(new_testing_data["slv"][0]))
print(len(new_testing_data["slv"][0][0]))
print((new_testing_data["slv"][0][0]))



21
221
1000
3
jet


In [12]:
training_data_by_language = new_training_data
testing_data_by_language = new_testing_data

In [16]:
# Check that all of the samples are of size 1000
revised_training_data = {}
for language in training_data_by_language.keys():
    for sample in training_data_by_language[language]:
        if(len(sample) != 1000):
            print(len(sample))
            print('Error')
        else:
            if(language not in revised_training_data):
                revised_training_data[language] = []
            revised_training_data[language].append(sample)

errors = False
for language in revised_training_data.keys():
    for sample in revised_training_data[language]:
        if(len(sample) != 1000):
            print(len(sample))
            print('Error2')
            errors = True
if(not errors):
    print('All samples are of size 1000')
    training_data_by_language = revised_training_data

# Do the above for testing data as well
revised_testing_data = {}
for language in testing_data_by_language.keys():
    for sample in testing_data_by_language[language]:
        if(len(sample) != 1000):
            print(len(sample))
            print('Error')
        else:
            if(language not in revised_testing_data):
                revised_testing_data[language] = []
            revised_testing_data[language].append(sample)

errors = False
for language in revised_testing_data.keys():
    for sample in revised_testing_data[language]:
        if(len(sample) != 1000):
            print(len(sample))
            print('Error2')
            errors = True
if(not errors):
    print('All samples are of size 1000')
    testing_data_by_language = revised_testing_data

# Test the generation of the centroids
for d in [d1, d2]:
    n = symbol_size
    # Generate HD vectors for characters
    char_to_hd = {char: generate_hd_vector(d) for char in alphabet}
    print("HD Vectors generated")

    # Compute centroids for each language
    language_centroids = {}
    for language, samples in training_data_by_language.items():
        print(language)
        if language == "swe":
            ngram_vectors = generate_encoded_ngrams(samples, n, d, char_to_hd)
            language_centroids[language] = compute_language_centroid(ngram_vectors)
            print(f"Language: {language}, Centroid: {language_centroids[language][:10]}...")  # Print first 10 elements of the centroid for brevity
        else:
            break

All samples are of size 1000
All samples are of size 1000
HD Vectors generated
swe


  value = np.sqrt((np.dot(v, v)) + 0.001)


Language: swe, Centroid: [nan nan nan nan nan nan nan nan nan nan]...
hun
HD Vectors generated
swe
Language: swe, Centroid: [-0.08801685 -0.35206739  0.17603369 -2.02438749 -0.88016847  0.
 -0.26405054 -3.08058966 -1.05620217 -0.96818532]...
hun


In [14]:
print(training_data_by_language.keys())

dict_keys(['swe', 'hun', 'ita', 'est', 'slk', 'dan', 'fin', 'lit', 'ron', 'spa', 'deu', 'pol', 'eng', 'ces', 'fra', 'bul', 'por', 'slv', 'ell', 'lav', 'nld'])


Below is the graph and results

In [21]:
for d in [d1, d2]:
    n = symbol_size
    # Generate HD vectors for characters
    char_to_hd = {char: generate_hd_vector(d) for char in alphabet}

    # Compute centroids for each language
    language_centroids = {}
    for language, samples in training_data_by_language.items():
        vectors = []
        for sample in samples:
            ngram_vectors = generate_encoded_ngrams(samples, n, d, char_to_hd)
            # if the vector contains NaN values, print the language and the centroid, do not include it
            if(np.isnan(ngram_vectors).any()):
                print('Error')
                print(language)
                print(ngram_vectors[0])
                continue
            vectors.extend(ngram_vectors)
        language_centroids[language] = compute_language_centroid(vectors)
        if(np.isnan(language_centroids[language]).any()):
            print('Error')
            print(language)
            print(language_centroids[language])

    print("Centroids computed for each language")

    # Classify and evaluate on the test set
    y_pred = []
    for sample, predicted_label in zip(X_test, y_test):
        query_vector = np.mean(generate_encoded_ngrams(sample, n, d, char_to_hd), axis=0)
        predicted_label = classify_text(query_vector, language_centroids)
        y_pred.append(predicted_label)

    # Evaluation with proper labels parameter
    labels_set = list(language_centroids.keys())

    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=labels_set))


    # Assuming you have your y_true (test_labels) and y_pred (y_pred_europarl)
    cm = confusion_matrix(y_test, y_pred, labels=labels_set)

    # Create a heatmap using seaborn
    plt.figure(figsize=(12, 10))
    sns.set(font_scale=1.2)
    ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, 
                    xticklabels=labels_set, yticklabels=labels_set, linewidths=0.5, linecolor='gray')

    # Add labels, title and axis ticks
    plt.title('Confusion Matrix', fontsize=14)
    plt.xlabel('Predicted Labels', fontsize=12)
    plt.ylabel('True Labels', fontsize=12)

    # Rotate the tick labels for better readability
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    # Show the plot
    plt.tight_layout()
    plt.show()


  value = np.sqrt((np.dot(v, v)) + 0.001)


Error
ita
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]
Error
est
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


KeyboardInterrupt: 