# Translation Source Dialect Identification
The goal of the competition is to predict the native-dialect of a text based on its translation in different languages.

In [23]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [24]:
# Read the data
data_path = '../Lab 5/'
train_data_df = pd.read_csv(os.path.join(data_path, 'train_data.csv'))

## Preprocesarea datelor
Pentru a extrage informatiile necesare din text, facem urmatorii pasi:
- eliminam caracterele speciale si cifrele
- convertim litere mari la litere mici
- eliminam cuvintele de legatura (stopwatches)
- lematizam cuvintele in functie de limba

In [25]:
import re
import nltk
from nltk.corpus import stopwords 
from simplemma import lemmatize
from simplemma import lang_detector

In [26]:
nltk.download('stopwords') # Download the stop words

danish_stopwords = stopwords.words('danish')
spanish_stopwords = stopwords.words('spanish')
italian_stopwords = stopwords.words('italian')
dutch_stopwords = stopwords.words('dutch')
german_stopwords = stopwords.words('german')

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Data preprocessing

def preprocesare_text(text):

    text = re.sub(r'\d-', '', text) # Eliminarea cifrelor
    cuvinte = re.findall(r'\w+', text) # Extragerea cuvintelor din text

    # Convertirea cuvintelor in litere mici
    cuvinte = [cuvant.lower() for cuvant in cuvinte]

    # M-am gandit ca nu ar trebui sa eliminam stop words pentru ca poate exista 
    # un cuvant care este stop word in limba germana dar nu si in limba daneza.
    # Astfel, vom identifica limba textului si vom elimina stop words 
    # din limba respectiva.

    limba = lang_detector(text, lang=('da', 'de', 'es', 'it', 'nl'))
    limba = limba[0][0]

    # Eliminarea cuvintelor de legatura in functie de limba textului
    if limba == 'da':
        cuvinte = [cuvant for cuvant in cuvinte if cuvant not in danish_stopwords]
    elif limba == 'de':
        cuvinte = [cuvant for cuvant in cuvinte if cuvant not in german_stopwords]
    elif limba == 'es':
        cuvinte = [cuvant for cuvant in cuvinte if cuvant not in spanish_stopwords]
    elif limba == 'it':
        cuvinte = [cuvant for cuvant in cuvinte if cuvant not in italian_stopwords]
    else:
        cuvinte = [cuvant for cuvant in cuvinte if cuvant not in dutch_stopwords]

    # Lematizarea cuvintelor in functie de limba
    cuvinte = [lemmatize(cuvant, limba) for cuvant in cuvinte]

    return cuvinte

Mai multe detalii despre libraria [simplemma](https://adrien.barbaresi.eu/blog/simple-multilingual-lemmatizer-python.html)


### Aplicam functia de preprocesare intregului set de date

In [36]:
X = [ preprocesare_text(text) for text in train_data_df['text'] ] # Pentru a nu rula preprocesarea de fiecare data
y = list(train_data_df['label'])


In [21]:

# # Eliminarea cuvintelor care apar mai putin de 5 ori
# cuvinte = []
# for text in X['text']:
#     cuvinte.extend(text)

# cuvinte = pd.Series(cuvinte)
# cuvinte = cuvinte.value_counts()
# cuvinte = cuvinte[cuvinte > 5]

# # Eliminarea cuvintelor care nu apar in lista de cuvinte
# X['text'] = X['text'].apply(lambda x: [cuvant for cuvant in x if cuvant in cuvinte.index])


## Impartirea datelor in train si test
Pentru a imparti datele in mod optim se va folosi din libraria `sklearn.model_selection` pachetul `train_train_split`

In [37]:
# Impartirea datei in train si test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(X_train), len(X_test), len(y_train), len(y_test))

33256 8314 33256 8314


## Bag of Words


In [38]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizarea cuvintelor
vectorizer = CountVectorizer(analyzer=preprocesare_text)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


TypeError: expected string or bytes-like object

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Antrenarea modelului
model = MultinomialNB()
model.fit(X_train, y_train)

# Testarea modelului
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))