# Translation Source Dialect Identification
The goal of the competition is to predict the native dialect of a text based on its translations into different languages.

In [14]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [15]:
# Load the training set from the lab folder that holds the competition CSVs
data_path = '../Lab 5/'
train_data_df = pd.read_csv(
    filepath_or_buffer=os.path.join(data_path, 'train_data.csv'),
)

## Preprocesarea datelor
- extragem informatiile necesare din text
- eliminam caracterele speciale si cifrele
- eliminam cuvintele de legatura (stop words)
- lematizam cuvintele in functie de limba

In [16]:
import re
import nltk
from nltk.corpus import stopwords 
from simplemma import lemmatize
from simplemma import lang_detector

In [17]:
# Fetch the NLTK stop-word corpus (no-op when it is already present locally)
nltk.download('stopwords')

# One stop-word list per candidate language, loaded in a single pass;
# the names and the list type are what the rest of the notebook relies on.
(
    danish_stopwords,
    spanish_stopwords,
    italian_stopwords,
    dutch_stopwords,
    german_stopwords,
) = (
    stopwords.words(language)
    for language in ('danish', 'spanish', 'italian', 'dutch', 'german')
)

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Data preprocessing

def preprocesare_text(text):
    """Tokenize, stop-word-filter and lemmatize one text in its detected language.

    Pipeline:
      1. remove every run of digits from the text;
      2. extract the word tokens (``\\w+``);
      3. detect the language among Danish, German, Spanish, Italian and Dutch;
      4. drop the stop words of that language (compared case-insensitively,
         because the NLTK lists are lowercase);
      5. lemmatize the remaining tokens for that language.

    Stop words are removed per detected language rather than globally, since
    a word can be a stop word in one language and a content word in another.

    Parameters
    ----------
    text : str
        Raw text of one sample. Non-string values (e.g. NaN cells coming from
        a CSV) are treated as empty.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order; ``[]`` when the input has
        no word tokens.
    """
    # Missing cells arrive as float NaN / None when applied over a DataFrame column.
    if not isinstance(text, str):
        return []

    # Remove digits. The previous pattern r'\d-' only removed a digit that was
    # immediately followed by a hyphen, so numbers were kept as tokens.
    text = re.sub(r'\d+', '', text)
    tokens = re.findall(r'\w+', text)  # word extraction also drops punctuation
    if not tokens:
        # Nothing to detect or lemmatize.
        return []

    candidate_langs = ('da', 'de', 'es', 'it', 'nl')
    detected = lang_detector(text, lang=candidate_langs)
    language = detected[0][0] if detected else None

    if language not in candidate_langs:
        # NOTE(review): simplemma's detector can report a code outside the
        # candidates (its docs mention 'unk') — confirm for the installed
        # version. The old code silently treated such texts as Dutch and then
        # passed the unknown code to lemmatize(). With no reliable language we
        # keep all tokens and let simplemma try every candidate language.
        return [lemmatize(token, lang=candidate_langs) for token in tokens]

    stopwords_by_lang = {
        'da': danish_stopwords,
        'de': german_stopwords,
        'es': spanish_stopwords,
        'it': italian_stopwords,
        'nl': dutch_stopwords,
    }
    # Set membership is O(1) per token instead of scanning the list each time.
    stop_set = frozenset(stopwords_by_lang[language])

    # Compare in lowercase so capitalised stop words are removed too;
    # the surviving tokens keep their original casing for the lemmatizer.
    kept = [token for token in tokens if token.lower() not in stop_set]

    # Lemmatize each remaining token in the detected language.
    return [lemmatize(token, lang=language) for token in kept]

Mai multe detalii despre biblioteca [simplemma](https://adrien.barbaresi.eu/blog/simple-multilingual-lemmatizer-python.html)


### Aplicam functia de preprocesare intregului set de date

In [22]:
# Build the features: run the preprocessing function over every raw text,
# giving one list of tokens per row.

# Earlier variant, kept disabled: split into a features frame and labels,
# then preprocess the 'text' column inside that frame.
# X = train_data_df.drop('label', axis=1)
# y = train_data_df['label']
# X['text'] = X['text'].apply(preprocesare_text)

X = train_data_df.loc[:, 'text'].apply(preprocesare_text)


In [21]:

# # Drop rare words (disabled): count every token over the whole corpus and
# # keep only the words counted more than 5 times.
# NOTE(review): the original comment said "fewer than 5 times", but `> 5`
# also drops words seen exactly 5 times — confirm the intended threshold.
# NOTE(review): this code indexes X['text'], while X is currently built as a
# Series from train_data_df['text'] — it would need updating before re-enabling.
# cuvinte = []
# for text in X['text']:
#     cuvinte.extend(text)

# cuvinte = pd.Series(cuvinte)
# cuvinte = cuvinte.value_counts()
# cuvinte = cuvinte[cuvinte > 5]

# # Remove from every text the words that are not in the kept-word list
# X['text'] = X['text'].apply(lambda x: [cuvant for cuvant in x if cuvant in cuvinte.index])
