In [3]:
import os 
import pandas as pd 

from IPython.display import HTML

In [4]:
%%capture
! pip install fasttext

# I. Language detection (24 points)

## Setup 

In [5]:
! git clone https://github.com/MastafaF/LanguageDetection.git

fatal: destination path 'LanguageDetection' already exists and is not an empty directory.


In [6]:
# Inspect the content of the cloned repository
repo_dir = "./LanguageDetection"
os.listdir(repo_dir)

['results',
 'dataset.csv.zip',
 'LICENSE',
 'README.md',
 'dataset.csv',
 '.git',
 '.gitignore',
 'logs']

In [7]:
# Enter the LanguageDetection folder - every following cell runs from inside it.
# Guarded so that re-running this cell does not fail once we are already inside.
if os.path.basename(os.getcwd()) != "LanguageDetection":
    os.chdir("./LanguageDetection")

In [8]:
! unzip dataset.csv.zip

Archive:  dataset.csv.zip
replace dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


## Exploratory Data Analysis

In [9]:
# Load the corpus that ships with the cloned repository
dataset_path = "./dataset.csv"
data = pd.read_csv(dataset_path)

In [10]:
# Show one randomly drawn Chinese row, rendered as an HTML table
chinese_rows = data.loc[data["language"] == "Chinese"]
HTML(chinese_rows.sample().to_html())

Unnamed: 0,Text,language
21310,基斯·保殊的昵称是cb，cb是他英语的首字母缩写是暴龍時期球衣背号，第一个这么称呼他的是猛龙队解说员chuck swirsky。 赛场下的波什经常参加慈善活动和社区公益服务，由于自己童年的经历，他创建了基斯·保殊慈善基金会，协助提高达拉斯和多伦多青少年体育的教育事业，有规律的向青少年讲述阅读的好处。年休賽期中，邁阿密熱火隊宣佈裁掉克里斯·波什，結束年效力熱火的生涯。 但他尚未選擇退休。,Chinese


### Question 1. Describe the distribution of languages and give at least two comments about the dataset. (1 point)

In [11]:
# Describe the distribution of the languages
# A bare `data` previews the frame; pandas elides the middle rows of a long frame.

data

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


In [12]:
# Full, untruncated text of the first row
data.loc[0, 'Text']

'klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke  aastal viidi ta surnukeha mausoleumist ära ja kremeeriti zlíni linn kandis aastatel – nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel – nime gotvald'

In [13]:
# Number of rows per language
data["language"].value_counts()

Estonian      1000
Swedish       1000
English       1000
Russian       1000
Romanian      1000
Persian       1000
Pushto        1000
Spanish       1000
Hindi         1000
Korean        1000
Chinese       1000
French        1000
Portugese     1000
Indonesian    1000
Urdu          1000
Latin         1000
Turkish       1000
Japanese      1000
Dutch         1000
Tamil         1000
Thai          1000
Arabic        1000
Name: language, dtype: int64

The languages are evenly distributed: the dataset is perfectly balanced, with 1,000 texts for each of the 22 languages (22,000 rows in total). The dataset is small, and the individual texts are short.

### Question 2-5. Explore the data with your own preprocessing and train your model

In [14]:
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    Trainer,
    TrainingArguments,
)

MODEL_NAME = "xlm-roberta-base"

X = data['Text']
y = data['language']

# The Trainer needs integer class ids, so build a fixed language <-> id mapping.
label_names = sorted(y.unique())
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}

# Split the data into train and test.
# Stratified so that every language keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The bare AutoModel has no classification head and returns no loss, so it
# cannot be trained by Trainer; load the sequence-classification variant.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)


def tokenize_function(examples):
    """Tokenize the `Text` field of a batch, truncating over-long inputs."""
    return tokenizer(examples["Text"], truncation=True)


def build_dataset(texts, languages):
    """Build a tokenized `Dataset` with an integer `label` column.

    Args:
        texts: pandas Series holding the raw texts.
        languages: pandas Series (same index as `texts`) holding language names.

    Returns:
        A `datasets.Dataset` with the columns `Text`, `label` and the tokenizer outputs.
    """
    frame = pd.DataFrame({"Text": texts, "label": languages.map(label2id)})
    # preserve_index=False keeps the shuffled pandas index out of the dataset columns.
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    return dataset.map(tokenize_function, batched=True)


def compute_accuracy(eval_prediction):
    """Return the share of evaluation rows whose arg-max prediction matches the label."""
    predicted_ids = np.argmax(eval_prediction.predictions, axis=-1)
    return {"accuracy": float((predicted_ids == eval_prediction.label_ids).mean())}


train_dataset = build_dataset(X_train, y_train)
test_dataset = build_dataset(X_test, y_test)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
)

# Pads every batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
)

# Train the model
trainer.train()

# Inference pipeline, created after training so it is used with the fine-tuned model.
# `data_collator` is a Trainer argument, not a pipeline parameter, so it is not passed here.
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    framework="pt",
)

Downloading:  47%|████▋     | 4.31M/9.10M [00:08<00:04, 1.07MB/s]

KeyboardInterrupt: 

Downloading:  48%|████▊     | 4.41M/9.10M [00:19<00:04, 1.07MB/s]

## FastText for language detection

## FastText training setup 

In [None]:
! wget http://downloads.tatoeba.org/exports/sentences.tar.bz2

--2021-11-01 14:44:47--  http://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.tatoeba.org/exports/sentences.tar.bz2 [following]
--2021-11-01 14:44:47--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 155856043 (149M) [application/octet-stream]
Saving to: ‘sentences.tar.bz2’


2021-11-01 14:44:54 (25.4 MB/s) - ‘sentences.tar.bz2’ saved [155856043/155856043]



In [None]:
! bunzip2 sentences.tar.bz2

In [None]:
# List the working directory to check that sentences.tar is now present
! ls

dataset.csv  dataset.csv.zip  LICENSE  README.md  sentences.tar


In [None]:
# Unpack the tar archive (x = extract, v = list extracted files, f = archive file)
! tar xvf sentences.tar

sentences.csv


In [None]:
# List the working directory to check that sentences.csv was extracted
! ls

dataset.csv  dataset.csv.zip  LICENSE  README.md  sentences.csv  sentences.tar


In [None]:
! head -10 sentences.csv

1	cmn	我們試試看！
2	cmn	我该去睡觉了。
3	cmn	你在干什麼啊？
4	cmn	這是什麼啊？
5	cmn	今天是６月１８号，也是Muiriel的生日！
6	cmn	生日快乐，Muiriel！
7	cmn	Muiriel现在20岁了。
8	cmn	密码是"Muiriel"。
9	cmn	我很快就會回來。
10	cmn	我不知道。


In [None]:
# Turn every tab-separated row into a fastText line "__label__<field 2> <field 3>"
# and write the lines, shuffled, to all.txt.
# NOTE(review): shuf is called without a fixed random source, so the line order - and the
# valid/train split derived from it - presumably differs on every run; confirm if reproducibility matters.
! awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt

In [None]:
# List the working directory to check that all.txt was written
! ls

all.txt      dataset.csv.zip  README.md      sentences.tar
dataset.csv  LICENSE	      sentences.csv


In [None]:
! head -5 all.txt

__label__ber Ur bɣint ara ad d-aɣent asegzawal tafṛansit-takatalanit.
__label__epo Tom deziras riĉiĝi kaj famiĝi.
__label__deu Tom sagte, das sei bei weitem nicht gut genug.
__label__kab Amek i ilaq ara ad yekteb talɣa s tendunizit?
__label__epo Ĉu ni havas raspitan fromaĝon?


In [None]:
# Hold out the first 10,000 lines of all.txt as the validation set
! head -n 10000 all.txt > valid.txt

In [None]:
# Everything from line 10,001 of all.txt onwards becomes the training set
! tail -n +10001 all.txt > train.txt

In [None]:
! head -5 train.txt

__label__asm তাৰ পইচাৰ দৰকাৰ হৈছে।
__label__tur O, bir dolap çeviriyor.
__label__tur Bazı genç kadınlar kötü oğlanları cezbediyor.
__label__por O Japão possui muitas atrações turísticas.
__label__ukr Я довідалася про Ваш портативний копір на Tokyo Office Expo 97.


### Question 6.1. Train fasttext model on Tatoeba (2 points)

In [None]:
%%time
import fasttext

# Check the fasttext library and implement the training
###########################################

# your implementation goes here

pass
################################################

# @TODO: Save your model when trained 
# model.save_model("langdetect.bin")

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 15.3 µs


In [None]:
# Sanity check: an English sentence should be labelled as English
sample_sentence = "I am French and I love English"
model.predict(sample_sentence)

(('__label__eng',), array([1.00001001]))

### Question 6.2. Evaluate performance of fasttext model on valid.txt (1 point)

In [None]:
# Hint: Create dataframe from valid.txt and evaluate performance

def load_fasttext_file(path):
    """Read a fastText-formatted file into a DataFrame.

    Every line is expected to look like "__label__<code> <sentence>".

    Args:
        path: path of the file to read.

    Returns:
        DataFrame with one row per line and the columns `label` and `text`.
    """
    records = []
    # newline="\n" splits on "\n" only, so a stray "\r" inside a sentence does not break a line in two.
    with open(path, encoding="utf-8", newline="\n") as handle:
        for line in handle:
            label, _, text = line.rstrip("\n").partition(" ")
            records.append({"label": label, "text": text})
    return pd.DataFrame(records, columns=["label", "text"])


valid_df = load_fasttext_file("valid.txt")

# Passing a list returns one list of predicted labels per input text.
predicted_labels, _ = model.predict(valid_df["text"].tolist())
# A text without any known token yields no label; count it as a miss.
valid_df["predicted"] = [labels[0] if len(labels) else None for labels in predicted_labels]

valid_accuracy = (valid_df["label"] == valid_df["predicted"]).mean()
print(f"Accuracy on valid.txt ({len(valid_df)} sentences): {valid_accuracy:.4f}")

### Question 7 & 8. Test your FastText model on the same dataset as question 5.

In [None]:
###########################################

# your implementation goes here
# TODO(review): still a placeholder. Presumably this should run the fastText model on the
# held-out part of dataset.csv (X_test / y_test); confirm against the assignment text.

pass
################################################

In [None]:
###########################################

# your implementation goes here
# TODO(review): still a placeholder. Presumably this should report or compare the fastText
# results on dataset.csv; confirm against the assignment text.

pass
################################################