# ML4NLP1
## Starting Point for Exercise 1, part II

This notebook is supposed to serve as a starting point and/or inspiration when starting exercise 1, part II.

One of the goals of this exercise is o make you acquainted with **skorch**. You will probably need to consult the [documentation](https://skorch.readthedocs.io/en/stable/).

# Installing skorch and loading libraries

In [1]:
import subprocess

# Installation on Google Colab
try:
    import google.colab
    subprocess.run(['python', '-m', 'pip', 'install', 'skorch'])
except ImportError:
    pass

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [3]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [4]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

## Training a classifier and making predictions

In [5]:
# download dataset
!gdown 1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs # x_train
!gdown 1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6 # x_test
!gdown 1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl # y_train
!gdown 1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X # y_test

Downloading...
From: https://drive.google.com/uc?id=1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs
To: /content/x_train.txt
100% 64.1M/64.1M [00:00<00:00, 70.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6
To: /content/x_test.txt
100% 65.2M/65.2M [00:00<00:00, 119MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl
To: /content/y_train.txt
100% 480k/480k [00:00<00:00, 66.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X
To: /content/y_test.txt
100% 480k/480k [00:00<00:00, 60.2MB/s]


In [6]:
with open(f'x_train.txt') as f:
    x_train = f.read().splitlines()
with open(f'y_train.txt') as f:
    y_train = f.read().splitlines()
with open(f'x_test.txt') as f:
    x_test = f.read().splitlines()
with open(f'y_test.txt') as f:
    y_test = f.read().splitlines()

In [7]:
import pandas as pd
# combine x_train and y_train into one dataframe
train_df = pd.DataFrame({'text': x_train, 'label': y_train})

#combine x_test and y_test into one dataframe
test_df = pd.DataFrame({'text': x_test, 'label': y_test})

#combine both train_df and test_df into one dataframe for further data split according to 80/20 rule
total_df = pd.concat([train_df, test_df])

print(train_df.shape)
print(test_df.shape)
print(total_df.shape)

(117500, 2)
(117500, 2)
(235000, 2)


In [None]:
# T: Please use again the train/test data that includes English, German, Dutch, Danish, Swedish and Norwegian, plus 20 additional languages of your choice (the labels can be found in the file labels.csv)
# and adjust the train/test split if needed

In [8]:
# get list of all labels
labels = train_df['label'].unique().tolist()
print(labels)

['est', 'swe', 'mai', 'oci', 'tha', 'orm', 'lim', 'guj', 'pnb', 'zea', 'krc', 'hat', 'pcd', 'tam', 'vie', 'pan', 'szl', 'ckb', 'fur', 'wuu', 'arz', 'ton', 'eus', 'map-bms', 'glk', 'nld', 'bod', 'jpn', 'arg', 'srd', 'ext', 'sin', 'kur', 'che', 'tuk', 'pag', 'tur', 'als', 'koi', 'lat', 'urd', 'tat', 'bxr', 'ind', 'kir', 'zh-yue', 'dan', 'por', 'fra', 'ori', 'nob', 'jbo', 'kok', 'amh', 'khm', 'hbs', 'slv', 'bos', 'tet', 'zho', 'kor', 'sah', 'rup', 'ast', 'wol', 'bul', 'gla', 'msa', 'crh', 'lug', 'sun', 'bre', 'mon', 'nep', 'ibo', 'cdo', 'asm', 'grn', 'hin', 'mar', 'lin', 'ile', 'lmo', 'mya', 'ilo', 'csb', 'tyv', 'gle', 'nan', 'jam', 'scn', 'be-tarask', 'diq', 'cor', 'fao', 'mlg', 'yid', 'sme', 'spa', 'kbd', 'udm', 'isl', 'ksh', 'san', 'aze', 'nap', 'dsb', 'pam', 'cym', 'srp', 'stq', 'tel', 'swa', 'vls', 'mzn', 'bel', 'lad', 'ina', 'ava', 'lao', 'min', 'ita', 'nds-nl', 'oss', 'kab', 'pus', 'fin', 'snd', 'kaa', 'fas', 'cbk', 'cat', 'nci', 'mhr', 'roa-tara', 'frp', 'ron', 'new', 'bar', 'ltg'

In [9]:
# Ranodomly choose 20 additional languages
# As a dataset composed of 27 languages would cause the session to crash
# due to ram limit. So I only use 7 languages as my dataset.
import random
random.seed(20)

languages = ['eng', 'deu', 'nld', 'dan', 'swe', 'nob', 'jpn']
#for i in range(20):
  #languages.append(random.choice(labels))
print(languages)

subset_df = total_df[total_df['label'].isin(languages)]
subset_df

['eng', 'deu', 'nld', 'dan', 'swe', 'nob', 'jpn']


Unnamed: 0,text,label
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
26,De spons behoort tot het geslacht Haliclona en...,nld
29,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...,jpn
46,シャーリー・フィールドは、サン・ベルナルド・アベニュー沿い市民センターとR&Tマーティン高校...,jpn
52,Indtil 1545 havde flådecheferne kunnet hyre et...,dan
...,...,...
117156,Het gewone volk leeft in vrede en nog altijd w...,nld
117196,På midten av 1980-tallet spilte han sammen med...,nob
117202,"katholische Ortskirche St. Nikolaus (1955), be...",deu
117308,Mozilla Add-ons は、Mozilla Firefox、Mozilla Thun...,jpn


In [10]:
text_df = subset_df['text']
label_df = subset_df['label']
text_df.shape, label_df.shape

((7000,), (7000,))

In [28]:
# Split the data according to 80/20 rule
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_df, label_df, test_size=0.20, random_state=42)

In [29]:
print('Training set shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test set shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Training set shape:  (5600,)
Training labels shape:  (5600,)
Test set shape:  (1400,)
Test labels shape:  (1400,)


In [30]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [None]:
# T: use your adjusted code to encode the labels here

In [31]:
# Preprocessing
label_encoder = LabelEncoder()
le_fitted = label_encoder.fit(y_train)

In [32]:
le_fitted.classes_

array(['dan', 'deu', 'eng', 'jpn', 'nld', 'nob', 'swe'], dtype=object)

In [26]:
# T: In the following, you can find a small (almost) working example of a neural network. Unfortunately, again, the cat messed up some of the code. Please fix the code such that it is executable.

In [33]:
num_classes = len(le_fitted.classes_)
num_classes

7

In [34]:
y_train = le_fitted.transform(y_train)
y_test = le_fitted.transform(y_test)

In [35]:
y_train = y_train.astype(np.int64)
y_train

array([0, 4, 3, ..., 1, 1, 6])

In [36]:
class CountVectorizerWrapper:
    def __init__(self, analyzer, ngram_range, max_features, binary):
        print('args:', str([ngram_range, max_features]))
        self.countvec = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range,
                                        max_features=max_features, binary=binary)

    def fit(self, X, y=None):
        self.countvec.fit(X)
        return self

    def transform(self, X, y=None):
        return self.countvec.transform(X).astype(np.float32)

In the following, we define a vanilla neural network with two hidden layers. The output layer should have as many outputs as there are classes. In addition, it should have a nonlinearity function.

In [37]:
class ClassifierModule(nn.Module):
    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(100, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 50)
        self.output = nn.Linear(50, num_classes)

    def forward(self, X, **kwargs):
      X = self.nonlin(self.dense0(X))
      X = F.relu(self.dense1(X))
      X = self.output(X)
      return X.squeeze(dim=1)

In [46]:
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    #device='cuda',  # comment this to train with CPU
)

In [47]:
from sklearn.pipeline import Pipeline

In [48]:
# First, we extract some simple features as input for the neural network
vec_args = dict(analyzer='char', ngram_range=(2, 2), max_features=100, binary=False)
pipe = Pipeline(steps=[
        ('Vectorizer', CountVectorizerWrapper(**vec_args)),
        ('net', net)
    ], verbose=True)

args: [(2, 2), 100]


In [49]:
pipe.fit(X_train, y_train)

[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   2.9s
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.2135[0m       [32m0.8607[0m        [35m0.6884[0m  1.0316
      2        [36m0.7308[0m       [32m0.8911[0m        [35m0.5141[0m  1.0607
      3        [36m0.5202[0m       [32m0.9080[0m        [35m0.4094[0m  1.0096
      4        [36m0.4284[0m       [32m0.9152[0m        [35m0.3456[0m  0.7812
      5        [36m0.3520[0m       [32m0.9187[0m        [35m0.2945[0m  0.6710
      6        [36m0.2693[0m       0.9170        [35m0.2600[0m  0.6687
      7        [36m0.2287[0m       [32m0.9214[0m        [35m0.2338[0m  0.6370
      8        0.2547       [32m0.9286[0m        0.2349  0.6807
      9        0.2408       [32m0.9295[0m        0.2403  0.6560
     10        [36m0.2224[0m       0.9268        [35m0.2270[0m  0.6275
[Pipeline] ............... (s

In [76]:
from skorch.callbacks.training import EarlyStopping
from sklearn.model_selection import GridSearchCV

# deactivate skorch-internal train-valid split and verbose logging
net.set_params(train_split=False, verbose=0)
params = {
    'net__lr': [0.1, 0.01, 0.2],
    'net__predict_nonlinearity': [F.relu, F.tanh, F.sigmoid],
    'net__max_epochs': [10, 20],
    #'net__callbacks': [('early_stopping', EarlyStopping())]
}

In [77]:
vec_args = dict(analyzer='char', ngram_range=(2, 2), max_features=100, binary=False)
pipe = Pipeline(steps=[
        ('Vectorizer', CountVectorizerWrapper(**vec_args)),
        ('net', net)
    ], verbose=True)

args: [(2, 2), 100]


In [78]:
grid_net = GridSearchCV(pipe, params, refit=False, cv=2, scoring='accuracy')

In [79]:
grid_net.fit(X_train, y_train)

[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   1.2s
[Pipeline] ............... (step 2 of 2) Processing net, total=   3.6s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   1.1s
[Pipeline] ............... (step 2 of 2) Processing net, total=   3.6s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   1.5s
[Pipeline] ............... (step 2 of 2) Processing net, total=   3.7s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   1.1s
[Pipeline] ............... (step 2 of 2) Processing net, total=   3.6s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   1.6s
[Pipeline] ............... (step 2 of 2) Processing net, total=   3.4s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   1.1s
[Pipeline] ............... (step 2 of 2) Processing net, total=   3.4s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   2.2s
[Pipeline] ............... (step 2 of 2) Processing net, total=   6.5s
[Pipel

In [80]:
print(grid_net.best_score_, grid_net.best_params_)

0.9094642857142857 {'net__lr': 0.2, 'net__max_epochs': 10, 'net__predict_nonlinearity': <function relu at 0x7b265dbaca60>}


In [81]:
grid_net.fit(X_test, y_test)

[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.4s
[Pipeline] ............... (step 2 of 2) Processing net, total=   0.9s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing net, total=   0.9s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing net, total=   0.9s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing net, total=   0.9s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.5s
[Pipeline] ............... (step 2 of 2) Processing net, total=   1.3s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.6s
[Pipeline] ............... (step 2 of 2) Processing net, total=   0.8s
[Pipeline] ........ (step 1 of 2) Processing Vectorizer, total=   0.3s
[Pipeline] ............... (step 2 of 2) Processing net, total=   1.9s
[Pipel

In [82]:
print(grid_net.best_score_, grid_net.best_params_)

0.8628571428571428 {'net__lr': 0.1, 'net__max_epochs': 20, 'net__predict_nonlinearity': <function relu at 0x7b265dbaca60>}
