# NLP FIRST ASSIGNMENT
Developing a Naïve Bayas Classifier able to distinguish between english and not-english

## Preliminary imports

In [2]:
import nltk 
import random 
import time
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')

ROOT = '/Users/sebastianodarconso/Desktop/magistrale_lab/natural_language_processing/first_assingnment/europarl_raw'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sebastianodarconso/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Extracting english and not-english files from the europarl_raw dataset

In [3]:
eng = open(ROOT + "/english/ep-00-01-17.en", 'r').read()
eng += open(ROOT + "/english/ep-00-01-18.en", 'r').read()
eng += open(ROOT + "/english/ep-00-01-19.en", 'r').read()
eng += open(ROOT + "/english/ep-00-01-21.en", 'r').read()
eng += open(ROOT + "/english/ep-00-02-02.en", 'r').read()
eng += open(ROOT + "/english/ep-00-02-03.en", 'r').read()

not_eng = open(ROOT + "/german/ep-00-01-17.de", 'r').read()
not_eng += open(ROOT + "/french/ep-00-01-17.fr", 'r').read()
not_eng += open(ROOT + "/finnish/ep-00-01-17.fi", 'r').read()
not_eng += open(ROOT + "/greek/ep-00-01-17.el", 'r').read()
not_eng += open(ROOT + "/italian/ep-00-01-17.it", 'r').read()
not_eng += open(ROOT + "/swedish/ep-00-01-17.sv", 'r').read()
not_eng += open(ROOT + "/dutch/ep-00-01-18.nl", 'r').read()

In [4]:
print(len(eng))
print(len(not_eng))

1400752
1503692


# Creating the sets of stopwords for the languages used 
For this example the languages are:
- english
- german 
- dutch
- finnish
- italian
- swedish 
- french 
- greek 

They will still be divided as "english" and "not english"

In [5]:
languages = ['english', 'german', 'dutch', 'finnish', 'italian', 'swedish', 'french', 'greek']

In [6]:
stopwords_eng = set()
stopwords_not_eng = set()
for l in languages:
    if l == 'english':
        stopwords_eng.update(stopwords.words(l))
    else:
        stopwords_not_eng.update(stopwords.words(l))

## Creating the stemmer (PorterStemmer)

In [7]:
ps = PorterStemmer()

## Tokenizing, stemming and removing stopwords

In this section all the words in both english and not english bows will be tokenized and stemmed. From the resulting bows will be removed also the stopwords and then they will be merged together in order to create an heterogeneous (with relation to the language) bag of words.

In [8]:
all_words = []
documents = []

start = time.time()
for en in tqdm(eng.split('\n')):
    documents.append((en, 'eng'))
    words = word_tokenize(en)
    for w in words:
        if not w in stopwords_eng:
            all_words.append(ps.stem(w.lower()))

for ne in tqdm(not_eng.split('\n')):
    documents.append((ne, 'not eng'))
    words = word_tokenize(ne)
    for w in words:
        if w not in stopwords_not_eng:
            all_words.append(ps.stem(w.lower()))
end = time.time()

print(end - start)

100%|██████████| 9135/9135 [00:01<00:00, 4995.50it/s]
100%|██████████| 9637/9637 [00:01<00:00, 5111.29it/s]

3.7317121028900146





## Calculating the frequency distribution for the BOW and listing the first 8k features

In [9]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:8000]

## Defining a function that extract the features from each document (all the words, tokenized)

In [10]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features 

## Creating the dataset and shuffling it

In [11]:
featureset = [(find_features(doc), lang) for (doc, lang) in tqdm(documents)]
random.shuffle(featureset)
print(len(featureset))

KeyboardInterrupt: 

## Splitting (evenly) the dataset into training and testing data

In [None]:
dataset_len = len(featureset)

testing_set = featureset[(dataset_len//2):]
training_set = featureset[:(dataset_len//2)]

## Training the default Naïve Bayas Classifier on the training dataset

In [None]:
start = time.time()
classifier = nltk.NaiveBayesClassifier.train(training_set)
end = time.time()

print(end - start)

29.3425772190094


## Defining the function that will be used to test the classifier on text and files different from the testing ones

In [None]:
def test(text):
    feats = find_features(text)
    return classifier.classify(feats)

## Some tests

In [None]:
text = "oggi è proprio una bella giornata"
text_eng = "today is such a beautiful day"
text_de = "Mein Name ist Anna. Ich komme aus Österreich und lebe seit drei Jahren in Deutschland. Ich bin 15Jahre alt und habe zwei Geschwister: Meine Schwester heißt Klara und ist 13 Jahre alt, mein BruderMichael ist 18 Jahre alt."
text_el = "Το όνομά μου είναι Άννα. Κατάγομαι από την Αυστρία και ζω στη Γερμανία εδώ και τρία χρόνια. Είμαι 15 χρονών και έχω δύο αδέρφια: Η αδερφή μου ονομάζεται Κλάρα και είναι 13 ετών, ο αδερφός μου ο Μιχαήλ είναι 18 ετών. Ζούμε με τους γονείς μας σε ένα σπίτι κοντά στο Μόναχο, η μητέρα μου είναι μαγείρισσα και ο πατέρας μου σε τράπεζα"
print(text + ': ' + test(text) + '\n')
print(text_eng + ': ' + test(text_eng) + '\n')
print(text_de + ': '+ test(text_de) + '\n')
print(text_el + ': '+ test(text_el) + '\n')

oggi è proprio una bella giornata: not eng

today is such a beautiful day: eng

Mein Name ist Anna. Ich komme aus Österreich und lebe seit drei Jahren in Deutschland. Ich bin 15Jahre alt und habe zwei Geschwister: Meine Schwester heißt Klara und ist 13 Jahre alt, mein BruderMichael ist 18 Jahre alt.: not eng

Το όνομά μου είναι Άννα. Κατάγομαι από την Αυστρία και ζω στη Γερμανία εδώ και τρία χρόνια. Είμαι 15 χρονών και έχω δύο αδέρφια: Η αδερφή μου ονομάζεται Κλάρα και είναι 13 ετών, ο αδερφός μου ο Μιχαήλ είναι 18 ετών. Ζούμε με τους γονείς μας σε ένα σπίτι κοντά στο Μόναχο, η μητέρα μου είναι μαγείρισσα και ο πατέρας μου σε τράπεζα: not eng



## Displaying the confusion matrix with the testing set

In [None]:
from nltk.metrics import ConfusionMatrix
from collections import defaultdict

ref = defaultdict(set)
testset = defaultdict(set)

labels = []
tests = []

for i, (feats, label) in enumerate(tqdm(testing_set)):
    ref[label].add(i)
    observed = classifier.classify(feats)
    testset[observed].add(i)
    labels.append(label)
    tests.append(observed)


cm = ConfusionMatrix(labels, tests)
print(cm)

        |         n |
        |         o |
        |         t |
        |           |
        |    e    e |
        |    n    n |
        |    g    g |
--------+-----------+
    eng |<4382> 173 |
not eng |    .<4831>|
--------+-----------+
(row = reference; col = test)



## Displaying the precision, recall and F-measure on the testing data

In [None]:
print(cm.evaluate())

    Tag | Prec.  | Recall | F-measure
--------+--------+--------+-----------
    eng | 1.0000 | 0.9620 | 0.9806
not eng | 0.9654 | 1.0000 | 0.9824



## Classifier accuracy and most informative features

In [None]:
classifier.show_most_informative_features(15)
print("accuracy: {}".format(nltk.classify.accuracy(classifier, training_set)))

Most Informative Features
                     and = True              eng : not en =    872.8 : 1.0
                      en = True           not en : eng    =    683.3 : 1.0
                     not = True              eng : not en =    572.6 : 1.0
                      be = True              eng : not en =    402.3 : 1.0
                     die = True           not en : eng    =    379.6 : 1.0
                      la = True           not en : eng    =    256.4 : 1.0
                       l = True           not en : eng    =    191.6 : 1.0
                      le = True           not en : eng    =    183.9 : 1.0
                      or = True              eng : not en =    175.9 : 1.0
                      de = True           not en : eng    =    140.6 : 1.0
                     den = True           not en : eng    =    136.3 : 1.0
                       d = True           not en : eng    =    102.6 : 1.0
                       i = True           not en : eng    =     98.9 : 1.0