# Building the Classifier

## 1. Loading the corpus

In [16]:
import re
from nltk import sent_tokenize

cg_sents = []
smg_sents = []

def remove_duplicate_punctuation(s): # sent_tokenize() gets confused when there's duplicate punctuation 
    return(re.sub(r'(\.|\?|!)\1+', r'\1', s))
    
with open('./Data/cg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p] # sent_tokenize() doesn't consider a new line a new sentence so this is required.
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)
    
with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        cg_sents += sent_tokenize(line)

with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)
    
with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
    text = remove_duplicate_punctuation(in_file.read())
    lines = [p for p in text.split('\n') if p]
    for line in lines:
        smg_sents += sent_tokenize(line)

cg_sents[:3]

['Πρασινο αυκουι μες το πασχαλινο ποτήρι που έπιασε ο μιτσης #αισχος 🤣🤣🤣   @HARRIS_APOEL https://t.co/y9X7CmBEa5',
 '@HARRIS_APOEL @pirpoitis @vassrules Καμνουν ανακαινιση στα Περβολια φετος.',
 '@MUFCChristian Ελα συγγενη τζιαι εχουμε νεοτερα π το Νικολη.']

## 2. Cleaning the text

In [17]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

def get_clean_sent_el(sentence):
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
        if token == 'ο,τι' or token == 'ό,τι' or token == 'o,ti' or token == 'ó,ti':
            new_tokens.append(token)
        else:
            new_tokens.append(token.translate(str.maketrans({key: None for key in punctuation})))
    sentence =' '.join(new_tokens)
    sentence = sentence.strip(' ') # performs lstrip() and rstrip()
    return sentence.lower()

cg_sents_clean = []
smg_sents_clean = []

for sent in cg_sents:
    cg_sents_clean.append(get_clean_sent_el(sent))
for sent in smg_sents:
    smg_sents_clean.append(get_clean_sent_el(sent))

# Remove empty strings left due to sentences ending up being only URLs then getting deleted on cleaning:
cg_sents_clean = list(filter(None, cg_sents_clean))
smg_sents_clean = list(filter(None, smg_sents_clean))
cg_sents_clean[:3]

['πρασινο αυκουι μες το πασχαλινο ποτηρι που επιασε ο μιτσης',
 'καμνουν ανακαινιση στα περβολια φετος',
 'ελα συγγενη τζιαι εχουμε νεοτερα π το νικολη']

## 3. Tokenization

In [18]:
cg_sents_tokens = []
smg_sents_tokens = []

for sent in cg_sents_clean:
    cg_sents_tokens.append(WhitespaceTokenizer().tokenize(sent))
for sent in smg_sents_clean:
    smg_sents_tokens.append(WhitespaceTokenizer().tokenize(sent))
    
cg_sents_tokens[:3]

[['πρασινο',
  'αυκουι',
  'μες',
  'το',
  'πασχαλινο',
  'ποτηρι',
  'που',
  'επιασε',
  'ο',
  'μιτσης'],
 ['καμνουν', 'ανακαινιση', 'στα', 'περβολια', 'φετος'],
 ['ελα', 'συγγενη', 'τζιαι', 'εχουμε', 'νεοτερα', 'π', 'το', 'νικολη']]

## 4. Building the feature extractor

In [19]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    word = re.sub(r'ς', 'σ', word)
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

In [20]:
# Feature extractor
def get_ngram_features(sentence_tokens):
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second paramter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Word trigrams
    ngrams = get_word_ngrams(sentence_tokens, 3)
    for ngram in ngrams:
        features[f'word_trigram({ngram})'] = features.get(f'word_trigram({ngram})', 0) + 1
    
    # Word quadrigrams
    ngrams = get_word_ngrams(sentence_tokens, 4)
    for ngram in ngrams:
        features[f'word_quadrigram({ngram})'] = features.get(f'word_quadrigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_word_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    # Char quadrigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 4)
        for ngram in ngrams:
            features[f'char_quadrigram({ngram})'] = features.get(f'char_quadrigram({ngram})', 0) + 1
    
    return features

get_ngram_features(['αυτη', 'ειναι', 'η', 'σπαρτη'])

{'char(α)': 3,
 'char(ε)': 1,
 'char(η)': 3,
 'char(ι)': 2,
 'char(ν)': 1,
 'char(π)': 1,
 'char(ρ)': 1,
 'char(σ)': 1,
 'char(τ)': 2,
 'char(υ)': 1,
 'char_bigram(_α)': 1,
 'char_bigram(_ε)': 1,
 'char_bigram(_η)': 1,
 'char_bigram(_σ)': 1,
 'char_bigram(αι)': 1,
 'char_bigram(αρ)': 1,
 'char_bigram(αυ)': 1,
 'char_bigram(ει)': 1,
 'char_bigram(η_)': 3,
 'char_bigram(ι_)': 1,
 'char_bigram(ιν)': 1,
 'char_bigram(να)': 1,
 'char_bigram(πα)': 1,
 'char_bigram(ρτ)': 1,
 'char_bigram(σπ)': 1,
 'char_bigram(τη)': 2,
 'char_bigram(υτ)': 1,
 'char_quadrigram(_αυτ)': 1,
 'char_quadrigram(_ειν)': 1,
 'char_quadrigram(_σπα)': 1,
 'char_quadrigram(αρτη)': 1,
 'char_quadrigram(αυτη)': 1,
 'char_quadrigram(εινα)': 1,
 'char_quadrigram(ιναι)': 1,
 'char_quadrigram(ναι_)': 1,
 'char_quadrigram(παρτ)': 1,
 'char_quadrigram(ρτη_)': 1,
 'char_quadrigram(σπαρ)': 1,
 'char_quadrigram(υτη_)': 1,
 'char_trigram(_αυ)': 1,
 'char_trigram(_ει)': 1,
 'char_trigram(_η_)': 1,
 'char_trigram(_σπ)': 1,
 'char_trig

## 5. Labeling the sentences

In [21]:
# cg_sents_features_labeled = [(get_ngram_features(word), 'cg') for word in cg_sents_tokens]
# smg_sents_features_labeled = [(get_ngram_features(word), 'smg') for word in smg_sents_tokens]

# all_sents_features_labeled = cg_sents_features_labeled + smg_sents_features_labeled
# all_sents_features_labeled[0]

all_sents_labeled = ([(sentence, 'cg') for sentence in cg_sents_tokens] + [(sentence, 'smg') for sentence in smg_sents_tokens])
all_sents_labeled[0]                                      

(['πρασινο',
  'αυκουι',
  'μες',
  'το',
  'πασχαλινο',
  'ποτηρι',
  'που',
  'επιασε',
  'ο',
  'μιτσης'],
 'cg')

## 6. Splitting corpus into training and test data

In [22]:
import random
from nltk.classify import apply_features

random.shuffle(all_sents_labeled)

NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

print('DATASET\t', 'SENTENCES')
print('All\t', NO_ALL_SENTENCES)
print('Training', NO_TRAIN_SENTENCES)
print('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES)

train_set = apply_features(get_ngram_features, all_sents_labeled[:NO_TRAIN_SENTENCES])
test_set = apply_features(get_ngram_features, all_sents_labeled[NO_TRAIN_SENTENCES:])

DATASET	 SENTENCES
All	 153
Training 122
Testing	 31


## 7. Building a Naive Bayes Classifier

In [23]:
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

  """


In [25]:
classifier.labels()

['cg', 'smg']

In [26]:
from nltk.classify import accuracy

round(accuracy(classifier, test_set), 2)

  """


0.68

In [27]:
classifier.show_most_informative_features(10)

Most Informative Features
   char_quadrigram(εινα) = 1                 smg : cg     =     11.9 : 1.0
       char_trigram(ρωτ) = 1                 smg : cg     =     11.9 : 1.0
                 char(η) = 4                 smg : cg     =     11.1 : 1.0
   char_quadrigram(ναι_) = 1                 smg : cg     =     10.1 : 1.0
                 char(χ) = 2                 smg : cg     =     10.1 : 1.0
       char_trigram(ναι) = 1                 smg : cg     =     10.0 : 1.0
         char_bigram(ωτ) = 1                 smg : cg     =      8.3 : 1.0
   char_quadrigram(σαι_) = 1                 smg : cg     =      8.3 : 1.0
       char_trigram(ελ_) = 1                 smg : cg     =      8.3 : 1.0
       char_trigram(εχε) = 1                 smg : cg     =      8.3 : 1.0
