# Unsupervised Dictionary Methods

In [20]:
import pandas as pd
import numpy as np
from nltk import *
import string
import pickle
import re
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split

In [2]:
# Load the pickled tag and word sequences (binary pickles despite the .txt extension).
# NOTE(review): pickle.load can execute arbitrary code from the file - only load trusted data.
# presumably just_tags[i] is the label of just_words[i] (later cells index them in parallel) - confirm.
with open('modified_data/just_tags.txt', 'rb') as f:
    just_tags = pickle.load(f)
    
with open('modified_data/just_words.txt', 'rb') as f:
    just_words = pickle.load(f)

In [3]:
# Hold out 20% of the tokens for testing; the fixed seed keeps the split reproducible.
words_train, words_test, tags_train, tags_test = train_test_split(
    just_words,
    just_tags,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Build the Hindi word list from the training split, because there is no
# dictionary of phonetically typed Hindi available anywhere.
hin_dict = [word for word, tag in zip(words_train, tags_train) if tag == 'HI']

In [19]:
# How WordNet's dictionary works: synsets() gives a truthy result for a word it knows.
# The lookup is case-insensitive, so it suits our data.
print('ENGLISH' if wordnet.synsets('glass') else 'NOT ENGLISH')

ENGLISH


In [10]:
# Check whether an emoticon containing a letter (here ':P') occurs anywhere in
# the full word list (just_words covers the whole data set, not only the training split)
':P' in just_words

False

In [11]:
# What do the emoticons look like? Show every token labelled 'EMT'.
[token for token, tag in zip(just_words, just_tags) if tag == 'EMT']

['😡',
 ':)',
 ';)',
 ':)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':)',
 ':(',
 ';)',
 ':)',
 ':)',
 ';)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':)',
 ':-)',
 '😜',
 ':(',
 ':)',
 ':)',
 ':(',
 ':)',
 ':-)',
 ':)',
 ':)',
 ':)',
 ':/',
 ':(',
 ';)',
 ':)',
 ':-(',
 ':/',
 ':@',
 ':@',
 ':(',
 ':-)',
 ';)',
 ':/',
 ':(',
 ':(',
 ';)',
 ':3',
 ':)',
 ':-)',
 ':(',
 ':)',
 ':)',
 ':)',
 ':)',
 ':)',
 ';)',
 ':)',
 ':-(',
 ':-)',
 ':)',
 ';)',
 ':-(',
 ':)',
 ':)',
 ':(',
 ':(',
 ':(',
 ':(',
 ';)',
 ':)',
 ':)',
 ':(',
 ':(',
 ':-(',
 ';)',
 ';)',
 ':)',
 ':)',
 ':(',
 ':)',
 ';)',
 ':)',
 ';)',
 ':)',
 ':)',
 '😡',
 ':)',
 ':)',
 ':)',
 ':)',
 ';)',
 ':(',
 ';)',
 ':-/',
 ':)',
 ':)',
 ':)',
 ':)',
 ':)',
 ':(',
 ':(',
 ':(',
 ':)',
 '😘',
 ';)',
 ':)',
 ':)',
 ':-)',
 ';)',
 ':)',
 ':)',
 ':)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':/',
 ':-)',
 ':)',
 ';)',
 ':(',
 ':)',
 ':)',
 ':)',
 ':@',
 ':)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':(',
 ':/',
 ':-(',
 ':)',
 ':-)',
 ':)',
 ':)',
 ':(',
 ':)',
 ':)',
 ';)',
 ';)',
 ':

In [13]:
# The 4 possible tags we need to classify for (distinct labels in the training split)
np.unique(tags_train)

array(['EMT', 'EN', 'HI', 'UN'], dtype='<U3')

## Wordnet and Hindi Dictionary

In [14]:
# Classifier that uses wordnet's dictionary
def classify_dictionary(words):
    """Tag each word as 'EN', 'EMT', 'HI' or 'UN' using dictionary lookups.

    Rules are tried in order and the first match wins:

    1. 'EN'  - WordNet returns at least one synset for the word.
    2. 'EMT' - the word contains no ASCII letter at all (so any
       letter-free token that WordNet does not know lands here).
    3. 'HI'  - the word appears in the module-level ``hin_dict``.
    4. 'UN'  - none of the above.

    Args:
        words: Iterable of string tokens.

    Returns:
        List of tag strings, one per input word, in input order.
    """
    # ``hin_dict`` is a list, so testing membership against it directly would
    # scan it once per word; build a set once for constant-time lookups.
    hindi_words = set(hin_dict)
    has_letter = re.compile('[a-zA-Z]')

    preds = []
    for w in words:
        if wordnet.synsets(w):
            preds.append('EN')
        elif has_letter.search(w) is None:
            preds.append('EMT')
        elif w in hindi_words:
            preds.append('HI')
        else:
            preds.append('UN')

    return preds

In [15]:
# Predict tags for both splits with the WordNet + Hindi word list classifier
preds_dict_train = classify_dictionary(words_train)
preds_dict_test = classify_dictionary(words_test)

In [33]:
def classification_acc(true_y, pred_y):
    """Return the percentage of positions where ``pred_y`` equals ``true_y``.

    Args:
        true_y: Sequence of gold labels.
        pred_y: Sequence of predicted labels, same length as ``true_y``.

    Returns:
        Accuracy between 0 and 100 as a NumPy float; ``nan`` when both
        sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    total = len(true_y)
    if len(pred_y) != total:
        # A longer pred_y used to be silently ignored and a shorter one
        # failed with an opaque IndexError; reject both explicitly.
        raise ValueError(
            'true_y and pred_y must have the same length '
            '(got %d and %d)' % (total, len(pred_y))
        )
    if total == 0:
        # Accuracy over nothing is undefined; return nan explicitly instead
        # of triggering a 0/0 runtime warning.
        return np.float64('nan')

    correct = sum(1 for truth, guess in zip(true_y, pred_y) if truth == guess)
    return (np.float64(correct) / total) * 100

In [17]:
# Report the accuracy (in percent) of the dictionary classifier on each split
print ('Training classification accuracy using just dictionaries is:', classification_acc(tags_train, preds_dict_train), '%')
print ('Test classification accuracy using just dictionaries is:', classification_acc(tags_test, preds_dict_test), '%')

Training classification accuracy using just dictionaries is: 55.94523642547276 %
Test classification accuracy using just dictionaries is: 52.33224933594817 %


## Checking with the Twitter Dictionary 

In [5]:
# Load the Twitter-generated dictionary (a binary pickle despite the .txt extension).
# NOTE(review): pickle.load can execute arbitrary code from the file - only load trusted data.
with open('dictionaries/dictionary_twitter.txt', 'rb') as f:
    twitter_dict = pickle.load(f)

In [10]:
# Collect the dictionary's keys (iterating a dict yields its keys)
just_keys = list(twitter_dict)

In [23]:
# Characters that disqualify a key: every ASCII punctuation character.
# Keys containing any of them are filtered out in the next cell, which removes
# punctuated tokens and links (URLs always contain punctuation).
invalid_chars = set(string.punctuation)

In [31]:
# Keep only the keys that contain none of the characters in invalid_chars
twitter_dict_eng = [key for key in just_keys if invalid_chars.isdisjoint(key)]

In [32]:
# Classifier that uses the Twitter dictionary
def classify_dictionary_twitter(words):
    """Tag each word as 'EN', 'EMT', 'HI' or 'UN' using the Twitter word list.

    Rules are tried in order and the first match wins:

    1. 'EN'  - the word appears in the module-level ``twitter_dict_eng``.
    2. 'EMT' - the word contains no ASCII letter at all.
    3. 'HI'  - the word appears in the module-level ``hin_dict``.
    4. 'UN'  - none of the above.

    Args:
        words: Iterable of string tokens.

    Returns:
        List of tag strings, one per input word, in input order.
    """
    # Both word collections are lists, so testing membership against them
    # directly would scan them once per word; build sets once for
    # constant-time lookups.
    english_words = set(twitter_dict_eng)
    hindi_words = set(hin_dict)
    has_letter = re.compile('[a-zA-Z]')

    preds = []
    for w in words:
        if w in english_words:
            preds.append('EN')
        elif has_letter.search(w) is None:
            preds.append('EMT')
        elif w in hindi_words:
            preds.append('HI')
        else:
            preds.append('UN')

    return preds

In [34]:
# Predict tags for both splits with the Twitter word list + Hindi word list classifier
preds_dict_train_tw = classify_dictionary_twitter(words_train)
preds_dict_test_tw = classify_dictionary_twitter(words_test)

In [35]:
# Report the accuracy (in percent) of the Twitter-dictionary classifier on each split
print ('Training classification accuracy using Twitter dictionaries is:', classification_acc(tags_train, preds_dict_train_tw), '%')
print ('Test classification accuracy using Twitter dictionaries is:', classification_acc(tags_test, preds_dict_test_tw), '%')

Training classification accuracy using Twitter dictionaries is: 60.810282444500984 %
Test classification accuracy using Twitter dictionaries is: 56.6591366084949 %


Accuracy has slightly improved with a dictionary more suitable for the task.