# Unsupervised Dictionary Methods

In [142]:
# The star import stays first so that the explicit imports below always win
# if nltk happens to export a name that collides with one of them.
from nltk import *

import collections
import json
import pickle
import re
import string
import urllib
import urllib.request

import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.util import ngrams
from sklearn.model_selection import train_test_split

In [37]:
# Load the pickled tags and words from modified_data/.
# NOTE(review): pickle.load can execute arbitrary code, so only use it on
# files you trust.
with open('modified_data/just_tags.txt', 'rb') as tags_file:
    just_tags = pickle.load(tags_file)

with open('modified_data/just_words.txt', 'rb') as words_file:
    just_words = pickle.load(words_file)

In [38]:
# Hold out 20% of the (word, tag) pairs for testing; the fixed random_state
# keeps the split reproducible across runs.
words_train, words_test, tags_train, tags_test = train_test_split(
    just_words,
    just_tags,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Build the Hindi dictionary from the training data, because there is no
# phonetically typed Hindi dictionary available anywhere.
hin_dict = [word for word, tag in zip(words_train, tags_train) if tag == 'HI']

In [40]:
# How WordNet's dictionary lookup works: a non-empty synset list means the
# word is known. It is case-insensitive, so it suits our data.
print('ENGLISH' if wordnet.synsets('glass') else 'NOT ENGLISH')

ENGLISH


In [41]:
# IPython help lookup for wordnet.synsets (interactive aid only; produces no
# value used by later cells).
?wordnet.synsets

In [10]:
# Check whether the alphabetic emoticon ':P' occurs anywhere in the full word
# list (just_words covers all the data, not only the training split).
':P' in just_words

False

In [11]:
# What do the tokens tagged as emoticons ('EMT') look like?
[word for word, tag in zip(just_words, just_tags) if tag == 'EMT']

['😡',
 ':)',
 ';)',
 ':)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':)',
 ':(',
 ';)',
 ':)',
 ':)',
 ';)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':)',
 ':-)',
 '😜',
 ':(',
 ':)',
 ':)',
 ':(',
 ':)',
 ':-)',
 ':)',
 ':)',
 ':)',
 ':/',
 ':(',
 ';)',
 ':)',
 ':-(',
 ':/',
 ':@',
 ':@',
 ':(',
 ':-)',
 ';)',
 ':/',
 ':(',
 ':(',
 ';)',
 ':3',
 ':)',
 ':-)',
 ':(',
 ':)',
 ':)',
 ':)',
 ':)',
 ':)',
 ';)',
 ':)',
 ':-(',
 ':-)',
 ':)',
 ';)',
 ':-(',
 ':)',
 ':)',
 ':(',
 ':(',
 ':(',
 ':(',
 ';)',
 ':)',
 ':)',
 ':(',
 ':(',
 ':-(',
 ';)',
 ';)',
 ':)',
 ':)',
 ':(',
 ':)',
 ';)',
 ':)',
 ';)',
 ':)',
 ':)',
 '😡',
 ':)',
 ':)',
 ':)',
 ':)',
 ';)',
 ':(',
 ';)',
 ':-/',
 ':)',
 ':)',
 ':)',
 ':)',
 ':)',
 ':(',
 ':(',
 ':(',
 ':)',
 '😘',
 ';)',
 ':)',
 ':)',
 ':-)',
 ';)',
 ':)',
 ':)',
 ':)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':/',
 ':-)',
 ':)',
 ';)',
 ':(',
 ':)',
 ':)',
 ':)',
 ':@',
 ':)',
 ';)',
 ':)',
 ':)',
 ':)',
 ':(',
 ':/',
 ':-(',
 ':)',
 ':-)',
 ':)',
 ':)',
 ':(',
 ':)',
 ':)',
 ';)',
 ';)',
 ':

In [13]:
# The 4 possible tags we need to classify into
np.unique(tags_train)

array(['EMT', 'EN', 'HI', 'UN'], dtype='<U3')

## Wordnet and Hindi Dictionary

In [14]:
# Classifier that uses WordNet's dictionary
def classify_dictionary(words):
    """Tag each word as 'EN', 'EMT', 'HI' or 'UN' using dictionary lookups.

    The rules are tried in this order and the first match wins:

    1. 'EN'  -- ``wordnet.synsets(word)`` returns at least one synset.
    2. 'EMT' -- the word contains no ASCII letter.
    3. 'HI'  -- the word occurs in the module-level ``hin_dict``.
    4. 'UN'  -- none of the above.

    Parameters
    ----------
    words : iterable of str
        Tokens to classify.

    Returns
    -------
    list of str
        One tag per input word, in input order.
    """
    # hin_dict is a list; converting it once per call turns every membership
    # test below into a constant-time lookup instead of a linear scan.
    hindi_vocab = set(hin_dict)

    preds = []
    for w in words:
        if wordnet.synsets(w):
            preds.append('EN')

        elif re.search('[a-zA-Z]', w) is None:
            preds.append('EMT')

        elif w in hindi_vocab:
            preds.append('HI')

        else:
            preds.append('UN')

    return preds

In [15]:
# Predict tags for both splits with the WordNet + Hindi dictionary classifier.
preds_dict_train = classify_dictionary(words_train)
preds_dict_test = classify_dictionary(words_test)

In [33]:
def classification_acc(true_y, pred_y):
    """Return the percentage of positions where ``pred_y`` matches ``true_y``.

    Parameters
    ----------
    true_y : sequence
        Gold labels.
    pred_y : sequence
        Predicted labels, aligned position by position with ``true_y``.

    Returns
    -------
    float
        Accuracy in the range 0-100, or NaN when both inputs are empty.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    total = len(true_y)
    if total != len(pred_y):
        # A length mismatch means the labels are misaligned; scoring only the
        # overlapping prefix would silently report a wrong accuracy.
        raise ValueError(
            'true_y and pred_y must have the same length '
            '(got %d and %d)' % (total, len(pred_y))
        )
    if total == 0:
        # Accuracy is undefined when there is nothing to score.
        return float('nan')

    n_correct = sum(1 for truth, pred in zip(true_y, pred_y) if truth == pred)
    return (n_correct / total) * 100

In [17]:
# Score both splits first, then report them.
dict_train_acc = classification_acc(tags_train, preds_dict_train)
dict_test_acc = classification_acc(tags_test, preds_dict_test)

print('Training classification accuracy using just dictionaries is:', dict_train_acc, '%')
print('Test classification accuracy using just dictionaries is:', dict_test_acc, '%')

Training classification accuracy using just dictionaries is: 55.94523642547276 %
Test classification accuracy using just dictionaries is: 52.33224933594817 %


## Checking with the Twitter Dictionary 

In [5]:
# Load the Twitter-generated dictionary.
# NOTE(review): pickle.load can execute arbitrary code, so only use it on
# files you trust.
with open('dictionaries/dictionary_twitter.txt', 'rb') as twitter_file:
    twitter_dict = pickle.load(twitter_file)

In [10]:
# Only the keys of the Twitter dictionary are needed as a vocabulary.
# NOTE(review): presumably the keys are word strings; verify against the
# pickle's contents.
just_keys = list(twitter_dict.keys())

In [23]:
# Characters that disqualify a key: every ASCII punctuation mark. This also
# rules out links, since they contain characters such as ':', '/' and '.'.
invalid_chars = set(string.punctuation)

In [31]:
# Keep only the keys that share no character with invalid_chars.
twitter_dict_eng = [key for key in just_keys if invalid_chars.isdisjoint(key)]

In [32]:
# Classifier that uses the Twitter dictionary
def classify_dictionary_twitter(words):
    """Tag each word as 'EN', 'EMT', 'HI' or 'UN' using dictionary lookups.

    The rules are tried in this order and the first match wins:

    1. 'EN'  -- the word occurs in the module-level ``twitter_dict_eng``.
    2. 'EMT' -- the word contains no ASCII letter.
    3. 'HI'  -- the word occurs in the module-level ``hin_dict``.
    4. 'UN'  -- none of the above.

    Parameters
    ----------
    words : iterable of str
        Tokens to classify.

    Returns
    -------
    list of str
        One tag per input word, in input order.
    """
    # Both vocabularies are lists; converting them once per call turns every
    # membership test below into a constant-time lookup instead of a scan.
    english_vocab = set(twitter_dict_eng)
    hindi_vocab = set(hin_dict)

    preds = []
    for w in words:
        if w in english_vocab:
            preds.append('EN')

        elif re.search('[a-zA-Z]', w) is None:
            preds.append('EMT')

        elif w in hindi_vocab:
            preds.append('HI')

        else:
            preds.append('UN')

    return preds

In [34]:
# Predict tags for both splits with the Twitter + Hindi dictionary classifier.
preds_dict_train_tw = classify_dictionary_twitter(words_train)
preds_dict_test_tw = classify_dictionary_twitter(words_test)

In [35]:
# Score both splits first, then report them.
twitter_train_acc = classification_acc(tags_train, preds_dict_train_tw)
twitter_test_acc = classification_acc(tags_test, preds_dict_test_tw)

print('Training classification accuracy using Twitter dictionaries is:', twitter_train_acc, '%')
print('Test classification accuracy using Twitter dictionaries is:', twitter_test_acc, '%')

Training classification accuracy using Twitter dictionaries is: 60.810282444500984 %
Test classification accuracy using Twitter dictionaries is: 56.6591366084949 %


Accuracy improves by roughly 4–5 percentage points (training: 55.9% → 60.8%, test: 52.3% → 56.7%) when WordNet is replaced by a dictionary better suited to the task.

## Char n-grams

In [4]:
# Download the HI-EN JSON file and parse it.
link = 'http://dasdipankar.com/ICON_NLP_Tool_Contest_2017/HI-EN.json'

# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(link) as js:
    js_raw = js.readlines()

# The response body arrives as bytes; decode each line before joining.
js_str = [by.decode('utf-8') for by in js_raw]

js_as_json = json.loads(''.join(js_str))

In [146]:
# Strip each entry's text and concatenate everything into one corpus string.
# NOTE(review): the entries are joined with an empty separator, so character
# n-grams can span two adjacent entries -- confirm whether a space was
# intended here.
only_text = [str(record['text']).strip() for record in js_as_json]
text_corpus = ''.join(only_text)

In [148]:
def generate_ngram(text_corpus, n):
    """Return the character n-grams of ``text_corpus``.

    Thin wrapper that forwards both arguments to ``word2ngrams``.
    """
    return word2ngrams(text_corpus, n)

In [107]:
def word2ngrams(text, n, exact=True):
    """Convert text into overlapping character n-grams.

    Parameters
    ----------
    text : str
        Text to slice.
    n : int
        Length of each n-gram; must be at least 1.
    exact : bool, optional
        Unused; kept only so existing callers that pass it keep working.

    Returns
    -------
    list of str
        Every length-``n`` substring of ``text`` in order of position. The
        list is empty when ``text`` is shorter than ``n``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive n would otherwise yield empty or truncated slices
        # with no error.
        raise ValueError('n must be a positive integer, got %r' % (n,))

    last_start = len(text) - n
    return [text[start:start + n] for start in range(last_start + 1)]

In [163]:
# Character n-grams of the whole corpus for n = 2, 3, 4 and 5.
char_bigrams, char_trigrams, char_quadgrams, char_pentagrams = (
    generate_ngram(text_corpus, size) for size in (2, 3, 4, 5)
)

In [170]:
# Pool the n-grams of every length into a single list.
char_combined = [
    *char_bigrams,
    *char_trigrams,
    *char_quadgrams,
    *char_pentagrams,
]

In [171]:
# Count every n-gram, then list (ngram, count) pairs from most to least
# frequent. most_common() gives the same ordering as sorting the items by
# count in descending order, and it avoids the `operator` module, which this
# notebook never imports.
ngram_counts = collections.Counter(char_combined)
combined_dict = ngram_counts.most_common()

In [177]:
# Inspect the 500 most frequent character n-grams.
combined_dict[:500]

[('e ', 25454),
 ('i ', 16448),
 ('a ', 15642),
 ('  ', 15446),
 ('ha', 13922),
 (' .', 13561),
 (' k', 12961),
 ('. ', 12898),
 (' h', 12561),
 (' t', 12379),
 (' a', 10800),
 (' . ', 10779),
 (' b', 10481),
 (' s', 9944),
 ('t ', 9935),
 ('n ', 9883),
 ('.  ', 9424),
 ('an', 9339),
 (' .  ', 9319),
 ('in', 9178),
 ('o ', 9066),
 ('r ', 9006),
 ('ar', 8921),
 ('s ', 8879),
 (' m', 8581),
 ('th', 7637),
 ('ai', 7570),
 ('  .', 7504),
 ('hi', 7375),
 ('aa', 6918),
 ('  . ', 6765),
 ('he', 6689),
 ('.  .', 6413),
 (' .  .', 6411),
 ('..', 6165),
 (' p', 6144),
 ('  .  ', 6023),
 ('at', 5893),
 ('y ', 5889),
 ('.  . ', 5769),
 ('d ', 5741),
 ('al', 5723),
 (' i', 5649),
 (' d', 5550),
 ('h ', 5542),
 ('on', 5377),
 ('me', 5377),
 (' th', 5303),
 ('er', 5298),
 ('re', 5185),
 ('ka', 5137),
 ('ma', 5040),
 (' n', 4968),
 (' c', 4944),
 ('ra', 4929),
 ('ch', 4857),
 ('to', 4717),
 (' w', 4694),
 ('is', 4641),
 ('ho', 4605),
 (' ha', 4547),
 ('nd', 4481),
 (' l', 4432),
 ('hai', 4402),
 ('na'