In [1]:
import nltk
nltk.download('averaged_perceptron_tagger')
import pickle as pk
import pandas as pd
from gensim.models import KeyedVectors

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nimori/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [3]:
categories={'food':['ration', 'shop', 'community', 'kitchen'], 'jobs':['training', 'professional', 'job'], 
           'money':['invest','save','bank','donation'],
           'utilities':['internet', 'phone', 'electricity', 'water', 'landlord', 'hotel', 'shelter', 'lpg', 'waste'],
           'medical':['hospitals', 'facilities', 'specialists', 'blood'],
           'education':['school', 'college', 'tuitions', 'career', 'consultations'],
           'medical':['hospitals', 'facilities', 'specialists', 'blood'], 'security':['police', 'theft', 'army', 'guard'],
           'infrastructure':['road', 'bridge', 'sewage', 'traffic'],
           'buy':['shopkeeper', 'land', 'apartment', 'furniture', 'electronics', 'rental'],
           'sell':['shopkeeper', 'land', 'apartment', 'furniture', 'electronics', 'rental'],
           'government':['schemes', 'corruption'], 'politics':['politics'],
           'emergency':['covid', 'blood', 'robbery', 'crime'],
           'travel':['transport', 'cab', 'public', 'auto', 'hotel', 'traffic', 'tourism', 'tolls'],
           'services':['business', 'legal', 'accountant', 'carpenter', 'mechanic', 'electrician', 'plumber', 'house', 'help', 'labour'],
           'other':['parking', 'women', 'human', 'rights', 'consumer', 'sanitation'], 'technology':['technology'], 'environment':['environment', 'animals']}

In [5]:
sent= pk.load(open('translated.pk', 'rb'))
#sent would be a list in which each element is another list that comprises of the following 3 things:
##1) an ID
##2) original text
##3) translated text (in the form of another list)
                            
                             ######Example shown below########
###[1601345851000,'मुख्यमंत्री गहलोत बोले-प्रदेश में जब भी कोई नया जिला बनेगा तो सबसे पहले ब्यावर का होगा नाम',
###  ['Chief Minister Gehlot said - whenever a new district is formed in the state, Beawar will be named first.']]

In [6]:
text= []
for i in range(len(sent)):
    text.append(sent[i][2]) #to select just the translated text
#text

## Data Preprocessing

In [7]:
from nltk.tokenize import sent_tokenize
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
def get_unigram(text):
    '''
    preprocessing: tokenization; stemming
    '''
    tt=[]
    for j in text:
        j=re.sub(r"\W", " ",j, flags=re.I).strip()
        
        for i in j.split():
#             i=ps.stem(i.lower())
            i=i.lower()
            
            if i.isdigit() or len(i)<=2:
                continue
            
            if i in stopwords.words('english'):
                continue
            else:
                tt.append(i)
    return list(set(tt))

[nltk_data] Downloading package stopwords to /home/nimori/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
text2= []
for txt in text:
    sentence= ' '.join([str(elem) for elem in txt])
    tagged_sentence = nltk.tag.pos_tag(sentence.split())
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    sent_final= ' '.join(edited_sentence)
    text2.append([sent_final])
    #print(' '.join(edited_sentence))
#text2

## Sentence Tokenization

In [10]:
tokens= []
for txt in text2:
    tokens.append(get_unigram(txt))
#tokens

## Determining Cosine Similarity using Word Vectors

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## POC for first token

In [12]:
tk={}
for i in tokens[0]:

        vec1=google_word2vec[i]
        cat = {}
        for j in categories.keys():
            vec2=google_word2vec[j]
            sim = cosine_similarity(vec1.reshape(1, -1),vec2.reshape(1, -1))
            if sim>0.1:
                cat[j]=sim[0][0]
        categ=sorted(cat.items(),key=lambda x:x[1])[::-1]
        if categ!=[]:
            for k in categories[categ[0][0]]:
                print(k)
            tk[i]=categ[0][0]
#tk

environment
animals
covid
blood
robbery
crime
technology
technology
schemes
corruption
school
college
tuitions
career
consultations
parking
women
human
rights
consumer
sanitation


In [15]:
cat= list(categories.keys())
#cat[0]
#tk

In [16]:
col= []
for cat in tk.values():
    vec1=google_word2vec[cat]
    for token in tk.keys():
        vec2=google_word2vec[token]
        row= {}
        for subcat in categories.get(cat):
            row[subcat]= cosine_similarity(vec1.reshape(1, -1),vec2.reshape(1, -1)).tolist()
    col.append(row)
col

[{'environment': [[0.08332595229148865]], 'animals': [[0.08332595229148865]]},
 {'covid': [[0.20476628839969635]],
  'blood': [[0.20476628839969635]],
  'robbery': [[0.20476628839969635]],
  'crime': [[0.20476628839969635]]},
 {'technology': [[0.0332690067589283]]},
 {'technology': [[0.0332690067589283]]},
 {'schemes': [[0.09642307460308075]], 'corruption': [[0.09642307460308075]]},
 {'school': [[0.0464131236076355]],
  'college': [[0.0464131236076355]],
  'tuitions': [[0.0464131236076355]],
  'career': [[0.0464131236076355]],
  'consultations': [[0.0464131236076355]]},
 {'parking': [[0.2472761571407318]],
  'women': [[0.2472761571407318]],
  'human': [[0.2472761571407318]],
  'rights': [[0.2472761571407318]],
  'consumer': [[0.2472761571407318]],
  'sanitation': [[0.2472761571407318]]}]

In [17]:
li_0= []
li= []
for dic in col:
    li_0.append(list(dic.values()))
    li.append(li_0[0][0][0])