# Description of Notebook

This notebook builds TF-IDF vectors from scraped Wikipedia pages and groups the pages into predefined risk tiers. It then computes a weighted word embedding for each risk tier by scaling each word's vector by its score and averaging the scaled vectors.

In [42]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from gensim.models import KeyedVectors
import pickle

# Data loading

Load the processed wikipedia file

In [16]:
# Read the pre-processed Wikipedia pages from a CSV in the working directory.
# Later cells read the 'processed_content' and 'crime' columns of this frame.
wiki = pd.read_csv('processed_wiki.csv')

# Tf-Idf Vector

Generate a TF-IDF vector for each crime-related page and create a dictionary that maps every vocabulary word to its IDF score

In [17]:
# Fit the vectorizer once: fit_transform() learns the vocabulary and IDF
# weights and returns the document-term matrix in a single pass.
tf_idf_vect = TfidfVectorizer(stop_words=None)
final_tf_idf = tf_idf_vect.fit_transform(wiki['processed_content'])

# fit() returns the vectorizer itself, so fitting a second time on the same
# corpus only repeats work. Alias the already-fitted object instead.
tfidf_weight = tf_idf_vect

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
_feature_names = getattr(tf_idf_vect, 'get_feature_names_out', None) or tf_idf_vect.get_feature_names
tfidf_feat = list(_feature_names())

# Map every vocabulary word to its inverse-document-frequency score.
dictionary = dict(zip(tfidf_feat, tfidf_weight.idf_))

Unpacks the sparse TF-IDF matrix into one word → weight mapping per page

In [18]:
def unpack_word_weight(vect, word_weight):
    """Turn a sparse document-term matrix into one ``Counter`` per document.

    Parameters
    ----------
    vect : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``; column ``j`` of ``word_weight``
        corresponds to the ``j``-th returned name.
    word_weight : scipy sparse matrix
        Document-term weights, one row per document. Any sparse format that
        offers ``tocsr()`` is accepted.

    Returns
    -------
    list of collections.Counter
        ``result[i]`` maps each word stored in row ``i`` to its weight.
        Rows without stored entries yield an empty ``Counter``.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
    feature_names = np.asarray(get_names())

    # The row slicing below relies on the CSR layout (data/indices/indptr).
    if hasattr(word_weight, "tocsr"):
        word_weight = word_weight.tocsr()

    data = word_weight.data
    indptr = word_weight.indptr
    indices = word_weight.indices
    n_docs = word_weight.shape[0]

    word_weight_list = []
    for i in range(n_docs):
        # indptr[i]:indptr[i + 1] delimits the stored entries of row i.
        doc = slice(indptr[i], indptr[i + 1])
        weights, idx = data[doc], indices[doc]
        word_weight_list.append(Counter(dict(zip(feature_names[idx], weights))))

    return word_weight_list

Extracts the top_n highest-weighted words for a given crime page

In [19]:
def get_top_words(wiki, name, column_name, top_n = 30):
    """Return the highest-weighted words of one crime page.

    Parameters
    ----------
    wiki : pandas.DataFrame
        Must contain a ``'crime'`` column and the column ``column_name``.
    name : str
        Value of ``wiki['crime']`` identifying the page; the first matching
        row is used.
    column_name : str
        Column holding one ``collections.Counter`` (word -> weight) per row.
    top_n : int or None, default 30
        Number of words to return; ``None`` returns every word.

    Returns
    -------
    list of (word, weight) tuples
        Ordered from highest to lowest weight.

    Raises
    ------
    IndexError
        If no row has ``crime == name``.
    """
    matches = wiki.loc[wiki['crime'] == name, column_name]
    if matches.empty:
        raise IndexError('no row in wiki with crime == {!r}'.format(name))

    # Positional access returns the first match even when index labels repeat,
    # and avoids converting the whole selection to a dict to read one value.
    word_weight_dict = matches.iloc[0]

    limit = len(word_weight_dict) if top_n is None else top_n
    return word_weight_dict.most_common(limit)

In [20]:
# Attach one word -> weight Counter per page as a new 'tfidf_weight' column;
# get_top_words() reads this column in the tier cells below.
wiki['tfidf_weight'] = unpack_word_weight(tf_idf_vect, final_tf_idf)

Generates a list of words and score for all the crimes that are defined in the High Risk tier

In [39]:
# Gather the top TF-IDF words of the first nine crime pages (high-risk tier).
high = []
for crime in wiki['crime'].iloc[:9]:
    high.extend(get_top_words(wiki, name=crime, column_name='tfidf_weight'))

# Sort ascending by the weight itself. The previous int() key truncated every
# weight below 1.0 to 0, so nothing was actually sorted. dict() keeps the last
# value per key, so a word found on several pages retains its highest weight.
high = list(dict(sorted(high, key=lambda pair: pair[1])).items())

# Tier vocabulary: the unique words plus hand-picked additions, sorted so the
# list order is the same on every run.
high_crime = sorted({word for word, _ in high} | {'victim', 'labor'})

Generates a list of words and score for all the crimes that are defined in the Medium Risk tier

In [26]:
# Gather the top TF-IDF words of crime pages 9..15 (medium-risk tier).
med = []
for crime in wiki['crime'].iloc[9:16]:
    med.extend(get_top_words(wiki, name=crime, column_name='tfidf_weight'))

# Sort ascending by the weight itself. The previous int() key truncated every
# weight below 1.0 to 0, so nothing was actually sorted. dict() keeps the last
# value per key, so a word found on several pages retains its highest weight.
med = list(dict(sorted(med, key=lambda pair: pair[1])).items())

# Tier vocabulary: the unique words plus the hand-picked addition, sorted so
# the list order is the same on every run. The de-duplicated list is stored
# under med_crime (the name the embedding cell reads); previously it was only
# assigned to the misspelled mid_crime, so duplicates reached get_embed().
med_crime = sorted({word for word, _ in med} | {'license'})

# Keep the old name available for any code that still refers to it.
mid_crime = list(med_crime)

Generates a list of words and score for all the crimes that are defined in the Low Risk tier

In [27]:
# Gather the top TF-IDF words of the remaining crime pages (low-risk tier).
low = []
for crime in wiki['crime'].iloc[16:]:
    low.extend(get_top_words(wiki, name=crime, column_name='tfidf_weight'))

# Sort ascending by the weight itself. The previous int() key truncated every
# weight below 1.0 to 0, so nothing was actually sorted. dict() keeps the last
# value per key, so a word found on several pages retains its highest weight.
low = list(dict(sorted(low, key=lambda pair: pair[1])).items())

# Tier vocabulary: the unique words plus the hand-picked addition, sorted so
# the list order is the same on every run.
low_crime = sorted({word for word, _ in low} | {'trespass'})

# Word Embeddings

Load the pre-trained Google News word2vec model

In [32]:
# Load word2vec vectors stored in binary format from a file in the working
# directory. get_embed() below looks words up in this model.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

Generate the weighted word embedding for each tier: every word vector is scaled by its score from the TF-IDF dictionary, and the scaled vectors are averaged

In [33]:
def get_embed(wordlist):
    """Average the score-weighted word vectors of ``wordlist``.

    Each word's vector from the global ``model`` is multiplied by the word's
    score in the global ``dictionary``. Words missing from either lookup
    contribute an all-zero row, so they still count towards the mean's
    denominator (this matches the historical output of this notebook).

    Parameters
    ----------
    wordlist : sequence of str
        Words describing one tier.

    Returns
    -------
    numpy.ndarray or float
        The mean vector, or ``np.nan`` when ``wordlist`` is empty or no word
        produced a non-zero vector.
    """
    if len(wordlist) == 0:
        # Averaging zero rows would only yield an all-NaN array and a warning.
        return np.nan

    # Use the model's own dimensionality; 300 is the historical default.
    dim = getattr(model, 'vector_size', 300)
    wordvecs = np.zeros((len(wordlist), dim))

    for i, w in enumerate(wordlist):
        try:
            # Weighted word embedding: score * vector.
            wordvecs[i, :] = dictionary[w] * model.get_vector(w.lower())
        except KeyError:
            # Unknown to the dictionary or to the model: leave the row at zero.
            continue

    # Average the weighted embeddings over the whole tier.
    sentence = wordvecs.mean(axis=0)

    # Test for any non-zero component; a sum can cancel to 0 for a valid
    # vector. np.nan replaces np.NaN, which NumPy 2.0 removed.
    return sentence if np.any(sentence) else np.nan

Word embedding for High Tier

In [38]:
# Compute the embedding of the high-tier word list; the bare name on the last
# line makes the notebook display the result.
high_embed = get_embed(high_crime)

high_embed

array([ 1.66563795e-02,  2.75639851e-05,  6.58133110e-03,  1.15362381e-01,
       -1.12263474e-01, -1.67724720e-02, -1.71447354e-02, -1.21672825e-01,
        1.90876849e-01,  1.07547933e-01,  8.18550745e-02, -2.25967544e-01,
       -8.86399348e-02,  2.55772859e-01, -2.57462823e-01,  1.59201063e-01,
        6.99797672e-03,  1.32715057e-01,  4.01658587e-02, -2.07034232e-01,
        9.99496008e-02, -8.03316605e-02,  1.75000246e-01,  4.20258883e-02,
        4.32338846e-02, -1.93033840e-01, -1.22409593e-01,  6.50572766e-02,
        1.24992818e-01, -8.22591655e-02,  5.03577405e-02, -2.93653618e-01,
       -8.80609681e-02, -4.30972513e-02, -7.32624135e-02, -4.50813845e-02,
        4.17779185e-02,  2.48188630e-02,  1.14958005e-01,  1.10998112e-01,
        1.30344258e-01, -3.89893697e-04,  1.72952345e-01,  3.13931934e-02,
       -1.04482282e-01, -1.89876289e-01, -4.23026039e-02,  3.96798439e-02,
       -1.75049056e-01,  1.38373802e-01,  3.23640238e-02, -3.58440592e-02,
       -2.42092773e-02, -

Word embedding for Medium Tier

In [35]:
# Compute the embedding of the medium-tier word list; the bare name on the
# last line makes the notebook display the result.
med_embed = get_embed(med_crime)

med_embed

array([ 5.81950353e-02,  6.41510151e-02,  1.78538812e-02,  5.49157229e-02,
       -1.00157124e-01, -2.90225959e-02,  1.21040946e-01, -8.35936574e-02,
        2.33950267e-01,  7.15818187e-02, -7.41720208e-02, -1.19445576e-01,
       -4.16331461e-02,  1.57277056e-01, -3.07686760e-01,  2.85469424e-01,
        2.79092422e-02,  1.47258002e-01,  7.35482761e-03, -8.04371123e-02,
        7.44380707e-02, -7.91806176e-02,  1.40983978e-01,  8.48905213e-02,
        1.36049820e-01, -1.11672723e-01, -1.50874908e-01,  1.53289936e-01,
       -2.55875508e-02, -7.13277213e-02, -9.41228135e-03, -8.16996907e-02,
       -7.80824931e-02,  2.02443865e-02, -7.52497393e-02, -9.20047766e-02,
        1.27372687e-01,  1.55660712e-02,  1.40367984e-01,  1.62658955e-03,
        1.00181413e-01,  1.57151474e-02,  2.04447676e-01, -1.10253238e-01,
       -2.05263879e-01, -3.04302786e-01, -5.78033382e-02,  6.35607378e-02,
       -1.63857283e-01,  5.29697619e-02,  7.03517368e-02, -3.36516679e-04,
       -1.60566128e-03, -

Word embedding for Low Tier

In [40]:
# Compute the embedding of the low-tier word list; the bare name on the last
# line makes the notebook display the result.
low_embed = get_embed(low_crime)

low_embed

array([ 0.14813593, -0.02707243, -0.02890444,  0.09945933, -0.09557853,
        0.10187331,  0.09719155, -0.06948838,  0.20442467,  0.03834116,
       -0.00484999, -0.19836949, -0.06840986,  0.16700958, -0.3069031 ,
        0.21861507,  0.01980748,  0.19674337, -0.0454859 , -0.21173372,
        0.09618989, -0.02171794,  0.10167556, -0.01472529,  0.08610058,
       -0.15383089, -0.14833613,  0.15375867,  0.13167516, -0.07155647,
       -0.02327728, -0.17658005, -0.11016174,  0.04476408, -0.02369752,
       -0.1149169 ,  0.0670832 , -0.01809969,  0.10790066,  0.08747662,
        0.03687009,  0.05455543,  0.15199049,  0.06435531, -0.08663739,
       -0.23440654, -0.07179636,  0.10926264, -0.13490207,  0.10765513,
       -0.06220825,  0.1314082 , -0.05101376, -0.08854521,  0.00618495,
        0.040091  , -0.15783443, -0.20731361,  0.02551038, -0.14265999,
        0.00916975,  0.07034876, -0.142984  , -0.01773321, -0.11610693,
       -0.03240588, -0.20385974,  0.1245672 , -0.0071524 ,  0.09

Pickle all the necessary artifacts

In [41]:
# Persist the word -> score dictionary; the file is overwritten if it exists.
with open('tfidf_dict.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle)

In [33]:
# Persist the high-tier (word, weight) pairs; note this is `high`, not the
# `high_crime` word list.
with open('high_list.pickle', 'wb') as handle:
    pickle.dump(high, handle)

In [34]:
# Persist the medium-tier (word, weight) pairs; note this is `med`, not the
# `med_crime` word list.
with open('med_list.pickle', 'wb') as handle:
    pickle.dump(med, handle)

In [35]:
# Persist the low-tier (word, weight) pairs; note this is `low`, not the
# `low_crime` word list.
with open('low_list.pickle', 'wb') as handle:
    pickle.dump(low, handle)

In [36]:
# Persist the high-tier embedding returned by get_embed().
with open('high_embed.pickle', 'wb') as handle:
    pickle.dump(high_embed, handle)

In [37]:
# Persist the medium-tier embedding returned by get_embed().
with open('med_embed.pickle', 'wb') as handle:
    pickle.dump(med_embed, handle)

In [38]:
# Persist the low-tier embedding returned by get_embed().
with open('low_embed.pickle', 'wb') as handle:
    pickle.dump(low_embed, handle)