# Description of Notebook

This notebook predicts the risk rating of each record in the World-Check data and writes a CSV file containing the predicted rating for every record, plus one CSV file per risk tier (high, medium, low).

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np
import pickle

# Data Loading

Replace 'sample_data.csv' with actual data when running the code

In [3]:
# Load the input records: tab-separated text, Windows-1252 encoded.
# NOTE(review): the fragment printed under this cell looks like the tail of a
# pandas DtypeWarning (mixed column types) — confirm, and consider passing
# explicit dtypes if so.
df = pd.read_csv('sample_data.csv', sep='\t', encoding='windows-1252')

  interactivity=interactivity, compiler=compiler, result=result)


# Pre-processing steps

Perform all the pre-processing steps such as lemmatizing, lower case conversion, removing stop words & punctuations

In [4]:
def lemmatize_stemming(data):
    """Lemmatize every token of one World-Check record.

    Args:
        data: Text of the record; it is coerced with ``str`` before tokenizing.

    Returns:
        str: The lemmatized tokens, each preceded by one space (a non-empty
        result therefore starts with a space; no tokens gives ``""``).
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return "".join(
        " " + lemmatizer.lemmatize(token) for token in word_tokenize(str(data))
    )

In [5]:
def convert_lower_case(data):
    """Lower-case the text of one World-Check record using NumPy's string routines."""
    lowered = np.char.lower(data)
    return lowered

In [6]:
def remove_stop_words(data):
    """Remove English stop words and one-character tokens from a record.

    Args:
        data: Text of the record; it is coerced with ``str`` before tokenizing.

    Returns:
        str: The surviving tokens, each preceded by one space (``""`` when
        nothing survives).
    """
    # A set gives constant-time membership tests instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    kept = (
        token
        for token in word_tokenize(str(data))
        if token not in stop_words and len(token) > 1
    )
    return "".join(" " + token for token in kept)

In [7]:
def remove_punctuation(data):
    """Replace punctuation in one World-Check record with spaces.

    Each symbol listed below (including the newline) becomes a space, and
    double spaces are collapsed after every symbol. Commas are deleted
    outright at the end rather than replaced by a space.

    Args:
        data: Text (str or NumPy string array) to clean.

    Returns:
        The cleaned text as returned by ``np.char.replace``.
    """
    # The backslash is written as an explicit "\\"; the former bare backslash
    # before "]" was an invalid escape sequence that newer Pythons warn about.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [8]:
def remove_apostrophe(data):
    """Delete every apostrophe character from one World-Check record."""
    apostrophe, replacement = "'", ""
    return np.char.replace(data, apostrophe, replacement)

In [9]:
def preprocess(data):
    """Run the whole cleaning pipeline on one World-Check record.

    Some steps run more than once on purpose: after lemmatizing, stop words
    may show up again, so punctuation and stop-word removal are repeated.

    Args:
        data: Raw text of the record. ``None`` and float NaN (how pandas
            represents an empty cell) are treated as empty text.

    Returns:
        str: The cleaned text; ``""`` for a missing value.
    """
    # np.char.lower raises on non-string input, so short-circuit missing cells.
    if data is None or (isinstance(data, float) and data != data):
        return ""
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = lemmatize_stemming(data)
    data = remove_punctuation(data)
    data = lemmatize_stemming(data)
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    return data

Add a new column holding the cleaned text of World-Check's 'FURTHER INFORMATION' column; this cleaned text is what we use to predict the risk rating.

In [10]:
# Clean every 'FURTHER INFORMATION' cell with the preprocess() pipeline.
df['processed_furtherinfo'] = df['FURTHER INFORMATION'].apply(preprocess) 

# Load necessary Pickle files

In [13]:
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('high_list.pickle', 'rb') as handle:
    high = pickle.load(handle)

# Keep only the first element (the word) of every entry in the pickled list.
high_crime = [entry[0] for entry in high]

# Manual additions to the high-tier vocabulary.
high_crime.extend(['victim', 'labor'])

# Drop 'report' when present; the guard avoids a ValueError if the pickled
# list does not contain it.
if 'report' in high_crime:
    high_crime.remove('report')

In [14]:
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('med_list.pickle', 'rb') as handle:
    med = pickle.load(handle)

# Keep only the first element (the word) of every entry in the pickled list.
med_crime = [entry[0] for entry in med]

# Manual addition to the medium-tier vocabulary.
med_crime.append('license')

# Drop 'report' when present; the guard avoids a ValueError if the pickled
# list does not contain it.
if 'report' in med_crime:
    med_crime.remove('report')

In [15]:
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open('low_list.pickle', 'rb') as handle:
    low = pickle.load(handle)

# Keep only the first element (the word) of every entry in the pickled list.
low_crime = [entry[0] for entry in low]

# Manual addition to the low-tier vocabulary.
low_crime.append('trespass')

# Drop 'report' when present; the guard avoids a ValueError if the pickled
# list does not contain it.
if 'report' in low_crime:
    low_crime.remove('report')

In [16]:
# Load the pickled high-tier embedding that record embeddings are compared to.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('high_embed.pickle', 'rb') as handle:
    high_embed = pickle.load(handle)

In [17]:
# Load the pickled medium-tier embedding that record embeddings are compared to.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('med_embed.pickle', 'rb') as handle:
    med_embed = pickle.load(handle)

In [18]:
# Load the pickled low-tier embedding that record embeddings are compared to.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('low_embed.pickle', 'rb') as handle:
    low_embed = pickle.load(handle)

In [19]:
# Load the pickled lookup that get_embed indexes by word to obtain a weight.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('tfidf_dict.pickle', 'rb') as handle:
    dictionary = pickle.load(handle)

## Load pre-trained word2vec model from Google

In [20]:
# Load the word vectors from the binary word2vec-format file into gensim.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [21]:
def tokenize(data):
    """Split a text into its unique tokens.

    Args:
        data: Text to tokenize (e.g. a cleaned 'further information' value).

    Returns:
        list: Each distinct token once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is the same on every run (a set's order can vary between runs).
    return list(dict.fromkeys(word_tokenize(data)))

# Unique tokens of each cleaned record.
df['tokens'] = df.processed_furtherinfo.apply(tokenize)

## Compare the tokenized words with words attached to each tier to determine the match of tier

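This method of predicting the risk tier did not yield acceptable results, so we decided not to use it for the final rating. Note that the matched-word columns created here are still used as input to the later scoring and similarity steps.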

In [24]:
def highcheck(data):
    """Return the tokens of ``data`` found in ``high_crime``, each prefixed by a space."""
    return "".join(" " + token for token in data if token in high_crime)

# Space-joined tokens of each record that appear in the high-tier word list.
df['high_severe'] = df.tokens.apply(highcheck)

In [25]:
def medcheck(data):
    """Return the tokens of ``data`` found in ``med_crime``, each prefixed by a space."""
    return "".join(" " + token for token in data if token in med_crime)

# Space-joined tokens of each record that appear in the medium-tier word list.
df['med_severe'] = df.tokens.apply(medcheck)

In [35]:
def lowcheck(data):
    """Return the tokens of ``data`` found in ``low_crime``, each prefixed by a space."""
    return "".join(" " + token for token in data if token in low_crime)

# Space-joined tokens of each record that appear in the low-tier word list.
df['low_severe'] = df.tokens.apply(lowcheck)

## Get the score of each matched word from Tf-idf vector and add the scores to get the final score

This method was more accurate than the previous one, but it still did not yield the expected results, so we decided not to use it either.

In [27]:
def high_weight(data):
    """Sum the scores of the high-tier words matched in one record.

    Args:
        data: Space-separated matched words (a ``high_severe`` value).

    Returns:
        The sum of the scores that ``high`` holds for the distinct words in
        ``data``; ``0`` when there are none. A word without an entry in
        ``high`` (for example one appended to the word list by hand) adds 0
        instead of raising ``StopIteration``.
    """
    # Build a word -> score lookup; setdefault keeps the first entry for a
    # repeated word, matching a first-match scan of the list.
    scores = {}
    for crime, score in high:
        scores.setdefault(crime, score)
    return sum(scores.get(word, 0) for word in tokenize(data))

# Total score of the high-tier words matched in each record.
df['high_score'] = df.high_severe.apply(high_weight)

In [28]:
def med_weight(data):
    """Sum the scores of the medium-tier words matched in one record.

    Args:
        data: Space-separated matched words (a ``med_severe`` value).

    Returns:
        The sum of the scores that ``med`` holds for the distinct words in
        ``data``; ``0`` when there are none. A word without an entry in
        ``med`` (for example one appended to the word list by hand) adds 0
        instead of raising ``StopIteration``.
    """
    # Build a word -> score lookup; setdefault keeps the first entry for a
    # repeated word, matching a first-match scan of the list.
    scores = {}
    for crime, score in med:
        scores.setdefault(crime, score)
    return sum(scores.get(word, 0) for word in tokenize(data))

# Total score of the medium-tier words matched in each record.
df['med_score'] = df.med_severe.apply(med_weight)

In [36]:
def low_weight(data):
    """Sum the scores of the low-tier words matched in one record.

    Args:
        data: Space-separated matched words (a ``low_severe`` value).

    Returns:
        The sum of the scores that ``low`` holds for the distinct words in
        ``data``; ``0`` when there are none. A word without an entry in
        ``low`` (for example one appended to the word list by hand) adds 0
        instead of raising ``StopIteration``.
    """
    # Build a word -> score lookup; setdefault keeps the first entry for a
    # repeated word, matching a first-match scan of the list.
    scores = {}
    for crime, score in low:
        scores.setdefault(crime, score)
    return sum(scores.get(word, 0) for word in tokenize(data))

# Total score of the low-tier words matched in each record.
df['low_score'] = df.low_severe.apply(low_weight)

# Get the word embedding of each record and compare it with the generated word embedding of each risk tier using cosine similarity

Generate the tf-idf weighted word embedding by averaging the word embedding for each record

In [30]:
def get_embed(data):
    """Build the weighted average word embedding of one text.

    Every distinct word contributes ``dictionary[word] * model.get_vector(word)``;
    a word missing from either lookup contributes a zero row, and the rows
    are averaged over all distinct words.

    Args:
        data: Space-separated words to embed.

    Returns:
        numpy.ndarray of length 300, or ``np.nan`` when the text has no words
        or none of them produced a non-zero vector.
    """
    embedding_dim = 300  # row width of the word-vector matrix
    wordlist = tokenize(data)
    if not wordlist:
        # Nothing to average; skip np.mean's empty-slice warning.
        return np.nan

    wordvecs = np.zeros((len(wordlist), embedding_dim))
    for i, w in enumerate(wordlist):
        try:
            wordvecs[i, :] = dictionary[w] * model.get_vector(w.lower())
        except KeyError:
            # No weight or no vector for this word: its row stays all zeros.
            continue

    sentence = np.mean(wordvecs, 0)
    # np.any tests for a non-zero vector; comparing the component sum to 0
    # would also reject vectors whose entries happen to cancel out.
    if np.any(sentence):
        return sentence
    # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
    return np.nan

Functions to compute the cosine similarity between each record and each tier

In [31]:
def high_sim(data):
    """Cosine similarity between the embedding of ``data`` and ``high_embed``; 0 if it is NaN."""
    record_vec = get_embed(data)
    numerator = np.dot(record_vec, high_embed)
    denominator = np.linalg.norm(record_vec) * np.linalg.norm(high_embed)
    similarity = numerator / denominator
    return 0 if np.isnan(np.sum(similarity)) else similarity

# Similarity of each record's matched high-tier words to the high-tier embedding.
df['high_similarity'] = df.high_severe.apply(high_sim)

  out=out, **kwargs)


In [32]:
def med_sim(data):
    """Cosine similarity between the embedding of ``data`` and ``med_embed``; 0 if it is NaN."""
    record_vec = get_embed(data)
    numerator = np.dot(record_vec, med_embed)
    denominator = np.linalg.norm(record_vec) * np.linalg.norm(med_embed)
    similarity = numerator / denominator
    return 0 if np.isnan(np.sum(similarity)) else similarity

# Similarity of each record's matched medium-tier words to the medium-tier embedding.
df['med_similarity'] = df.med_severe.apply(med_sim)

In [37]:
def low_sim(data):
    """Cosine similarity between the embedding of ``data`` and ``low_embed``; 0 if it is NaN."""
    record_vec = get_embed(data)
    numerator = np.dot(record_vec, low_embed)
    denominator = np.linalg.norm(record_vec) * np.linalg.norm(low_embed)
    similarity = numerator / denominator
    return 0 if np.isnan(np.sum(similarity)) else similarity

# Similarity of each record's matched low-tier words to the low-tier embedding.
df['low_similarity'] = df.low_severe.apply(low_sim)

 ## Decide Risk tier by assigning Risk Rating of the highest score for each record

In [38]:
# Name of the score column holding the largest value in each row.
# NOTE(review): on ties idxmax picks the first column listed, so a row whose
# three scores are all 0 (no matched words) ends up as 'high_score' — confirm
# that unmatched records should be rated high.
df['based_on_score']=df[['high_score', 'med_score', 'low_score']].idxmax(1)

In [39]:
# Name of the similarity column holding the largest value in each row.
# NOTE(review): on ties idxmax picks the first column listed, so a row whose
# three similarities are all 0 ends up as 'high_similarity' — confirm that
# unmatched records should be rated high.
df['based_on_similarity']=df[['high_similarity', 'med_similarity', 'low_similarity']].idxmax(1)

Map the winning column names to risk-tier labels (high / medium / low)

In [40]:
def change_name(data):
    """Translate a score/similarity column name into its tier label.

    Names that are not one of the six known columns are returned unchanged.
    """
    tier_by_column = {
        'high_score': 'high',
        'med_score': 'medium',
        'low_score': 'low',
        'high_similarity': 'high',
        'med_similarity': 'medium',
        'low_similarity': 'low',
    }
    return tier_by_column.get(data, data)

# Convert the winning column names into tier labels for both methods.
df['rating_based_on_weight']=df.based_on_score.apply(change_name)
df['rating_based_on_similarity']=df.based_on_similarity.apply(change_name)

Store the risk calculated world-check data into a csv file

In [42]:
# Write the full frame, including all rating columns, without the row index.
df.to_csv('riskcaluclated_worldcheck_new.csv', index = False)

In [None]:
# Records rated 'high' by the similarity method, written to their own file.
# NOTE(review): unlike the full export, this call does not pass index=False,
# so the row index is written as an extra column — confirm this is intended.
df1 = df[df['rating_based_on_similarity'] == 'high']
df1.to_csv('high_rated_worldcheck.csv')

In [None]:
# Records rated 'medium' by the similarity method, written to their own file.
# NOTE(review): unlike the full export, this call does not pass index=False,
# so the row index is written as an extra column — confirm this is intended.
df2 = df[df['rating_based_on_similarity'] == 'medium']
df2.to_csv('medium_rated_worldcheck.csv')

In [None]:
# Records rated 'low' by the similarity method, written to their own file.
# NOTE(review): unlike the full export, this call does not pass index=False,
# so the row index is written as an extra column — confirm this is intended.
df3 = df[df['rating_based_on_similarity'] == 'low']
df3.to_csv('low_rated_worldcheck.csv')