# Web page & policy tagging

## Table of Contents
* [Install packages](#first-bullet)
* [Read in data](#second-bullet)
* [Preprocessing](#third-bullet)
* [Installing word to vec model](#fourth-bullet)

## 1) Install packages <a class="anchor" id="first-bullet"></a>

In [None]:
import pandas as pd
import numpy as np

#read in data
import os
import json

#preprocessing
import re, string

#tagging webpages to words
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import gensim
from gensim.models.keyedvectors import KeyedVectors
import copy



## 2) Read in data <a class="anchor" id="second-bullet"></a>

Read in web pages as json file, read in policy codes in csv format.

### Read in web pages

In [None]:
# Parse the scraped pages from disk. The `with` block closes the file on exit,
# so no explicit close() call is needed.
with open(file="html_data_2.json", encoding="utf-8") as jsonFile:
    jsonObject = json.load(jsonFile)

In [None]:
# Pull the page records stored under the 'html_list' key and load them into a
# DataFrame (presumably one record per web page -- confirm against the JSON).
data = jsonObject['html_list']

df_webpages = pd.DataFrame(data)

In [None]:
df_webpages.shape

In [None]:
df_webpages.columns

### Read in policy codes

In [None]:
# The policy-code file is semicolon-delimited rather than comma-delimited.
df_policy_codes = pd.read_csv('policy_codes.csv', sep=";")

In [None]:
df_policy_codes.shape

In [None]:
df_policy_codes.columns

## 3) Preprocessing <a class="anchor" id="third-bullet"></a>

Preprocessing of web pages and policy codes

In [None]:
# Keep only the 'html' column; this is the input to the text preprocessing.
raw_data = df_webpages.loc[:,'html']

In [None]:
# NOTE(review): the re-indexed Series is not assigned back, so this cell only
# displays it; `raw_data` itself keeps its original index. Confirm whether
# `raw_data = raw_data.reset_index(drop=True)` was intended.
raw_data.reset_index(drop=True)

### Removing punctuation and numbers, lowercasing, ...

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_ASCII_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Reduce raw HTML/text to lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML-like tags (``<...>`` on a single line) with a space, so
         that words in adjacent elements (``</li><li>``) are not glued together;
      3. replace ASCII punctuation with a space;
      4. delete any remaining character that is neither a word character nor
         whitespace (e.g. typographic quotes);
      5. replace digits with a space;
      6. collapse whitespace runs and strip both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; ``""`` when nothing but markup, punctuation,
        digits or whitespace was present.
    """
    text = text.lower()
    text = _HTML_TAG_RE.sub(' ', text)
    text = _ASCII_PUNCT_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # A single final collapse is enough: every earlier step only removes
    # characters or inserts spaces.
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
# Clean every raw HTML document, keeping the results in the same order.
data = list(map(preprocess_text, raw_data))

In [None]:
# Store the cleaned texts next to the raw pages; the list is assigned by
# position, one entry per row.
df_webpages['text_preprocessed']=data

## 4) Installing word to vec model <a class="anchor" id="fourth-bullet"></a>

Word to vec model from: http://vectors.nlpl.eu/repository/

### From CoNLL17 corpus to gensim word2vec (ONLY RUN 1 TIME)

In [None]:
'''col_names = ["word"]
for i in range(0,100):
    col_names.append("vec" + str(i))'''

In [None]:
#embedded_dic = pd.read_csv('model.txt', skiprows=1, sep=' ', encoding='latin-1', names = col_names, index_col=False)

In [None]:
#embedded_dic['word'] = embedded_dic['word'].str.encode('latin-1')

In [None]:
#embedded_dic['word'] = embedded_dic['word'].str.decode('utf-8', errors='ignore') 

In [None]:
#embedded_dic.index = embedded_dic['word']

In [None]:
#embedded_dic = embedded_dic.drop(columns = 'word')

In [None]:
'''np.savetxt('embedded_dic_english.txt', embedded_dic.reset_index().values, 
           delimiter=" ", 
           header="{} {}".format(len(embedded_dic), len(embedded_dic.columns)),
           comments="",
           fmt=["%s"] + ["%.18e"]*len(embedded_dic.columns), encoding = 'utf-8')''' #save the model as a model you can use with gensim

### Create similarity measure between documents

In [None]:
# English stopwords from NLTK, used as the default set of ignored tokens.
stopW = set(stopwords.words('english'))
# str.translate table that deletes every ASCII punctuation character.
punctuation_map = dict((ord(char), None) for char in string.punctuation)


class DocSimV1(object):
    """Score document similarity as the cosine between mean word vectors.

    Args:
        w2v_model: Word-vector lookup; ``w2v_model[word]`` must return the
            vector of ``word`` and raise ``KeyError`` for unknown words.
        stopwords: Collection of lower-case tokens that are ignored.
        remove_punctuation_map: ``str.translate`` table applied to each token.
    """

    def __init__(self, w2v_model , stopwords=stopW , remove_punctuation_map=punctuation_map):
        self.w2v_model = w2v_model
        self.stopwords = stopwords
        # Keep the table the caller passed in (not the module-level default).
        self.remove_punctuation_map = remove_punctuation_map

    def vectorize(self, doc):
        """Return the mean vector of the known, non-stopword words in ``doc``.

        Tokens are split on any whitespace, stripped of punctuation with the
        instance's translation table, and dropped when they are empty or when
        the token is a stopword either before or after punctuation removal
        (so both "don't" and "the," are filtered).

        Returns:
            The element-wise mean of the word vectors, or a scalar NaN when no
            token is found in the model.
        """
        word_vecs = []
        for token in doc.lower().split():
            word = token.translate(self.remove_punctuation_map)
            if not word or token in self.stopwords or word in self.stopwords:
                continue
            try:
                word_vecs.append(self.w2v_model[word])
            except KeyError:
                # Ignore words that do not exist in the vocabulary.
                continue

        if not word_vecs:
            # Same value np.mean([]) would give, without its RuntimeWarning.
            return np.float64('nan')

        # The document vector is assumed to be the mean of its word vectors.
        return np.mean(word_vecs, axis=0)

    def _cosine_sim(self, vecA, vecB):
        """Return the cosine similarity of two vectors, or 0 if undefined.

        The similarity is undefined when either vector is all zeros or
        contains NaN (e.g. the NaN returned by ``vectorize`` for a document
        without known words).
        """
        denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
        # Guard before dividing so 0/0 and NaN inputs do not emit warnings.
        if denom == 0 or np.isnan(denom):
            return 0
        csim = np.dot(vecA, vecB) / denom
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, source_doc, target_docs=None, threshold=0):
        """Score ``source_doc`` against each target document.

        Args:
            source_doc: The document to compare from.
            target_docs: A single string or an iterable of strings; ``None``
                is treated as no targets.
            threshold: Only scores strictly greater than this are returned.

        Returns:
            A list of ``{'score': ..., 'doc': ...}`` dicts in the order of
            ``target_docs`` (not sorted by score).
        """
        if target_docs is None:
            target_docs = []
        elif isinstance(target_docs, str):
            target_docs = [target_docs]

        source_vec = self.vectorize(source_doc)
        results = []
        for doc in target_docs:
            sim_score = self._cosine_sim(source_vec, self.vectorize(doc))
            if sim_score > threshold:
                results.append({
                    'score' : sim_score,
                    'doc' : doc
                })

        return results

### Quick check that the model works

In [None]:
# Load the word vectors from a plain-text word2vec file (binary=False).
# NOTE(review): the file name matches the one written by the commented-out
# np.savetxt conversion cell above, so that cell presumably has to be run once
# before this one -- confirm.
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('embedded_dic_english.txt', binary=False)

In [None]:
ds = DocSimV1(word_vectors)

In [None]:
# Compare "passport" with a few candidate phrases. The call returns one
# {'score', 'doc'} dict per phrase whose score is above the default threshold 0.
ds.calculate_similarity("passport", ["travel airport documents","travel","sugar", "Identity card", "documents"])

## Tagging webpages to policies

In [None]:
def _split_tags(raw_tags):
    """Return the stripped, non-empty tags of a comma-separated cell ([] if not a string)."""
    if not isinstance(raw_tags, str):
        return []
    return [tag.strip() for tag in raw_tags.split(',') if tag.strip()]


def _join_tags(items):
    """Render items in the output format: every entry followed by ', '."""
    return "".join(item + ", " for item in items)


def create_tags(org_data, policy_codes, threshold, w2v_model=None):
    """Tag each web page with the policy codes its text is most similar to.

    Every pair of consecutive words (bigram) in ``text_preprocessed`` is
    compared with the tag words of every sub policy code. Per sub code, only
    the best-scoring tag word with a positive score is considered.

    Args:
        org_data: DataFrame with a ``text_preprocessed`` column. It is not
            modified; a copy is returned.
        policy_codes: DataFrame with string columns ``Policy_code`` and
            ``relevant_tags`` (comma-separated tag words). Codes of length 1
            are main codes; codes of length 2 starting with a main code are
            its sub codes. If a sub code occurs on several rows, the first
            row's tags are used.
        threshold: Minimum similarity for a bigram/tag match to add a tag.
        w2v_model: Word-vector model handed to ``DocSimV1``. Defaults to the
            module-level ``word_vectors``.

    Returns:
        A copy of ``org_data`` with these columns added:
          * one float column per main code holding the highest similarity
            found for that code (0.0 if none was positive);
          * ``tags``, ``subtags``, ``tag_word``: the main codes, sub codes and
            tag words that reached ``threshold``, in order of first match,
            each entry followed by ``", "``.

    Raises:
        KeyError: If a required column is missing.
    """
    if w2v_model is None:
        w2v_model = word_vectors
    ds = DocSimV1(w2v_model)

    codes = policy_codes["Policy_code"]
    main_policy_codes = codes[codes.str.len() == 1].tolist()

    # Vectorise every tag word once up front; the vectors do not depend on the
    # document being tagged.
    # Layout: [(main_code, [(sub_code, [(tag_word, tag_vector), ...]), ...]), ...]
    tag_index = []
    for main_code in main_policy_codes:
        is_sub_code = codes.str.startswith(main_code, na=False) & (codes.str.len() == 2)
        sub_entries = []
        for sub_code in codes[is_sub_code]:
            raw_tags = policy_codes.loc[codes == sub_code, "relevant_tags"].iloc[0]
            tagged_vectors = [(tag, ds.vectorize(tag)) for tag in _split_tags(raw_tags)]
            sub_entries.append((sub_code, tagged_vectors))
        tag_index.append((main_code, sub_entries))

    temp = org_data.copy()

    best_by_code = {code: [] for code in main_policy_codes}
    tags_column, subtags_column, tag_words_column = [], [], []

    for text in temp["text_preprocessed"]:  # document by document
        words = text.split() if isinstance(text, str) else []
        # All two-word windows of the document. Repeats are dropped (keeping
        # first-occurrence order) because they cannot change the result.
        bigrams = dict.fromkeys(" ".join(pair) for pair in zip(words, words[1:]))

        doc_best = {code: 0.0 for code in main_policy_codes}
        # Lists with exact-match membership, so "card" is not mistaken for
        # already present just because "identity card" is.
        doc_tags, doc_subtags, doc_tag_words = [], [], []

        for bigram in bigrams:
            source_vec = ds.vectorize(bigram)
            for main_code, sub_entries in tag_index:
                for sub_code, tagged_vectors in sub_entries:
                    # Best tag word for this sub code; only positive scores
                    # count, and the first tag wins on ties.
                    best_score, best_tag = 0.0, None
                    for tag, tag_vec in tagged_vectors:
                        score = ds._cosine_sim(source_vec, tag_vec)
                        if score > best_score:
                            best_score, best_tag = float(score), tag

                    if best_tag is None:
                        continue

                    if best_score > doc_best[main_code]:
                        doc_best[main_code] = best_score

                    if best_score >= threshold:
                        if main_code not in doc_tags:
                            doc_tags.append(main_code)
                        if sub_code not in doc_subtags:
                            doc_subtags.append(sub_code)
                        if best_tag not in doc_tag_words:
                            doc_tag_words.append(best_tag)

        for code in main_policy_codes:
            best_by_code[code].append(doc_best[code])
        tags_column.append(_join_tags(doc_tags))
        subtags_column.append(_join_tags(doc_subtags))
        tag_words_column.append(_join_tags(doc_tag_words))

    # Columns are assigned by name, so the position of 'text_preprocessed'
    # inside org_data does not matter.
    for code in main_policy_codes:
        temp[code] = pd.Series(best_by_code[code], index=temp.index, dtype=float)
    temp['tags'] = pd.Series(tags_column, index=temp.index, dtype=object)
    temp['subtags'] = pd.Series(subtags_column, index=temp.index, dtype=object)
    temp['tag_word'] = pd.Series(tag_words_column, index=temp.index, dtype=object)

    return temp

In [None]:
# Tag only the rows selected by the positional slice 222:223 (at most one
# page), using 0.95 as the similarity threshold.
output = create_tags(df_webpages[222:223], df_policy_codes, 0.95)

In [None]:
# Let pandas display up to 41 columns before it truncates the frame.
pd.set_option('display.max_columns', 41)

In [None]:
output

In [None]:
# Write the tagged pages to disk; index=False leaves the row index out.
output.to_csv('webpages_policies_tagged.csv', index= False)
