# Assignment 3, Indexing

In this notebook you will index DBpedia (see the sub-collections listed under `https://github.com/uis-dat640-fall2019/admin/tree/master/assignments/assignment-3#data`). 

Make sure you specify the index settings, analyzer, and fields appropriately to support the models to be implemented in subsequent notebooks.

Note: you'll need to build a positional index. Use a single shard to make sure you're getting the right term statistics.

Be sure to use both markdown cells with section headings and explanations, as well as writing readable code, to make it clear what your intention is each step of the way through the code. 

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers

import re
import pandas as pd
from pprint import pprint

# Elasticsearch client shared by every cell below. No connection arguments are
# passed, so the client library's default connection settings are used.
es = Elasticsearch()
# es.info()

# NLTK resources for the pure-Python fallback analysis (see `preprocess`):
# an English stopword list and a Porter stemmer.
# NOTE(review): stopwords.words('english') presumably needs the NLTK
# "stopwords" corpus to be downloaded beforehand - confirm on a fresh machine.
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_list = stopwords.words('english')
porter = PorterStemmer()


# Notebook display helpers; clear_output is used by later cells to overwrite
# their progress output.
from IPython.display import clear_output, display, HTML
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Audible "cell finished" signal used after the long-running cells below.
# `winsound` ships only with Python on Windows; on every other platform the
# import fails, so fall back to a stand-in whose Beep() does nothing. That
# keeps all later `winsound.Beep(freq, duration)` calls working unchanged.
try:
    import winsound
except ImportError:
    class _SilentWinsound:
        """Minimal replacement for `winsound` where the module is unavailable."""

        @staticmethod
        def Beep(frequency, duration):
            """Accept the same arguments as winsound.Beep and do nothing."""
            return None

    winsound = _SilentWinsound()

duration = 500  # milliseconds
freq = 1000  # Hz

In [None]:
# Names of the two indices this notebook builds: one holding the term-based
# documents, one holding the entity-based documents.
index_name_term, index_name_entity = 'test_1', 'test_1_entities'

In [None]:
# Index definition shared by the term index and the entity index.
INDEX_SETTINGS = {
    "settings": {
        # One shard, as the assignment text asks, so term statistics are not
        # split across shards.
        "index": {"number_of_shards": 1, "number_of_replicas": 1},
        "analysis": {
            "analyzer": {
                # standard tokenizer -> lowercase -> English stopwords -> stemmer
                "my_english_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    # NOTE(review): stopword removal is already done by the
                    # "english_stop" filter below; this key looks redundant for
                    # a custom analyzer - confirm against the Elasticsearch docs.
                    "stopwords": "_english_",
                    "filter": ["lowercase", "english_stop", "filter_english_minimal"],
                },
            },
            "filter": {
                # Despite its name this filter is configured as the "porter2" stemmer.
                "filter_english_minimal": {"type": "stemmer", "name": "porter2"},
                "english_stop": {"type": "stop", "stopwords": "_english_"},
            },
        },
    },
    "mappings": {
        # Both text fields keep term vectors with positions and use the custom
        # analyzer.
        # NOTE(review): the documents indexed later in this notebook carry a
        # 'label' key rather than 'title' - confirm which field name is intended.
        "properties": {
            field: {
                "type": "text",
                "term_vector": "with_positions",
                "analyzer": "my_english_analyzer",
            }
            for field in ("title", "content")
        },
    },
}

In [None]:
# Rebuild the term index from scratch: drop an existing index of that name,
# then create it with the shared settings and mappings.
term_index_present = es.indices.exists(index=index_name_term)
if term_index_present:
    es.indices.delete(index=index_name_term)

es.indices.create(body=INDEX_SETTINGS, index=index_name_term)

In [None]:
# Rebuild the entity index from scratch: drop an existing index of that name,
# then create it with the shared settings and mappings.
entity_index_present = es.indices.exists(index=index_name_entity)
if entity_index_present:
    es.indices.delete(index=index_name_entity)

es.indices.create(body=INDEX_SETTINGS, index=index_name_entity)

In [None]:
# Accumulator for everything that will be indexed; the cells below fill it with
# one entry per entity link.
all_data = dict()

In [None]:
# Read the label triples as a space-separated table, skipping the first line
# and silently dropping rows that do not parse.
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in later
# pandas releases in favour of on_bad_lines - confirm the pandas version in use.
chunk = pd.read_csv(
    "data/labels_en.ttl",
    sep=" ",
    header=None,
    skiprows=1,
    error_bad_lines=False,
    warn_bad_lines=False,
)
count = 0

def process_label(label):
    """Turn a raw label value into readable text.

    The "@en" language tag is removed and the text is split in front of every
    capital letter, so "AccessibleComputing" becomes "Accessible Computing".
    When every resulting word is a single character (an acronym such as "USA")
    the characters are glued back together.

    Differences from the previous version:
    * nothing is printed and the cell output is not cleared for every label;
      doing that once per row made the `apply` over the label file very slow;
    * text in front of the first capital letter (digits, lowercase) is kept
      instead of being dropped, so "1984" no longer becomes "";
    * runs of whitespace are collapsed to one space;
    * non-string input (e.g. NaN from a malformed row) yields "" instead of
      raising AttributeError.

    Args:
        label: raw label string, typically ending in "@en".

    Returns:
        The cleaned label text.
    """
    if not isinstance(label, str):
        return ""

    cleaned = label.replace("@en", '')

    # First alternative: a capital letter and everything up to the next one.
    # Second alternative: a leading run without capitals (only possible at the
    # start of the string, because the first alternative consumes all other
    # non-capital characters).
    pieces = re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', cleaned)
    label_words = " ".join(pieces).split()

    if all(len(word) == 1 for word in label_words):
        # Acronym (or empty label): join the single letters back together.
        return "".join(label_words)

    return " ".join(label_words)

# Keep only subject (column 0) and object (column 2) of each triple and give
# them readable names.
chunk = chunk.rename(columns={0: 'link', 2: 'label'}).loc[:, ['link', 'label']]
print(chunk.shape)

# Clean every label in place.
chunk['label'] = chunk['label'].apply(process_label)
print(chunk.shape)
count += len(chunk)

# Register one {'label': ...} entry per link in the global accumulator.
for link, fields in chunk.set_index('link').T.to_dict().items():
    all_data[link] = fields

clear_output()
print(chunk.shape, count, len(all_data))

In [None]:
# Build `categories`: link -> {'categories': [category name, ...]}.
count = 0
categories = {}

chunk = pd.read_csv(
    "data/article_categories_en.ttl",
    sep=" ",
    header=None,
    skiprows=1,
    error_bad_lines=False,
    warn_bad_lines=False,
)
chunk = chunk.rename(columns={0: 'link', 2: 'categories'}).loc[:, ['link', 'categories']]

# Reduce each category URI to its bare name: take the part after the last "/"
# and the last ":", drop the closing ">" and turn underscores into spaces.
chunk['categories'] = chunk['categories'].apply(
    lambda uri: uri.rsplit("/", 1)[-1].rsplit(":", 1)[-1].replace(">", '').replace("_", " ")
)
print(chunk.shape)

# One row per link, holding the list of all its categories.
chunk = chunk.groupby('link')['categories'].apply(list).reset_index()
print(chunk.shape)

categories.update(chunk.set_index('link').T.to_dict())
len(categories)

In [None]:
# Build `disambiguations`: link -> {'disambiguations': [target name, ...]}.
disambiguations = {}

chunk = pd.read_csv(
    "data/disambiguations_en.ttl",
    sep=" ",
    header=None,
    skiprows=1,
    error_bad_lines=False,
    warn_bad_lines=False,
)
chunk = chunk.rename(columns={0: 'link', 2: 'disambiguations'}).loc[:, ['link', 'disambiguations']]

# Key the entries by the link with its "_(disambiguation)" marker removed.
chunk['link'] = chunk['link'].apply(lambda uri: uri.replace("_(disambiguation)", ''))

# Keep only the readable target name: last path segment, without ">" and with
# spaces instead of underscores ("<", ">" and "_" are re-added for the entity index).
chunk['disambiguations'] = chunk['disambiguations'].apply(
    lambda uri: uri.rsplit("/", 1)[-1].replace(">", '').replace("_", ' ')
)
print(chunk.shape)

# One row per link, holding the list of all its disambiguation targets.
chunk = chunk.groupby('link')['disambiguations'].apply(list).reset_index()
print(chunk.shape)
disambiguations.update(chunk.set_index('link').T.to_dict())

print(len(disambiguations))
winsound.Beep(freq, duration)

In [None]:
# Build `person_data`: link -> {row index -> {'key': property, 'value': value}}.
person_data = {}

chunk = pd.read_csv(
    "data/persondata_en.ttl",
    sep=" ",
    header=None,
    skiprows=1,
    error_bad_lines=False,
    warn_bad_lines=False,
)
chunk = chunk.rename(columns={0: 'link', 1: 'key', 2: 'value'}).loc[:, ['link', 'key', 'value']]

# Property name = last path segment of the predicate URI, without ">", and
# only the part after a "#" when there is one.
chunk['key'] = chunk['key'].apply(
    lambda predicate: predicate.rsplit("/", 1)[-1].replace('>', '').rsplit("#", 1)[-1]
)
chunk = chunk.dropna()

# Value = object without the "@en" tag and without a "^^datatype" suffix,
# reduced to its last path segment and stripped of ">".
chunk['value'] = chunk['value'].apply(
    lambda obj: obj.replace('@en', '').split("^^")[0].rsplit("/", 1)[-1].replace('>', '')
)
chunk = chunk.groupby('link')

# Store every person's key/value rows under that person's link.
for link, facts in chunk:
    person_data[link] = facts[["key", "value"]].T.to_dict()

winsound.Beep(freq, duration)

In [None]:
def preprocess(text):
    """Fallback analysis in pure Python: stopword removal plus Porter stemming.

    The text is split on whitespace, English stopwords are dropped and the
    remaining terms are stemmed with the module-level `porter` stemmer.

    The stopword test is done on the lowercased term. `stop_list` holds
    lowercase words, so the previous case-sensitive comparison let capitalised
    stopwords such as "The" through.

    Args:
        text: the string to analyse.

    Returns:
        The stemmed, stopword-free terms joined by single spaces.
    """
    return " ".join(
        porter.stem(term)
        for term in text.split()
        if term.lower() not in stop_list
    )

def modify_text(text):
    """Analyse `text` with the index's custom analyzer and return the tokens as one string.

    The "@en" language tag is removed, then the text is sent to the
    `my_english_analyzer` analyzer of the term index through the analyze API.
    If that call fails for any reason, the local `preprocess` fallback is used.

    Differences from the previous version:
    * `except Exception` replaces the bare `except`, so KeyboardInterrupt and
      SystemExit are no longer swallowed while a long run is being interrupted;
    * the fallback receives the text with "@en" already removed, matching what
      the analyzer is given;
    * non-string input (e.g. NaN from a malformed row) yields "" instead of
      raising inside the fallback.

    Args:
        text: raw abstract text, typically ending in "@en".

    Returns:
        The analysed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.replace("@en", '')

    try:
        tokens = es.indices.analyze(index = index_name_term, body = {
            'analyzer': 'my_english_analyzer',
            'text': cleaned
        })['tokens']

        return " ".join([token['token'] for token in tokens])

    except Exception:
        # Best effort: the analyzer is unreachable or rejected the request.
        return preprocess(cleaned)

# winsound.Beep(freq, duration)

In [None]:
# Build `long_abstracts`: link -> {'abstract': analysed abstract text}.
# The file is streamed in chunks of `chunk_size` rows.
chunk_size = 1000
long_abstracts = {}
chunks = pd.read_csv(
    "data/long_abstracts_en.ttl",
    sep=" ",
    header=None,
    skiprows=1,
    chunksize=chunk_size,
    error_bad_lines=False,
    warn_bad_lines=False,
)

count = 0
i = 1
for chunk in chunks:
    # Replace the previous progress message with the current one.
    clear_output()
    print(f"Done with {(i - 1) * chunk_size}")
    print(f"Processing chunk {i}")

    chunk = chunk.rename(columns={0: 'link', 2: 'abstract'}).loc[:, ['link', 'abstract']]
    # Run every abstract through the index analyzer (or the local fallback).
    chunk['abstract'] = chunk['abstract'].apply(modify_text)

    count += len(chunk)
    chunk = chunk.set_index('link').T.to_dict()
    long_abstracts.update(chunk)

    i += 1

winsound.Beep(freq, duration)

In [None]:
# Stream the page-link triples in chunks of `chunk_size` rows; `count` is
# incremented by add_pagelinks for every row whose link is known.
chunk_size = 10000
count = 0
chunks = pd.read_csv(
    "data/page_links_en.ttl",
    sep=" ",
    header=None,
    skiprows=1,
    chunksize=chunk_size,
    error_bad_lines=False,
    warn_bad_lines=False,
)

def add_pagelinks(row):
    """Merge one grouped page-links row into `all_data`.

    Rows whose 'link' is not a key of `all_data` are ignored. For a known link
    the function:

    1. walks the row's related links and, for the first one that has an entry
       in `long_abstracts`, `person_data`, `categories` or `disambiguations`,
       copies whatever those dicts hold for it onto the entity, then stops;
    2. extends the entity's 'related_links' with the links it does not have
       yet, or initialises it to the entity's own link followed by the row's links.

    Differences from the previous version:
    * progress is reported every 10,000 matched rows as count and link only;
      clearing the output and printing the whole entity for every row
      dominated the run time;
    * already-known links are tested against a set instead of a list, so
      merging is linear instead of quadratic in the number of links;
    * membership tests use `in dict` instead of `in dict.keys()`.

    Args:
        row: a row with 'link' (str) and 'related_links' (list of links).

    Returns:
        None. `all_data` and the global `count` are updated in place.
    """
    global count

    entity_link = row['link']
    if entity_link not in all_data:
        return

    count += 1
    entity = all_data[entity_link]
    new_links = row['related_links']

    # NOTE(review): the lookups below are keyed by the *linked* page, not by
    # the entity itself, so an entity receives the abstract/person
    # data/categories of the first page it links to that has any. Confirm this
    # is intended rather than a lookup by `entity_link`.
    for link in new_links:
        matched = False

        if link in long_abstracts:
            matched = True
            entity['long_abstract'] = long_abstracts[link]['abstract']

        if link in person_data:
            matched = True
            for fact in person_data[link].values():
                entity[fact['key']] = fact['value']

        if link in categories:
            matched = True
            entity['categories'] = categories[link]['categories']

        if link in disambiguations:
            matched = True
            entity['disambiguations'] = disambiguations[link]['disambiguations']

        if matched:
            break

    known_links = entity.get('related_links')
    if known_links is not None:
        # The same link can show up in several chunks: append only new ones.
        seen = set(known_links)
        entity['related_links'] = known_links + [link for link in new_links if link not in seen]
    else:
        entity['related_links'] = [entity_link] + new_links

    if count % 10000 == 0:
        clear_output()
        print(count, entity_link)
        
            
# For every chunk: collect all related links per link, then merge each grouped
# row into `all_data`.
for chunk in chunks:
    chunk = chunk.rename(columns={0: 'link', 2: 'related_links'}).loc[:, ['link', 'related_links']]
    chunk = chunk.groupby('link')['related_links'].apply(list).reset_index()
    print(chunk.shape, count)
    chunk.apply(add_pagelinks, axis=1)

# winsound.Beep(freq, duration)

In [None]:
# Play a beep of `freq` Hz for `duration` ms; presumably an audible "done"
# signal for the long-running cell above.
winsound.Beep(freq, duration)

In [None]:
import pickle

# Snapshot the merged data so the expensive cells above need not be re-run.
# The file is opened in a `with` block so it is flushed and closed
# deterministically; the previous bare open() left the handle dangling.
with open("data/index_data.p", "wb") as snapshot_file:
    pickle.dump(all_data, snapshot_file)

In [None]:
# The lookup tables have been merged into `all_data`; drop them to free memory.
del long_abstracts, person_data, categories, disambiguations

In [None]:
# Re-key `all_data` by a running integer id (1..n, in the existing order) and
# drop entries that have no 'label'.
#
# The new mapping is built separately and swapped in at the end. The previous
# version rewrote the dict while walking it, which works only on the first run:
# once the keys are integers, `all_data[i] = data` followed by
# `del all_data[key]` with key == i deletes the entry just written, so
# re-running the cell emptied `all_data`. This version is safe to re-run.
# It also no longer clears the output and prints once per entity.
renumbered = {}
count = 0  # entries dropped because they have no label
for data in all_data.values():
    if 'label' not in data:
        count += 1
        continue
    renumbered[len(renumbered) + 1] = data

i = len(renumbered)  # number of entries kept (also the highest id)

# Update the existing dict object in place so any other reference to it sees
# the re-keyed data.
all_data.clear()
all_data.update(renumbered)
del renumbered

print(i, "kept,", count, "dropped")

In [None]:
def gendata_terms(data, index):
    """Yield one bulk-index action per entity for the term-based index.

    Each document is a shallow copy of the entity plus a 'content' field that
    concatenates, in this order: related links, long abstract, categories,
    disambiguations, type, name, surname, givenName, birthPlace, deathPlace,
    birthDate and deathDate (whichever are present).

    Differences from the previous version:
    * the long abstract is appended to 'content'; it used to be assigned with
      `=`, which discarded the related links added just before;
    * the generator no longer clears the output and prints every document,
      which slowed bulk indexing and flooded the notebook.

    Args:
        data: mapping of document id -> entity dict.
        index: name of the target index.

    Yields:
        dicts with '_index', '_id' and '_source' keys, as consumed by
        `helpers.bulk`.
    """
    # Single-valued string fields appended after the list-valued ones.
    scalar_keys = (
        'type',
        'name', 'surname', 'givenName',
        'birthPlace', 'deathPlace',
        'birthDate', 'deathDate',
    )

    for _id, doc_ in data.items():
        # Shallow copy: the caller's entity must not gain a 'content' key.
        doc = doc_.copy()

        content = ""

        if 'related_links' in doc:
            content += " ".join(doc['related_links'])

        if 'long_abstract' in doc:
            content += " " + doc['long_abstract']

        for key in ('categories', 'disambiguations'):
            if key in doc:
                content += " " + " ".join(doc[key])

        for key in scalar_keys:
            if key in doc:
                content += " " + doc[key]

        doc['content'] = content

        yield {
            "_index": index,
            "_id": _id,
            "_source": doc,
        }
    
# Stream every term document into the term index with one bulk call.
helpers.bulk(
    client=es,
    actions=gendata_terms(data=all_data, index=index_name_term),
)

In [None]:
def gendata_entities(data, index):
    """Yield one bulk-index action per entity for the entity-based index.

    Each document is a shallow copy of the entity without its long abstract.
    The label, categories, disambiguations, type, name fields, places and dates
    are rewritten as "<...>" identifiers (spaces become underscores, except in
    type and the dates, which are only wrapped). 'content' concatenates the
    related links followed by those identifiers, in the order: categories,
    disambiguations, type, name, surname, givenName, birthPlace, deathPlace,
    birthDate, deathDate.

    Differences from the previous version:
    * label words are separated by exactly one underscore; re-splitting a label
      that already contained spaces used to give "<Accessible__Computing>";
    * label text before the first capital letter is kept instead of dropped;
    * the generator no longer clears the output and prints every document.

    Args:
        data: mapping of document id -> entity dict; every entity must have a
            string 'label'.
        index: name of the target index.

    Yields:
        dicts with '_index', '_id' and '_source' keys, as consumed by
        `helpers.bulk`.
    """
    for _id, doc_ in data.items():
        # Shallow copy: rewritten fields are re-assigned, never mutated in
        # place, so the caller's entity stays untouched.
        doc = doc_.copy()
        doc.pop('long_abstract', None)

        # Split the label in front of capital letters (keeping a leading run
        # without capitals), then normalise whitespace.
        label_words = " ".join(re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', doc['label'])).split()
        if not all(len(word) == 1 for word in label_words):
            doc['label'] = " ".join(label_words)
        # Acronyms (all single-letter words) keep their original spelling.
        doc['label'] = "<" + "_".join(doc['label'].split()) + ">"

        content = ""
        if 'related_links' in doc:
            content += " ".join(doc['related_links'])

        for key in ('categories', 'disambiguations'):
            if key in doc:
                doc[key] = ["<" + value.replace(" ", "_") + ">" for value in doc[key]]
                content += " " + " ".join(doc[key])

        if 'type' in doc:
            doc['type'] = "<" + doc['type'] + ">"
            content += " " + doc['type']

        for key in ('name', 'surname', 'givenName', 'birthPlace', 'deathPlace'):
            if key in doc:
                doc[key] = "<" + doc[key].replace(' ', '_') + ">"
                content += " " + doc[key]

        for key in ('birthDate', 'deathDate'):
            if key in doc:
                doc[key] = "<" + doc[key] + ">"
                content += " " + doc[key]

        doc['content'] = content

        yield {
            "_index": index,
            "_id": _id,
            "_source": doc,
        }

# Stream every entity document into the entity index with one bulk call.
helpers.bulk(
    client=es,
    actions=gendata_entities(data=all_data, index=index_name_entity),
)