In [1]:
import pandas as pd
import glob
import re
import collections
import numpy as np
import matplotlib.pyplot as plt
import gzip

from tqdm import tqdm_notebook
from nltk.util import ngrams
from wordcloud import WordCloud
from IPython.display import display, HTML

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

# Create dataset

In [2]:
# Load every tab-separated log file that matches the pattern into a single frame.
# glob.glob() returns paths in an arbitrary, filesystem-dependent order, so the
# paths are sorted first: this keeps the row order of `df` (and therefore any
# seeded, position-based sample drawn from it) the same between runs and machines.
log_paths = sorted(glob.glob('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/data/user-ct-test-collection-*.txt'))
df = pd.concat([pd.read_csv(log_path, delimiter='\t') for log_path in log_paths])

  if (await self.run_code(code, result,  async_=asy)):


## Sampled dataset
The sample size is set to 1,000,000 rows.

In [3]:
# Draw a fixed-size random subset of the rows; the seed makes the draw repeatable.
sample_size = 1000000
samples = df.sample(n=sample_size, random_state=23)

#### Save samples in pickle file & CSV

In [4]:
# Write the sampled rows to disk as a pickle file.
sample_pickle_path = '/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_million.pickle'
samples.to_pickle(sample_pickle_path)

In [5]:
#samples.to_csv(r'/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_dataset.csv', index = True)

In [6]:
#samples = pd.read_pickle('test.pickle')

#### Create all prefix and suffix combinations for queries

In [7]:
# Filter out all queries that contain fewer than 2 words
def split_words(string):
    """Return the number of words in ``string``.

    A dot that is directly followed by another character is turned into a
    space before the text is split on whitespace, so ``"www.google.com"``
    counts as three words. Any value that is not a ``str`` (for example a
    missing value) counts as zero words.
    """
    if not isinstance(string, str):
        return 0
    return len(re.sub(r"\.(.)", r" \1", string).split())

# Store the word count of every query in `filtered` (split_words returns 0 for
# non-string values such as NaN) and keep only the rows with at least two words.
# The column is mapped directly: masking the whole frame with samples.notnull()
# first would not change any value of `Query`, it would only cost time and memory.
samples['filtered'] = samples['Query'].map(split_words)
samples = samples[samples.filtered > 1]

In [8]:
def create_artificial_queries(samples):
    """Expand each multi-word query into every possible (prefix, suffix) split.

    The query text is tokenised by replacing each dot that is followed by
    another character with a space and then splitting on whitespace. A query
    of ``k`` words produces ``k - 1`` records, one per split point; queries
    with fewer than two words, and rows whose ``Query`` is not a string
    (e.g. NaN), produce none.

    Parameters
    ----------
    samples : pandas.DataFrame
        Must provide the columns ``AnonID``, ``QueryTime``, ``ItemRank``,
        ``ClickURL`` and ``Query``.

    Returns
    -------
    list of list
        One record per split, laid out as
        ``[Index, AnonID, QueryTime, ItemRank, ClickURL, Query, prefix, suffix]``
        where ``prefix`` and ``suffix`` are the space-joined words before and
        from the split point.
    """
    artif_list = []

    for row in samples.itertuples():
        # Test the raw value. Wrapping it in str() first makes the check always
        # true, which lets missing values (NaN floats) reach re.sub() and raise.
        if not isinstance(row.Query, str):
            continue

        words = re.sub(r"\.(.)", r" \1", row.Query).split()
        # Fields that every split of this row has in common.
        shared = [row.Index, row.AnonID, row.QueryTime, row.ItemRank, row.ClickURL, row.Query]

        # Split points 1 .. len(words) - 1 keep both prefix and suffix non-empty.
        for j in range(1, len(words)):
            prefix = " ".join(words[:j])
            suffix = " ".join(words[j:])
            artif_list.append(shared + [prefix, suffix])
    return artif_list

# Build the artificial-query frame. The column names are given at construction
# time so that an empty record list still yields a frame with the expected
# columns instead of failing with a column-count mismatch afterwards.
artif_list = create_artificial_queries(samples)
queries = pd.DataFrame.from_records(
    artif_list,
    columns=['Index', 'AnonID', 'QueryTime', 'ItemRank', 'ClickURL', 'Query', 'Prefix', 'Suffix'],
)

# Create N-gram, NER & Other features

In [9]:
# Collect every word-level suffix (including the full query itself) of each query.
# NOTE(review): `queries` holds one row per prefix/suffix split, so a query with
# k words contributes its suffixes k - 1 times here -- confirm this weighting is intended.
suffixes = []

for query in queries['Query']:
    if not isinstance(query, str):
        continue
    # Same tokenisation as for the prefix/suffix splits: dot-before-character -> space.
    tokens = re.sub(r"\.(.)", r" \1", query).split()
    suffixes.extend(" ".join(tokens[start:]) for start in range(len(tokens)))

#### Save it

In [10]:
# Write the suffix list as a gzip-compressed .npy file. The context manager closes
# the gzip stream even when np.save() raises, so no handle is leaked and no
# half-written stream is left open.
with gzip.GzipFile("/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/suffixes.npy.gz", "w") as f:
    np.save(file=f, arr=suffixes)

#### Load it

In [11]:
# f = gzip.GzipFile('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/pickles/historical_logs.npy.gz', "r")
# suffixes = np.load(f)

#### Create N-gram features

In [12]:
# Tally how many times each suffix string occurs in the collected list.
historical_dict = Counter(suffixes)

def ngram_freq_per_n(candidate, historical_dict, n):
    """Sum the historical frequencies of all word n-grams of ``candidate``.

    The candidate is tokenised the same way the suffix strings are built:
    every dot followed by another character becomes a space, then the text is
    split on whitespace. Without that step a token such as ``www.google.com``
    could never equal a key made from dot-separated words.

    Parameters
    ----------
    candidate : str
        Query text. Any non-string value (e.g. NaN) yields 0.
    historical_dict : Mapping[str, int]
        Frequency table keyed by space-joined word sequences. Missing keys
        count as 0, so both ``Counter`` and plain ``dict`` are accepted.
    n : int
        Number of words per n-gram.

    Returns
    -------
    int
        Sum of ``historical_dict`` values over all consecutive ``n``-word
        windows; 0 when the candidate has fewer than ``n`` words.
    """
    if not isinstance(candidate, str):
        return 0

    words = re.sub(r"\.(.)", r" \1", candidate).split()
    # Sliding window: zipping the word list against itself shifted by 0..n-1
    # yields every run of n consecutive words (and nothing if there are too few).
    windows = zip(*(words[offset:] for offset in range(n)))
    return sum(historical_dict.get(" ".join(window), 0) for window in windows)

In [13]:
# Write the frequency table to a gzip-compressed .npy file. It is not an array,
# so NumPy stores it as a pickled object (loading it back needs allow_pickle=True).
# The context manager closes the gzip stream even when np.save() raises.
with gzip.GzipFile("/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/historical_dict.npy.gz", "w") as f:
    np.save(file=f, arr=historical_dict)

In [14]:
ngram_max = 6

# Add one feature column per n-gram order: ngram_1 ... ngram_<ngram_max>.
# The Query column is selected once, outside the loop; masking the whole frame
# with queries.notnull() on every iteration would not change any of its values.
query_column = queries['Query']

for i in range(1, ngram_max + 1):
    ngram_name = 'ngram_' + str(i)
    # The lambda is consumed immediately by map(), so it sees the current `i`.
    queries[ngram_name] = query_column.map(
        lambda candidate_row: ngram_freq_per_n(candidate_row, historical_dict, i)
    )

#### Create NER Features

In [None]:
def ner_contains_and_count_norm(candidate):
    """Return named-entity features for one query string.

    Parameters
    ----------
    candidate : str
        Query text passed to the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    list
        ``[contains, ner_norm]`` where ``contains`` is 1 if at least one named
        entity was recognised (else 0) and ``ner_norm`` is the number of
        recognised entities divided by the length of ``candidate`` in
        characters (0 when there are none).
    """
    # nlp() returns a Doc whose len() is the number of *tokens*; the recognised
    # named entities are exposed separately through Doc.ents.
    entities = nlp(candidate).ents

    if not entities:
        return [0, 0]

    # A candidate with at least one entity is non-empty, so the division is safe.
    return [1, len(entities) / len(candidate)]

# Run the NER feature function on every query and unpack the [contains, norm]
# pairs into two columns. The Query column is mapped directly: masking the whole
# frame with queries.notnull() first would not change any of its values.
queries['has_ne'], queries['ne_norm'] = zip(*queries['Query'].map(ner_contains_and_count_norm))

#### Create "Other" features

In [None]:
def get_other_features(prefix, suffix, historical_dict, bool_space=None):
    """Compute frequency and length features for one prefix/suffix candidate.

    Parameters
    ----------
    prefix : str
        Leading part of the query.
    suffix : str
        Completion that follows the prefix.
    historical_dict : Mapping[str, int]
        Frequency table keyed by query text; missing keys count as 0.
    bool_space : bool, optional
        Whether a space already separates ``prefix`` and ``suffix``. When
        true the two are concatenated directly, otherwise a single space is
        inserted between them. Left at ``None`` (the default) it is detected
        from the strings: true if ``prefix`` ends with, or ``suffix`` starts
        with, a space.

    Returns
    -------
    list
        ``[frequency, prefixlen_char, suffixlen_char, totallen_char,
        prefixlen_word, suffixlen_word, totallen_word]``.
    """
    # `bool_space` used to be read as an undefined global, which raised a
    # NameError on every call; it is now an optional, self-detecting argument.
    if bool_space is None:
        bool_space = prefix.endswith(" ") or suffix.startswith(" ")

    # The complete query
    if bool_space:
        complete = prefix + suffix
    else:
        complete = prefix + " " + suffix

    # The frequency of the candidate in the historical logs
    frequency = historical_dict.get(complete, 0)

    # Prefix, suffix and total length in characters
    prefixlen_char = len(prefix)
    suffixlen_char = len(suffix)
    totallen_char = len(complete)

    # Prefix, suffix and total length in words
    prefixlen_word = len(prefix.split())
    suffixlen_word = len(suffix.split())
    totallen_word = len(complete.split())

    return [frequency,
            prefixlen_char, suffixlen_char, totallen_char,
            prefixlen_word, suffixlen_word, totallen_word]

# Compute the seven "other" features row by row and unpack them, in the order
# the feature function returns them, into seven columns of `queries`.
(
    queries['candid_freq'],
    queries['prefixlen_char'],
    queries['suffixlen_char'],
    queries['totallen_char'],
    queries['prefixlen_word'],
    queries['suffixlen_word'],
    queries['totallen_word'],
) = zip(*queries.apply(
    lambda query_row: get_other_features(query_row.Prefix, query_row.Suffix, historical_dict),
    axis=1,
))

In [None]:
# Call head() so the cell displays the first rows rather than the bound-method repr.
queries.head()

In [None]:
# Write the artificial-query frame, including all feature columns added so far, to a pickle file.
artif_queries_path = r'/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/artif_queries_dataset_big.pickle'
queries.to_pickle(artif_queries_path)