In [1]:
import pandas as pd
import glob
import re
import collections
import numpy as np
import matplotlib.pyplot as plt
import gzip
import datetime

from tqdm import tqdm_notebook
from nltk.util import ngrams
from wordcloud import WordCloud
from IPython.display import display, HTML

import spacy
from spacy import displacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')

In [2]:
# Debug
print(datetime.datetime.now().time())

22:29:13.013116


# Create dataset

In [3]:
# Generate features for dataset
#df = pd.concat([pd.read_csv(f, delimiter='\t') for f in glob.glob('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/data/user-ct-test-collection-*.txt')])

## Sampled dataset
This sample dataset is put on 1.000.000.

In [4]:
#samples = df.sample(1000000, random_state=23)

#### Save samples in pickle file & CSV

In [5]:
#samples.to_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_million.pickle')

In [6]:
samples = pd.read_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_million.pickle')

# Create N-gram & NER features

In [7]:
def clean_candidate(candidate):
    line = re.sub(r"[,.;@#?!&$]+\ *", " ", str(candidate))
    return line

samples['Query_clean'] = samples[samples.notnull()]['Query'].map(lambda query: clean_candidate(query))

In [8]:
suffixes = []

for row in samples.itertuples():
    words = str(row.Query_clean).split()
    for j in range(0, len(words)):
        suffix = " ".join(words[j:])
        suffixes.append(suffix)

#### Save it

In [9]:
#f = gzip.GzipFile("/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/suffixes.npy.gz", "w")
#np.save(file=f, arr=suffixes)
#f.close()

#### Load it

In [10]:
# f = gzip.GzipFile('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/pickles/suffixes.npy.gz', "r")
# suffixes = np.load(f)

#### Create N-gram features

In [11]:
historical_dict = collections.Counter(suffixes)

def ngram_freq_per_n(candidate, historical_dict, n):
    words = candidate.split()
    ngram_n = 0
    ngrams_i = ngrams(words, n)
    
    for word in ngrams_i:
        freq_g = historical_dict[" ".join(word)]
        ngram_n += freq_g
        
    return ngram_n

#### Save it

In [12]:
#f = gzip.GzipFile("/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/historical_dict.npy.gz", "w")
#np.save(file=f, arr=historical_dict)
#f.close()

#### Load it

In [13]:
# f = gzip.GzipFile('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/pickles/historical_dict.npy.gz', "r")
# historical_dict = np.load(f)

In [14]:
ngram_max = 6

for i in range(1, ngram_max + 1):
    ngram_name = 'ngram_' + str(i)
    samples[ngram_name] = samples[samples.notnull()]['Query_clean'].map(lambda candidate_row: ngram_freq_per_n(candidate_row, historical_dict, i))

#### Create NER Features

In [15]:
def ner_contains_and_count_norm(candidate): 
    entities = nlp(str(candidate))
    contains = 0
    ner_norm = 0
    
    if len(entities) > 0:
        contains = 1
        ner_norm = len(entities) / len(candidate)
    
    return [contains, ner_norm]

samples['has_ne'], samples['ne_norm'] = zip(*samples[samples.notnull()]['Query_clean'].map(lambda candidate_row: ner_contains_and_count_norm(candidate_row)))

# Create other features

In [16]:
def create_artificial_queries(samples): 
    artif_list = []

    for row in samples.itertuples():
        words = str(row.Query_clean).split()
        # iterate #-filtered times
        for j in range(1, len(words)):
            # Last two will be filled as [6]: 'prefix', [7]'suffix'
            temp_list = [row.Index, row.AnonID, row.QueryTime, row.ItemRank, row.ClickURL, row.Query, row.Query_clean, '', '']
            prefix = " ".join(words[:j])
            temp_list[6] = prefix
            suffix = " ".join(words[j:])
            temp_list[7] = suffix
            # Add to artificial query list
            artif_list.append(temp_list)   
    return artif_list

artif_list = create_artificial_queries(samples)
queries = pd.DataFrame.from_records(artif_list)
queries.columns = ['Index', 'AnonID', 'QueryTime', 'ItemRank', 'ClickURL', 'Query', 'Query_clean', 'Prefix', 'Suffix']

#### Create all prefix and suffix combinations for queries

In [17]:
# filter out all queries shorter than 2
def split_words(string):
    words = str(string).split()
    return len(words)

samples['filtered'] = samples[samples.notnull()]['Query_clean'].map(lambda query: split_words(query))
samples = samples[samples.filtered > 1]

#### Create the other features

In [19]:
def get_other_features(prefix, suffix, historical_dict):     
    # The complete query
    complete = ""
#     if bool_space:
#         complete = prefix + suffix
#     else:
#         complete = prefix + " " + suffix
    
    # The frequency of the candidate in the historical logs
    frequency = historical_dict[complete]
    
    # Prefix, suffix and total length in characters
    prefixlen_char = len(prefix)
    suffixlen_char = len(suffix)
    totallen_char = len(complete)
    
    # Prefix, suffix and total length in words
    prefixlen_word = len(prefix.split())
    suffixlen_word = len(suffix.split())
    totallen_word = len(complete.split())
    
    return [frequency, 
            prefixlen_char, suffixlen_char, totallen_char,
            prefixlen_word, suffixlen_word, totallen_word]

queries['candid_freq'], queries['prefixlen_char'], queries['suffixlen_char'], queries['totallen_char'], queries['prefixlen_word'], queries['suffixlen_word'], queries['totallen_word'] = zip(*queries.apply(lambda query_row: get_other_features(query_row.Prefix, query_row.Suffix, historical_dict), axis=1))

In [20]:
queries.head

<bound method NDFrame.head of            Index    AnonID            QueryTime ItemRank  \
0        2291693   7450444  2006-03-26 19:50:45      NaN   
1         852212   2220588  2006-05-30 06:11:21      NaN   
2         852212   2220588  2006-05-30 06:11:21      NaN   
3         204378    516545  2006-05-07 09:10:15      NaN   
4         204378    516545  2006-05-07 09:10:15      NaN   
...          ...       ...                  ...      ...   
1782832  2775561  10719443  2006-05-10 23:22:52        1   
1782833  2775561  10719443  2006-05-10 23:22:52        1   
1782834    79589    196642  2006-04-04 12:09:49       11   
1782835   169411    401787  2006-05-31 11:11:36        1   
1782836   169411    401787  2006-05-31 11:11:36        1   

                       ClickURL                       Query   Query_clean  \
0                           NaN                  chat rooms          chat   
1                           NaN      aol keyword aol-dsl104           aol   
2                 

In [21]:
queries.to_pickle(r'/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/artif_queries_dataset_big.pickle')

In [22]:
# Debug
print(datetime.datetime.now().time())

00:06:50.605594
