In [19]:
import pandas as pd
import glob
import re
import collections
import numpy as np
import matplotlib.pyplot as plt
import gzip
import datetime
import pickle
import itertools

from tqdm.notebook import tqdm
from nltk.util import ngrams
from wordcloud import WordCloud
from IPython.display import display, HTML

import spacy
from spacy import displacy
from collections import Counter
from bisect import bisect_left

# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced in any cell visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Debug: print the current wall-clock time (time of day only, no date)
print(datetime.datetime.now().time())

18:31:12.898997


# Create dataset

In [3]:
# Load every tab-separated query-log file matching the pattern and stack them into one DataFrame.
# The paths are sorted because glob.glob returns matches in arbitrary order: the seeded
# df.sample(..., random_state=...) calls further down pick rows by position, so they are only
# reproducible when the files are always concatenated in the same order.
df = pd.concat([pd.read_csv(f, delimiter='\t') for f in sorted(glob.glob('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/data/user-ct-test-collection-*.txt'))])

  if (await self.run_code(code, result,  async_=asy)):


## Sampled history dataset (for suffixes)
This sample contains 1,000,000 randomly selected rows.

In [4]:
#samples_hist = df.sample(1000000, random_state=23)

#### Save samples in pickle file

In [5]:
#samples_hist.to_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_hist_1m.pickle')

In [6]:
# Load the history sample from its pickle file (the sampling and saving cells above are commented out).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
samples_hist = pd.read_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_hist_1m.pickle')

### Create suffixes
Create all possible suffixes, splitting per word iteratively, removing leading words.

Query "How to cook CHICKEN" becomes:
- how to cook chicken
- to cook chicken
- cook chicken
- chicken

Creating 4 suffixes.

All non-alphanumeric symbols are replaced by spaces and the text is converted to lowercase.

In [7]:
# Collect every word-level suffix of every cleaned history query
suffixes = []

for record in samples_hist.itertuples():
    # Runs of non-alphanumeric characters become one space; lowercase, then tokenise on whitespace
    tokens = re.sub(r"[^A-Za-z0-9]+", " ", str(record.Query)).lower().split()
    # Dropping 0, 1, 2, ... leading words yields one suffix per word in the query
    suffixes.extend(" ".join(tokens[start:]) for start in range(len(tokens)))

#### Save samples in pickle file

In [8]:
# Write the suffix list to disk as a pickle file
with open('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/hist_suffixes.pickle_1m', 'wb') as out_file:
    pickle.dump(suffixes, out_file)

In [9]:
# Read the suffix list back from its pickle file
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
with open('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/hist_suffixes.pickle_1m', 'rb') as in_file:
    suffixes = pickle.load(in_file)

## Sampled dataset (for prefixes)
This sample contains 10,000 randomly selected rows.

In [10]:
# Draw a fixed-seed random sample of 10,000 rows from the full query log
samples_data = df.sample(n=10000, random_state=14)

### Create prefixes
Create all possible prefixes of each multi-word query, growing one character at a time and starting at the first letter of the second word. Single-word queries are skipped.

Query "How to cook CHICKEN" becomes (_ = space):
- how_t
- how_to
- how_to_
- ...
- how_to_cook_chicken

Creating 15 prefixes.

All non-alphanumeric symbols are replaced by spaces and the text is converted to lowercase.

In [11]:
# Each record is [clean_query, qid, prefix]; one record (and one qid) per generated prefix
prefix_list = []

qid = 1

for row in samples_data.itertuples():
    # Replace symbol runs with a space, lowercase and tokenise. Re-joining the tokens below
    # removes the stray leading/trailing space left behind when a query starts or ends with a
    # symbol; such a space would shift the first-word offset and make the clean query
    # impossible to match against a synthetic query later on.
    words = re.sub(r"[^A-Za-z0-9]+", " ", str(row.Query)).lower().split()

    # Only queries with at least two words produce prefixes
    if len(words) < 2:
        continue

    line = " ".join(words)

    # The shortest prefix is the first word, a space and the first letter of the second word
    shortest = len(words[0]) + 2

    for j in range(shortest, len(line) + 1):
        prefix_list.append([line, "qid:" + str(qid), line[:j]])
        qid += 1

# Passing the column names here (instead of assigning them afterwards) also works when no
# prefixes were generated at all
prefix_queries = pd.DataFrame.from_records(prefix_list, columns=['Query_clean', 'Qid', 'Prefix'])

#### Save samples in pickle file

In [12]:
# Write the prefix table to disk as a pickle file.
# NOTE(review): the file name says '1m' although the prefixes are built from the 10,000-row sample — confirm the naming.
prefix_queries.to_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/prefixes_1m.pickle')

In [13]:
# Read the prefix table back from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
prefix_queries = pd.read_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/prefixes_1m.pickle')

## Create synthetic queries
For each prefix, complete its last term with the 10 most frequent historical suffixes that start with that term.

In [14]:
# Frequency of every distinct suffix in the history sample
suffix_dict = collections.Counter(suffixes)
# (suffix, count) pairs ordered from most to least frequent
suffix_list = suffix_dict.most_common()
# Distinct suffixes in lexicographic order, ready for binary search
suff_set_sorted = sorted(text for text, _count in suffix_list)

In [20]:
def bisect_contains_check(suffix_list, prefix):
    """Report whether any entry of a sorted list starts with ``prefix``.

    Args:
        suffix_list: Strings in ascending sorted order.
        prefix: The string the entry must start with.

    Returns:
        True if the entry at the binary-search insertion point of ``prefix``
        starts with ``prefix``; False otherwise, including when the insertion
        point lies past the end of the list.
    """
    position = bisect_left(suffix_list, prefix)
    if position >= len(suffix_list):
        return False
    return suffix_list[position].startswith(prefix)

def bisect_list_slice(suffix_list, prefix):
    """Return every entry of a sorted list that starts with ``prefix``.

    Args:
        suffix_list: Strings in ascending sorted order.
        prefix: The string the returned entries must start with. An empty
            prefix matches every entry.

    Returns:
        A new list holding the contiguous run of entries that start with
        ``prefix`` (empty when there is none).
    """
    # Every string starts with "", and there is no last character to increment below
    if not prefix:
        return suffix_list[:]

    start = bisect_left(suffix_list, prefix)
    # Smallest string that sorts after everything beginning with `prefix`:
    # the prefix with its final character bumped to the next code point
    upper_bound = prefix[:-1] + chr(ord(prefix[-1]) + 1)
    # The upper bound cannot sort before `start`, so the second search begins there
    stop = bisect_left(suffix_list, upper_bound, start)
    return suffix_list[start:stop]

# Number of most frequent matching suffixes kept per prefix
top_k = 10

# Each record is [Query_clean, Qid, Prefix, Suffix, Hist_Suffix_freq, Synthetic_query, Synthetic_match]
candidate_list = []

for row in tqdm(prefix_queries.itertuples(), total=prefix_queries.shape[0]):
    # NOTE(review): split() discards trailing whitespace, so a prefix ending in a space
    # (e.g. "how to ") gets the same end term as "how to" — confirm this is intended.
    words = row.Prefix.split()
    endterm = words[-1]
    no_endterm = " ".join(words[:-1])

    # Skip prefixes whose end term does not start any historical suffix
    if not bisect_contains_check(suff_set_sorted, endterm):
        continue

    # Frequencies of all historical suffixes that start with the end term
    matching_counts = Counter({
        key: suffix_dict[key]
        for key in bisect_list_slice(suff_set_sorted, endterm)
    })

    # most_common(top_k) returns the same rows as most_common()[:top_k] without sorting everything
    for suffix, freq in matching_counts.most_common(top_k):
        synthetic_query = no_endterm + " " + suffix
        # Synthetic_match is 1 when the candidate reproduces the full cleaned query, else 0
        is_match = int(str(row.Query_clean) == synthetic_query)
        candidate_list.append([
            row.Query_clean, row.Qid, row.Prefix,
            suffix, freq, synthetic_query, is_match,
        ])

# Passing the column names here (instead of assigning them afterwards) also works when no
# candidates were generated at all
syn_candidate_queries = pd.DataFrame.from_records(
    candidate_list,
    columns=['Query_clean', 'Qid', 'Prefix', 'Suffix', 'Hist_Suffix_freq', 'Synthetic_query', 'Synthetic_match'],
)

# head() must be called; the bare attribute only displays the bound method's repr
syn_candidate_queries.head()

HBox(children=(FloatProgress(value=0.0, max=107409.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Show the synthetic candidate table as this cell's output
syn_candidate_queries

In [None]:
# Keep only the candidates whose synthetic query equals the full cleaned query
temp_df = syn_candidate_queries.loc[syn_candidate_queries['Synthetic_match'] == 1]
# head() must be called; the bare attribute only displays the bound method's repr
temp_df.head()

#### Save samples in pickle file

In [None]:
prefix_queries.to_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/syn_candidate_queries_5m.pickle')

In [None]:
prefix_queries = pd.read_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/syn_candidate_queries_5m.pickle')

In [None]:
#suffix_dict.get('0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')
# Print, one per line: number of distinct suffixes, total number of suffixes, first ten sorted suffixes
print(len(suff_set_sorted), len(suffixes), suff_set_sorted[:10], sep="\n")

In [None]:
# Debug: print the current wall-clock time (time of day only, no date)
print(datetime.datetime.now().time())