In [1]:
import pandas as pd
import glob
import re
import collections
import numpy as np
import matplotlib.pyplot as plt
import gzip
import datetime
import pickle
import itertools

from tqdm import tqdm_notebook
from nltk.util import ngrams
from wordcloud import WordCloud
from IPython.display import display, HTML

import spacy
from spacy import displacy
from collections import Counter
from bisect import bisect_left

nlp = spacy.load('en_core_web_sm')

In [2]:
# Debug
print(datetime.datetime.now().time())

17:22:56.694599


# Create dataset

In [5]:
# Generate features for dataset
df = pd.concat([pd.read_csv(f, delimiter='\t') for f in glob.glob('data/user-ct-test-collection-*.txt')])

  if (await self.run_code(code, result,  async_=asy)):


## Sampled history dataset (for suffixes)
This sample dataset is put on 1.000.000.

In [7]:
samples_hist = df.sample(1000000, random_state=23)

#### Save samples in pickle file

In [9]:
samples_hist.to_pickle('data/sample_hist_1m.pickle')

In [3]:
# samples_hist = pd.read_pickle('created_sample/sample_hist_1m.pickle')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/sample_hist_1m.pickle'

### Create suffixes
Create all possible suffixes, splitting per word iteratively, removing leading words.

Query "How to cook CHICKEN" becomes:
- how to cook chicken
- to cook chicken
- cook chicken
- chicken

Creating 4 suffixes.

All symbols are removed and changed to lowercase.

In [10]:
suffixes = []

for row in samples_hist.itertuples():
    line = re.sub(r"[^A-Za-z0-9]+", " ", str(row.Query)).lower()
    words = line.split()
    for j in range(0, len(words)):
        suffix = " ".join(words[j:])
        suffixes.append(suffix)

#### Save samples in pickle file

In [12]:
with open('data/hist_suffixes.pickle_1m', 'wb') as f:
    pickle.dump(suffixes, f)

In [339]:
with open('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/hist_suffixes.pickle_1m', 'rb') as f:
    suffixes = pickle.load(f)

## Sampled dataset (for prefixes)
This sample dataset is put on 10.000.

In [13]:
samples_data = df.sample(10000, random_state=14)

### Create prefixes
Create all possible prefixes, splitting per letter (from second word) iteratively, removing leading words.

Query "How to cook CHICKEN" becomes (_ = space):
- how
- how_
- how_t
- ...
- how_to_cook_chicken

Creating 17 prefixes.

All symbols are removed and changed to lowercase.

In [24]:
prefix_list = []

qid = 1

for row in samples_data.itertuples():
    line = re.sub(r"[^A-Za-z0-9]+", " ", str(row.Query)).lower()
    
    # Ignore empty strings
    if len(line.split()) > 1:   
        firstword = line.split()[0]
    
        for j in range(len(firstword) + 2 , len(line)+1):
            if (len(line[0:j].split())) <= 0:
                print(line)
                print('m' + str(line[0:j]) + 'm')
            # temp list will be filled as [0]: 'clean_query', [1]: 'qid', [2]: 'prefix'
            temp_list = ['', '', '']
            temp_list[0] = line
            temp_list[1] = "qid:" + str(qid)
            temp_list[2] = line[0:j]
            # Add to prefix query list
            prefix_list.append(temp_list)
            qid += 1
    
prefix_queries = pd.DataFrame.from_records(prefix_list)
prefix_queries.columns = ['Query_clean', 'Qid', 'Prefix']

In [25]:
prefix_queries

Unnamed: 0,Query_clean,Qid,Prefix
0,joel osteen screensaver,qid:1,joel o
1,joel osteen screensaver,qid:2,joel os
2,joel osteen screensaver,qid:3,joel ost
3,joel osteen screensaver,qid:4,joel oste
4,joel osteen screensaver,qid:5,joel ostee
...,...,...,...
106428,naked charmed ones,qid:106429,naked charmed
106429,naked charmed ones,qid:106430,naked charmed o
106430,naked charmed ones,qid:106431,naked charmed on
106431,naked charmed ones,qid:106432,naked charmed one


#### Save samples in pickle file

In [26]:
prefix_queries.to_pickle('data/prefixes_1m.pickle')

In [343]:
prefix_queries = pd.read_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/prefixes_1m.pickle')

## Create synthetic queries
Combine end term of prefix with top 10 suffixes 

In [27]:
suffix_dict = collections.Counter(suffixes)
suffix_list = suffix_dict.most_common()
suff_set_sorted = sorted([i[0] for i in suffix_list])

['0',
 '0 0',
 '0 0 0 0 0 098 9p 9898',
 '0 0 0 0 00000008521544857767 7 67676876577 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa777777777777777777777777777777777 q',
 '0 0 0 0 098 9p 9898',
 '0 0 0 00000008521544857767 7 67676876577 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa777777777777777777777777777777777 q',
 '0 0 0 098 9p 9898',
 '0 0 00000008521544857767 7 67676876577 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa777777777777777777777777777777777 q',
 '0 0 098 9p 9898',
 '0 0 1',
 '0 0 3',
 '0 0 5 m',
 '0 0 linkurl array linkblank array',
 '0 0 p',
 '0 000 00000000000000000000000000000000000000000000000000 000000000000000000000000000000',
 '0 000000000000000000000000',
 '0 00000008521544857767 7 67676876577 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa777777777777777777777777777777777 q',
 '0 0000011100000000000000000000 7 447',
 '0 01292 0 020514 t h',
 '0 02',
 '0 02 2 10 of a gram of alcohol per 100 milliliters of blood the drinker may feel relaxed and carefree',
 '0 020514 t h',
 '0 03',
 '0 03 from sun microsystems

In [38]:
# Check if it contains suffix
def bisect_contains_check(suffix_list, prefix):
    try:
        return suffix_list[bisect_left(suffix_list, prefix)].startswith(prefix)
    except IndexError:
        return False

# Returns the prefix keys
def bisect_list_slice(suffix_list, prefix):
    return suffix_list[bisect_left(suffix_list, prefix):
         bisect_left(suffix_list, prefix[:-1] + chr(ord(prefix[-1])+1))]

candidate_list = []

for row in tqdm_notebook(prefix_queries.itertuples(), total=prefix_queries.shape[0]):
    words = row.Prefix.split()
    endterm = words[-1]
    no_endterm = " ".join(words[:-1])
    
    if (bisect_contains_check(suff_set_sorted, endterm)):
        temp_keys = bisect_list_slice(suff_set_sorted, endterm)
        
        temp_suffix_dict = Counter()
        
        for key in temp_keys:
            temp_suffix_dict[key] = suffix_dict.get(key)
            
        temp_suffix_list = temp_suffix_dict.most_common()[:10]
        
        for j in temp_suffix_list: 
            # Last four will be filled as [5]: 'suffix', [6]: 'Hist_Suffix_freq', [7]: 'Synthetic_query' [8]: 'matching'
            temp_list = [row.Query_clean, row.Qid, row.Prefix, '', '', '', '']
            temp_list[3] = j[0]
            temp_list[4] = j[1]
            temp_list[5] = str(no_endterm + " " + str(j[0]))
            temp_list[6] = 0
            
            if str(row.Query_clean) == str(no_endterm + " "+ str(j[0])):
                temp_list[6] = 1
            
            # Add to synthetic query list
            candidate_list.append(temp_list)
    
syn_candidate_queries = pd.DataFrame.from_records(candidate_list)
syn_candidate_queries.columns = ['Query_clean', 'Qid', 'Prefix', 'Suffix', 'Hist_Suffix_freq', 'Synthetic_query', 'Synthetic_match']    
    
syn_candidate_queries.head

HBox(children=(IntProgress(value=0, max=106433), HTML(value='')))




<bound method NDFrame.head of                     Query_clean         Qid              Prefix  \
0       joel osteen screensaver       qid:1              joel o   
1       joel osteen screensaver       qid:1              joel o   
2       joel osteen screensaver       qid:1              joel o   
3       joel osteen screensaver       qid:1              joel o   
4       joel osteen screensaver       qid:1              joel o   
...                         ...         ...                 ...   
973964       naked charmed ones  qid:106433  naked charmed ones   
973965       naked charmed ones  qid:106433  naked charmed ones   
973966       naked charmed ones  qid:106433  naked charmed ones   
973967       naked charmed ones  qid:106433  naked charmed ones   
973968       naked charmed ones  qid:106433  naked charmed ones   

                      Suffix  Hist_Suffix_freq  \
0                        org              8693   
1                       ohio              1346   
2              

In [37]:
# def printendterm(prefix):
#     words = prefix.split()
#     endterm = words[-1]
#     no_endterm = " ".join(words[:-1])
#     print(endterm)
    
# printendterm("cheapest flight fro")
# printendterm("cheapest flight from")
# printendterm("cheapest flight from ")
# printendterm("cheapest flight from n")

fro
from
from
n


In [39]:
syn_candidate_queries

Unnamed: 0,Query_clean,Qid,Prefix,Suffix,Hist_Suffix_freq,Synthetic_query,Synthetic_match
0,joel osteen screensaver,qid:1,joel o,org,8693,joelorg,0
1,joel osteen screensaver,qid:1,joel o,ohio,1346,joelohio,0
2,joel osteen screensaver,qid:1,joel o,online,1103,joelonline,0
3,joel osteen screensaver,qid:1,joel o,om,930,joelom,0
4,joel osteen screensaver,qid:1,joel o,of america,785,joelof america,0
...,...,...,...,...,...,...,...
973964,naked charmed ones,qid:106433,naked charmed ones,onesies,2,naked charmedonesies,0
973965,naked charmed ones,qid:106433,naked charmed ones,onesite realpage com,2,naked charmedonesite realpage com,0
973966,naked charmed ones,qid:106433,naked charmed ones,onestepahead com,2,naked charmedonestepahead com,0
973967,naked charmed ones,qid:106433,naked charmed ones,ones 2n8n9n,1,naked charmedones 2n8n9n,0


In [None]:
temp_df = syn_candidate_queries.loc[syn_candidate_queries['Synthetic_match'] == 1]
temp_df.head

#### Save samples in pickle file

In [323]:
prefix_queries.to_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/syn_candidate_queries_5m.pickle')

In [324]:
prefix_queries = pd.read_pickle('/Users/rwkoops/PycharmProjects/IR_project/IR_project_02/created_sample/syn_candidate_queries_5m.pickle')

In [332]:
#suffix_dict.get('0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')
print(len(suff_set_sorted))
print(len(suffixes))
print(suff_set_sorted[:10])

5233272
13923638
['0', '0 0', '0 0 0', '0 0 0 0', '0 0 0 0 0', '0 0 0 0 0 0', '0 0 0 0 0 0 0', '0 0 0 0 0 0 0 0', '0 0 0 0 0 0 0 0 0', '0 0 0 0 0 0 0 0 0 0']


In [22]:
# Debug
print(datetime.datetime.now().time())

00:06:50.605594
