In [1]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
#from __future__ import division
#import matplotlib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")
from string import punctuation

#remove "num", because "num" has the highest term frequency(45538) at the orinal file, 
#the second most frequent term only has a freq. of 3750
esw = esw + ['abstract', 'ci', 'hr','l','pubmed', 'num'] 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\D070678\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from collections import defaultdict
#from gensim import corpora

In [4]:
import datetime

In [5]:
####read file

#read train.docs file
train_doc_file = pd.read_csv("train.docs", encoding = 'utf-8', sep='\t', header=None)
train_doc_file.columns = ['id', 'text']

#read train.nontopic-titles.queries file
train_query_file = pd.read_csv("train.nontopic-titles.queries", encoding = 'utf-8', sep='\t', header=None)
train_query_file.columns = ['id', 'text']

#read example query file, only one query
#train_query_file = pd.read_csv("example.queries", encoding = 'utf-8', sep='\t', header=None)
#train_query_file.columns = ['id', 'text']

      
    

# Part 1: build functions for Tiered Index IR method

In [6]:

## def preprocessing functions which fit with our dataset
# input file: train_doc_file["text"]/ train_query_file["text"]

def removeStopwords(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.lower().split() if word not in esw])
        file[i] = el
        i += 1
def removePunctuation(file):
    i = 0
    for el in file:
        el = ' '.join([word for word in el.lower().split() if word not in punctuation])
        file[i] = el
        i += 1

def stemming(file):
    ps = PorterStemmer()
    i = 0
    for el in file:
        el = ' '.join([ps.stem(word) for word in el.lower().split() ])
        file[i] = el
        i += 1
  

In [7]:
# run this before function getTieredIndex(doc_File, query_File)
#get the term frequency for specific term in a list

def getTermFrequency(word, wordList):
    
    if word in wordList:
        frequency = defaultdict(int)
        for el in wordList:
            frequency[el]+=1

        return frequency[word]  
    else:
        print("Error:", word, "is not in the wordList")
        
        
    

In [8]:
#@ run this before function getTieredIndex(doc_File, query_File)

# sort the docs in for each word by term frequency
# return a sorted tiers dictionary for each token in query file


def sortingDocByTF(vocabulary, docs_clustering ):
    
    sorted_one_level_tier = defaultdict(list)
    for token in vocabulary:
        m = docs_clustering[token]
        l = []
        r = []
        for i in range(len(m)):
            l.append(m[i][0])
            r.append(m[i][1])

        new_list = []
        while m:  
            # find index of maximum item
            max_index = r.index(max(r)) 

            # remove item with pop() and append to sorted list
            r.pop(max_index)
            new_list.append(m.pop(max_index))

        sorted_one_level_tier[token] = new_list
    return sorted_one_level_tier

In [10]:
### create DTM

 ## get DTM, weighted by tfidf, the sqrt of each doc vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

def get_DTM_tfidf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    #print(vocabulary)

    tfidf_transformer = TfidfTransformer()
    
    X_train_tfidf = CountVectorizer(vocabulary = vocabulary)
    X_train_tfidf = X_train_tfidf.fit_transform(file)
    
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_tfidf)
    return X_train_tfidf


## get DTM, weighted by term frequency
def get_DTM_tf(file):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(file)
    vocabulary = list(train_count_vect.vocabulary_.keys())

    X_train_tf = CountVectorizer(vocabulary = vocabulary)
    X_train_tf = X_train_tf.fit_transform(file)
    return X_train_tf


In [11]:
###Generate the query vector
# get_QueryVector_tfidf helps to get the tiidf weighted query vector, the sqrt of the query vector is 1
# therefore, in the retrive phase, 
# dot product of the doc vector and query vector can be used to represent the cosine similarity

def get_QueryVector_tfidf(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    
    if type(queryFile) is str:
        query_vect = []
        query = queryFile
        query = query.split()
        frequency = defaultdict(int)
        for el in vocabulary:
            if el in query:
                frequency[el]+= 1
            else:
                frequency[el] = 0
        query_vect = list(dict(frequency).values())/np.linalg.norm(list(dict(frequency).values()))
        
    else:
        tfidf_transformer = TfidfTransformer()
        
        query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
        query_vect = query_vect.fit_transform(queryFile)
        
        query_vect = tfidf_transformer.fit_transform(query_vect)
    return query_vect



###Generate the query vector, weighted by term frequency
def get_QueryVector(queryFile, docFile):
    train_count_vect = CountVectorizer() #remove english stopwords
    X_train_counts = train_count_vect.fit_transform(docFile)
    vocabulary = list(train_count_vect.vocabulary_.keys())
    
    if type(queryFile) is str:
        query_vect = []
        query = queryFile
        query = query.split()
        frequency = defaultdict(int)
        for el in vocabulary:
            if el in query:
                frequency[el]+= 1
            else:
                frequency[el] = 0
        query_vect = list(dict(frequency).values())
        
    else:
        query_vect = CountVectorizer(analyzer = "word", vocabulary = vocabulary)
        query_vect = query_vect.fit_transform(queryFile)
        
    return query_vect

In [9]:
# retrun: a defaultdict, it stores for each term, the docID of docs that contain the word
# doc_File: the orgional imported doc file
# query_File: the origional imported query file

def getTieredIndex(doc_File, query_File):
    
    documents_doc = doc_File['text']
    documents_id = list(doc_File['id'])
    documents_query = query_File['text']
    
    
    if type(documents_query) is str:
        print("Error: the input query_File should be an interate file") 
        
    else: 
        ##################### create doc tiers for each term in each query ###############################
        
        texts_doc = [[word for word in document.split()] for document in documents_doc] 
        texts_query = [[word for word in document.split()] for document in documents_query]

        vocabulary = list()
        for text in texts_query:
            for word in text:
                vocabulary.append(word)
        vocabulary = list(set(vocabulary)) #get vocabulary list of query file        
        
        frequency = defaultdict(int) #initiate dictionary

        for text in texts_query:
             for token in text:
                frequency[token] += 1

        tokens_query = [[token for token in text if frequency[token] > 0]
              for text in texts_query]

        docs_clustering = defaultdict(list)

        ### for each term in vocabulary, get all of the (doc, tf) pairs, docs are docs that contain the term 
        for word in vocabulary:
            for i in range(len(texts_doc)):
                if word in texts_doc[i]:
                    docs_clustering[word].append([documents_id[i], getTermFrequency(word, texts_doc[i])]) 
    
        sorted_one_level_tier = sortingDocByTF(vocabulary, docs_clustering )
    
        return sorted_one_level_tier        


In [12]:
#@ run this before function IRByTieredIndex(query_File, doc_Tiers, query_vect, DTM_tfidf, doc_File ):

# return the minium intersection of tieres list
# input d: is a list of tieres, each tier is the index list of doc which contains a term of a query

def intersect(d):
    resultsList = list()
    l= []
    if len(d)> 0:
        
        result = d[0]
        for el in d:
            
            if len(result) >0:
                resultsList.append(result)
                result = set(result).intersection(el)
            else:
                result = resultsList[len(resultsList)-1]
        #print(resultsList)
        if len(resultsList) >0:
            for el in resultsList:
                l.append(len(el))
            #print(resultsList[np.argmin(l)])
            return(resultsList[np.argmin(l)])
    else:
        for el in d:
            l.append(len(el))    
        return(d[np.argmin(l)])
        
            

In [13]:
### retrieve through tiered index model
# get the doc and similarity for each query

# doc_File: the orgional imported doc file
# query_File: the origional imported query file
# doc_Tiers: output of the function getTieredIndex(doc_File, query_File)

# query_vect: tf weighted query-term matrix
# DTM_tfidf: tfidf weighted DTM
    

def IRByTieredIndex(query_File, doc_Tiers, doc_File ):
    
    documents_id = list(doc_File['id'])
    query_id = list(query_File['id'])
    
    query_vect = get_QueryVector_tfidf(query_File.text, doc_File.text)
    DTM_tfidf = get_DTM_tfidf(doc_File.text)
    
    query_file = query_File['text']
    doc_tiers = doc_Tiers
    
    if type(query_file) is str:
        print("Error: the input query_File should be an interate text file") 
        
    else: 
        r =[]
        
        texts_query = [[word for word in q.split()] for q in query_file]
        
        # get vocabulary list for each query 
        for i in range(len(texts_query)):
            texts_query[i] = list(set(texts_query[i]))
        
        
        for i in range(len(texts_query)):
            
            docs_all = [] 
            for word in texts_query[i]:
                tieres = []
                for d in doc_Tiers[word]:
                    tieres.append(d[0])
                if len(tieres)>0:
                    docs_all.append(tieres)
            #print(len(docs_all))
            #print(docs_all)
            if len(docs_all) >0:

                results = list(intersect(docs_all))

                #print(results)

                if len(results) >0:

                    sims = []
                    sims_docs = []
                    for el in results:
                        query_vector = query_vect[i]
                        doc_vector = DTM_tfidf[documents_id.index(el)]
                        sims.append(np.dot(query_vector, doc_vector.transpose())[0,0])
                        sims_docs.append(documents_id.index(el))

                    sims, sims_docs= zip(*sorted(zip(sims, sims_docs), reverse=True))# rank the results 
                    for x in range(len(sims)):
                        if sims[x] >0:
                            
                            r.append([str(query_id[i]), str(documents_id[sims_docs[x]]), sims[x]])
                            #print(str(query_id[i]), 0 ,str(documents_id[sims_docs[x]]), sims[x])
                            
        return r   


# Part 2: get functions run!
1. preprocessing
2. get tieres
3. IRByTieredIndex

# a. without being stemmed
texts are not stemmed

In [14]:

### 1. preprocessing

train_doc_file["text"] = train_doc_file['text'].str.replace('/', ' or ')

train_doc_file["text"] = train_doc_file['text'].str.replace('-', ' and ')

train_query_file["text"] = train_query_file['text'].str.replace('/', ' or ')

train_query_file["text"] = train_query_file['text'].str.replace('-', ' and ')

## remove stopwords
removeStopwords(train_doc_file['text'])
removeStopwords(train_query_file['text'])

### remove punctuation
removePunctuation(train_doc_file['text'])
removePunctuation(train_query_file['text'])

In [15]:
### 2. get tieres

### test for the function getTieredIndex(doc_File, query_File)
# without being stemmed
doc_Tiers = getTieredIndex(train_doc_file, train_query_file)
doc_Tiers

defaultdict(list,
            {'within': [['MED-5007', 3],
              ['MED-1567', 1],
              ['MED-1610', 1],
              ['MED-1630', 1],
              ['MED-3359', 1],
              ['MED-3908', 1],
              ['MED-4104', 1],
              ['MED-4107', 1],
              ['MED-4129', 1],
              ['MED-4140', 1],
              ['MED-4172', 1],
              ['MED-4250', 1],
              ['MED-4253', 1],
              ['MED-5140', 1]],
             'deep': [['MED-5269', 6],
              ['MED-2418', 5],
              ['MED-759', 2],
              ['MED-1027', 2],
              ['MED-2697', 2],
              ['MED-3808', 2],
              ['MED-855', 1],
              ['MED-1467', 1],
              ['MED-2185', 1],
              ['MED-2195', 1],
              ['MED-2201', 1],
              ['MED-2797', 1],
              ['MED-3020', 1],
              ['MED-4198', 1],
              ['MED-4471', 1],
              ['MED-4811', 1],
              ['MED-4916', 1],
    

In [16]:
### 3. IRByTieredIndex

####################IRByTieredIndex result1, need to get the performance result #########################

# without being stemmed, "-" converted to "and"

# test the IR on the whole query file
t1 =  datetime.datetime.now()
IR_results = IRByTieredIndex(train_query_file, doc_Tiers, train_doc_file )
t2 =  datetime.datetime.now()
t = t2-t1

print("running time:", t )
# save result
df = pd.DataFrame(IR_results, columns = ['QUERY_ID', 'DOC_ID', 'sim_results'])

df.to_csv('IRByTieredIndex result1.txt', header=None, index=None, sep=' ', mode='a')
df


running time: 0:00:34.025791


Unnamed: 0,QUERY_ID,DOC_ID,sim_results
0,PLAIN-10,MED-2054,0.310176
1,PLAIN-10,MED-1985,0.307548
2,PLAIN-10,MED-2058,0.295657
3,PLAIN-10,MED-2475,0.284176
4,PLAIN-10,MED-3601,0.280856
5,PLAIN-10,MED-3774,0.274741
6,PLAIN-10,MED-1760,0.272883
7,PLAIN-10,MED-2494,0.265818
8,PLAIN-10,MED-5005,0.257712
9,PLAIN-10,MED-3150,0.255360


# b. Stemmed 
texts are stemmed

In [17]:
### remove stopwords

removeStopwords(train_doc_file['text'])
removeStopwords(train_query_file['text'])

### stemming
stemming(train_doc_file['text'])
stemming(train_query_file['text'])


### remove punctuation
removePunctuation(train_doc_file['text'])
removePunctuation(train_query_file['text'])

In [18]:

### test for the function getTieredIndex(doc_File, query_File)
#stemmed

doc_Tiers_stemmed = getTieredIndex(train_doc_file, train_query_file)
doc_Tiers_stemmed

defaultdict(list,
            {'within': [['MED-5007', 3],
              ['MED-1567', 1],
              ['MED-1610', 1],
              ['MED-1630', 1],
              ['MED-3359', 1],
              ['MED-3908', 1],
              ['MED-4104', 1],
              ['MED-4107', 1],
              ['MED-4129', 1],
              ['MED-4140', 1],
              ['MED-4172', 1],
              ['MED-4250', 1],
              ['MED-4253', 1],
              ['MED-5140', 1]],
             'deep': [['MED-5269', 6],
              ['MED-2418', 5],
              ['MED-759', 2],
              ['MED-1027', 2],
              ['MED-2697', 2],
              ['MED-3808', 2],
              ['MED-855', 1],
              ['MED-1467', 1],
              ['MED-2185', 1],
              ['MED-2195', 1],
              ['MED-2201', 1],
              ['MED-2797', 1],
              ['MED-3020', 1],
              ['MED-4198', 1],
              ['MED-4471', 1],
              ['MED-4811', 1],
              ['MED-4916', 1],
    

In [19]:
####################IRByTieredIndex result2, need to get the performance result #########################

# stemmed, "-" converted to "and"

print("please be patient, this process requires about half a mintute")

# test the IR on the whole query file
t1 =  datetime.datetime.now()
IR_results = IRByTieredIndex(train_query_file, doc_Tiers_stemmed, train_doc_file )
t2 =  datetime.datetime.now()
t = t2-t1

print("running time:", t )
# save result
df = pd.DataFrame(IR_results, columns = ['QUERY_ID', 'DOC_ID', 'sim_results'])
df.to_csv('IRByTieredIndex result2.txt', header=None, index=None, sep=' ', mode='a')
df


running time: 0:00:30.912552


Unnamed: 0,QUERY_ID,DOC_ID,sim_results
0,PLAIN-10,MED-2494,0.374199
1,PLAIN-10,MED-2054,0.341844
2,PLAIN-10,MED-1985,0.335333
3,PLAIN-10,MED-2058,0.333037
4,PLAIN-10,MED-3601,0.320036
5,PLAIN-10,MED-2475,0.311803
6,PLAIN-10,MED-3774,0.303089
7,PLAIN-10,MED-3150,0.281433
8,PLAIN-10,MED-1760,0.279506
9,PLAIN-10,MED-5101,0.278249


# --------------------------------------------------------------------------------------------------------------

# Don't Run Code Below 

In [None]:
### create tfidf weighted DTM for the train.docs file
train_tfidf_stemmed = get_DTM_tfidf(train_doc_file.text)
train_tfidf_stemmed

In [15]:
### create tfidf weighted DTM for the train.docs file
train_tfidf = get_DTM_tfidf(train_doc_file.text)
train_tfidf

<3612x23449 sparse matrix of type '<class 'numpy.float64'>'
	with 310355 stored elements in Compressed Sparse Row format>

In [None]:
### create query vector matrix for the train.nontopic-titles.queries file 
query_vect = get_QueryVector(train_query_file.text, train_doc_file.text)
query_vect

In [None]:
### create query vector matrix for the train.nontopic-titles.queries file 
query_vect_stemmed = get_QueryVector(train_query_file.text, train_doc_file.text)
query_vect_stemmed

In [7]:
######### Don't Run ############################

#texts = [[word for word in document.split()] for document in train_doc_file['text']]

#frequency = defaultdict(int)


#for text in texts:
     #for token in text:
        #frequency[token] += 1

#texts = [[token for token in text if frequency[token] > 0]
          #for text in texts]

#dictionary = corpora.Dictionary(texts)
#corpus = [dictionary.doc2bow(text) for text in texts]

In [76]:
######### Don't Run ############################

# return the minium intersection of tieres list
# input d: is a list of tieres, each tier is the index list of doc which contains a term of a query

def intersect(d):
    if len(d)> 0:
        resultsList = list()
        result = d[0]
        for el in d:
            if len(result) >=10 :
                r = result
                result = set(result).intersection(el)
                if len(result)==0:
                    return r
                    break
                
                
        

In [8]:
######### Don't Run ############################
#vocabulary = list()
#texts_query = [[word for word in document.split()] for document in train_query_file['text']]
#for text in texts_query:
    #for word in text:
        #vocabulary.append(word)
#vocabulary = list(set(vocabulary))
#vocabulary