In [None]:
from __future__ import division
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Download the NLTK stopword corpus and load the English stopword list.
nltk.download("stopwords")
from nltk.corpus import stopwords
esw = stopwords.words("english")
from string import punctuation

# Add extra terms to the stopword list.
# "num" is included because it has the highest term frequency (45538) in the original file,
# while the second most frequent term only has a frequency of 3750.
esw = esw + ['abstract', 'ci', 'hr','l','pubmed', 'num'] 

In [None]:
#### read files

# Read train.docs: tab-separated, UTF-8, no header row; the two columns are named id and text.
train_doc_file = pd.read_csv("train.docs", encoding = 'utf-8', sep='\t', header=None)
train_doc_file.columns = ['id', 'text']

# Read train.nontopic-titles.queries with the same options and the same column names.
train_query_file = pd.read_csv("train.nontopic-titles.queries", encoding = 'utf-8', sep='\t', header=None)
train_query_file.columns = ['id', 'text']



### preprocessing

# Replace every "/" with " or " in the document texts and in the query texts.
train_doc_file["text"] = train_doc_file['text'].str.replace('/', ' or ')
# Fix: the query column must be derived from the query file itself. The previous
# right-hand side read train_doc_file['text'], which overwrote each query (by index
# alignment) with the text of a document.
train_query_file["text"] = train_query_file['text'].str.replace('/', ' or ')

def removeStopwords(file, stop_words=None):
    """Lower-case every text in ``file`` and drop its stopword tokens, in place.

    Each text is lower-cased, split on whitespace, filtered, and re-joined with
    single spaces.

    Parameters
    ----------
    file : mutable sequence of str
        The texts to clean, e.g. a pandas Series or a list. It is overwritten
        element by element; nothing is returned.
    stop_words : iterable of str, optional
        Tokens to remove. When omitted, the notebook-level ``esw`` list is used.

    Returns
    -------
    None
    """
    # A set gives constant-time membership tests instead of scanning the list per token.
    blocked = set(esw if stop_words is None else stop_words)
    cleaned = [
        ' '.join(word for word in text.lower().split() if word not in blocked)
        for text in file
    ]
    # Slice assignment overwrites the existing elements by position. The previous
    # ``file[i] = el`` with a hand-kept counter used ``i`` as a *label* on a Series,
    # which writes to the wrong place whenever the index is not 0..n-1.
    file[:] = cleaned
def removePunctuation(file):
    """Lower-case every text in ``file`` and drop punctuation-only tokens, in place.

    Each text is lower-cased and split on whitespace; a token is dropped when every
    one of its characters is in ``string.punctuation``. Punctuation attached to a
    word (e.g. ``"word,"``) is left untouched, as before.

    Parameters
    ----------
    file : mutable sequence of str
        The texts to clean, e.g. a pandas Series or a list. It is overwritten
        element by element; nothing is returned.

    Returns
    -------
    None
    """
    punctuation_chars = set(punctuation)

    def _is_punctuation(token):
        # The previous ``token in punctuation`` was a substring test against one long
        # string, so tokens such as "--" or "..." were not recognised as punctuation.
        return all(char in punctuation_chars for char in token)

    cleaned = [
        ' '.join(word for word in text.lower().split() if not _is_punctuation(word))
        for text in file
    ]
    # Positional write-back; avoids using a manual counter as a Series label.
    file[:] = cleaned

            
    

In [None]:
### remove stopwords
# Both helpers return nothing; they rewrite the entries of the Series they are given.
removeStopwords(train_doc_file['text'])
removeStopwords(train_query_file['text'])

### remove punctuation
# Runs after stopword removal, on the same two text columns.
removePunctuation(train_doc_file['text'])
removePunctuation(train_query_file['text'])

In [None]:
### create DTM

 ## get DTM, weighted by tfidf
def get_DTM_tfidf(file):
    """Build the tf-idf weighted document-term matrix for a collection of texts.

    ``file`` is an iterable of raw text strings. Terms are counted with a
    default-configured CountVectorizer (no stop-word filtering is set up here), and
    the counts are re-weighted by a default-configured TfidfTransformer.

    Returns the matrix produced by ``TfidfTransformer.fit_transform``.
    """
    term_counts = CountVectorizer().fit_transform(file)
    return TfidfTransformer().fit_transform(term_counts)


## get DTM, weighted by tf only (term frequency, no idf)
def get_DTM_tf(file):
    """Build the term-frequency document-term matrix for a collection of texts.

    ``file`` is an iterable of raw text strings. Terms are counted with a
    default-configured CountVectorizer (no stop-word filtering is set up here), and
    the counts are passed through a TfidfTransformer with ``use_idf=False``, i.e.
    without the inverse-document-frequency factor.

    Returns the matrix produced by that transformer.
    """
    term_counts = CountVectorizer().fit_transform(file)
    tf_weighter = TfidfTransformer(use_idf=False)
    return tf_weighter.fit_transform(term_counts)


In [None]:
### create tfidf weighted DTM for the train.docs file
# One row per entry of train_doc_file.text; later cells read this matrix as a notebook-level name.
train_tfidf = get_DTM_tfidf(train_doc_file.text)
# Bare expression: shown as the cell's output.
train_tfidf

In [None]:
###Generate the query vector
def get_QueryVector(queryFile, docFile=train_doc_file.text):
    """Count-vectorise queries in the term space of the document collection.

    Parameters
    ----------
    queryFile : iterable of str
        The query texts.
    docFile : iterable of str, optional
        The document texts that define the vocabulary. The default expression is
        evaluated once, when this ``def`` statement runs, so it is whatever
        ``train_doc_file.text`` was at that moment.

    Returns
    -------
    The term-count matrix returned by ``CountVectorizer.transform``: one row per
    query, one column per vocabulary term of ``docFile``.
    """
    doc_vectorizer = CountVectorizer()
    doc_vectorizer.fit(docFile)
    # Reuse the fitted vectorizer so every term keeps the column index it has in a
    # matrix built by CountVectorizer().fit_transform(docFile). The previous code
    # passed list(vocabulary_.keys()) to a second CountVectorizer, which numbers
    # columns by list position; key order is not column order, so query columns did
    # not line up with the document matrix and dot products mixed unrelated terms.
    return doc_vectorizer.transform(queryFile)

In [None]:
### create query vector matrix for the train.nontopic-titles.queries file 
# The document texts are passed explicitly, so the function's default docFile is not used here.
query_vect = get_QueryVector(train_query_file.text, train_doc_file.text)
# Bare expression: shown as the cell's output.
query_vect

In [None]:
import math
import random

In [None]:
### Preclustering through randomly selected leaders

# Draw int(sqrt(N)) distinct leader rows from 0..N-1. The previous
# random.randint(0, N) is inclusive at both ends, so it could return N (one past
# the last row, an IndexError later) and could pick the same leader twice, which
# leaves the duplicate's cluster permanently empty.
n_docs = train_tfidf.shape[0]
index = random.sample(range(n_docs), int(math.sqrt(n_docs)))
topic_clustering = [[] for _leader in index]  # one initially empty cluster per leader

if index:
    # Dot product of every document row with every leader row in one sparse product,
    # instead of one np.dot call per (document, leader) pair.
    leader_sims = train_tfidf.dot(train_tfidf[index].T).toarray()
    # argmax returns the first maximum, the same tie-break as sims.index(max(sims)).
    for i, maxsimindex in enumerate(leader_sims.argmax(axis=1)):
        topic_clustering[maxsimindex].append(i)

In [None]:
### information retrieve 
def preclusteringByRandomLeader(docFile, leaderNumber):
    """Assign every document row to the most similar of a set of random leader rows.

    Similarity is the dot product between two rows of ``docFile``.

    Parameters
    ----------
    docFile : 2-D matrix (scipy sparse or numpy)
        One row per document. Needs ``shape``, row indexing by a list of ints,
        ``.T`` and ``.dot``.
    leaderNumber : int
        How many leaders to draw. Leaders are distinct rows, so at most
        ``docFile.shape[0]`` leaders are returned.

    Returns
    -------
    (leaderIndex, topic_clustering)
        ``leaderIndex`` is the list of leader row numbers. ``topic_clustering[k]``
        is the list of row numbers assigned to ``leaderIndex[k]``; ties go to the
        earliest leader in the list. Both are empty when ``docFile`` has no rows.

    Raises
    ------
    ValueError
        If there are documents but ``leaderNumber`` is smaller than 1.
    """
    n_docs = docFile.shape[0]
    if n_docs == 0:
        return [], []
    if leaderNumber < 1:
        raise ValueError("leaderNumber must be at least 1")

    # random.sample draws distinct values from 0..n_docs-1. The previous
    # random.randint(0, n_docs) is inclusive at both ends, so it could return
    # n_docs (out of range) and could select the same leader more than once.
    leaderIndex = random.sample(range(n_docs), min(leaderNumber, n_docs))
    topic_clustering = [[] for _leader in leaderIndex]

    # All document-by-leader dot products at once: shape (n_docs, n_leaders).
    sims = docFile.dot(docFile[leaderIndex].T)
    if hasattr(sims, "toarray"):  # sparse result -> dense for argmax
        sims = sims.toarray()
    nearest_leader = np.asarray(sims).argmax(axis=1)  # first maximum wins on ties

    for doc, cluster in enumerate(nearest_leader.tolist()):
        topic_clustering[cluster].append(doc)

    return leaderIndex, topic_clustering


In [None]:
def _as_dense(matrix):
    """Return ``matrix`` as a dense numpy array (sparse results expose ``toarray``)."""
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


def IRqueryByLeaders(leaderIndex, topic_clustering, queryVector, docMatrix=None):
    """Retrieve, for every query, the matching documents of its closest cluster.

    For each query row the leader with the largest dot product is chosen (first
    one on ties); the query is then compared with every document of that leader's
    cluster, and each document with a similarity above zero is reported.

    One line per hit is printed as ``PLAIN-<q> 0 MED-<doc> <similarity>``, where
    ``<q>`` is the row number of the query in ``queryVector`` and ``<doc>`` is the
    row number of the document in ``docMatrix`` (positions, not ids read from the
    input files).

    Parameters
    ----------
    leaderIndex : list of int
        Row numbers of the leader documents.
    topic_clustering : list of list of int
        ``topic_clustering[k]`` holds the document rows belonging to ``leaderIndex[k]``.
    queryVector : 2-D matrix (scipy sparse or numpy)
        One row per query, same columns as ``docMatrix``.
    docMatrix : 2-D matrix, optional
        The document matrix. When omitted, the notebook-level ``train_tfidf`` is
        used, which is what this function always read before.

    Returns
    -------
    list of (int, int, float)
        The printed hits as ``(query_row, doc_row, similarity)`` tuples, in print
        order. Previously nothing was returned although callers store the result.

    Raises
    ------
    ValueError
        If there are queries but ``leaderIndex`` is empty.
    """
    if docMatrix is None:
        docMatrix = train_tfidf

    results = []
    n_queries = queryVector.shape[0]
    if n_queries == 0:
        return results
    if len(leaderIndex) == 0:
        raise ValueError("leaderIndex is empty: no cluster to search")

    # Similarity of every query with every leader: shape (n_queries, n_leaders).
    leader_sims = _as_dense(queryVector.dot(docMatrix[list(leaderIndex)].T))
    best_cluster = leader_sims.argmax(axis=1)  # first maximum wins on ties

    for q in range(n_queries):
        members = topic_clustering[best_cluster[q]]
        if len(members) == 0:
            continue
        # Similarity of this query with each document of the selected cluster.
        # The q:q+1 slice keeps the query two-dimensional for sparse and dense input.
        member_sims = _as_dense(queryVector[q:q + 1].dot(docMatrix[list(members)].T))[0]
        for doc, sim in zip(members, member_sims):
            if sim > 0:  # keep only documents sharing at least one weighted term
                print("PLAIN-"+ str(q), "0", "MED-" + str(doc), sim)
                results.append((q, doc, float(sim)))

    return results
           
        
    

In [None]:
# Get the leader document indices and the document clusters, with the number of
# leaders = int(math.sqrt(train_tfidf.shape[0])).
# Note: this rebinds the notebook-level name topic_clustering.
leaderIndex,topic_clustering =  preclusteringByRandomLeader( train_tfidf, int(math.sqrt(train_tfidf.shape[0])))

In [None]:
# Get the leader document indices and the document clusters, with a fixed number of 10 leaders.
leaderIndex_10,topic_clustering_10 =  preclusteringByRandomLeader( train_tfidf, 10)

In [None]:
import datetime

In [None]:
# Run the IR on the whole query file with the int(sqrt(N))-leader clustering
# (leaderIndex, topic_clustering) and measure the wall-clock time of the call.
# IRqueryByLeaders prints one line per retrieved document while it runs.
t1 =  datetime.datetime.now()
IR_results = IRqueryByLeaders(leaderIndex, topic_clustering, query_vect )
t2 =  datetime.datetime.now()
t = t2-t1  # elapsed time as a datetime.timedelta
print(IR_results)
print("running time:", t )

In [None]:
# Run the IR on the whole query file with the 10-leader clustering
# (leaderIndex_10, topic_clustering_10) and measure the wall-clock time of the call.
# IRqueryByLeaders prints one line per retrieved document while it runs.
# Note: this rebinds t1, t2, t and IR_results.
t1 =  datetime.datetime.now()
IR_results = IRqueryByLeaders(leaderIndex_10, topic_clustering_10, query_vect )
t2 =  datetime.datetime.now()
t = t2-t1  # elapsed time as a datetime.timedelta
print(IR_results)
print("running time:", t )