In [232]:
import pandas as pd
import numpy as np

In [233]:
df = pd.read_csv('./output.csv')
df.head()

Unnamed: 0,docno,text
0,CACM-2876,intentional resolution of privacy protection i...
1,CACM-0298,a 48-bit pseudo-random number generator a new ...
2,CACM-0236,soviet cybernetics and computer this article r...
3,CACM-2155,toward an understanding of data structures thi...
4,CACM-2191,algorithm 410 partial sorting m1 cacm may 1971...


In [234]:
# Get all document ids
doc_ids = df['docno'].unique()
doc_ids

array(['CACM-2876', 'CACM-0298', 'CACM-0236', ..., 'CACM-2772',
       'CACM-0833', 'CACM-0578'], dtype=object)

In [235]:
# Make a dictionary of document ids and document text
doc_dict = {}
for doc_id in doc_ids:
    doc_dict[doc_id] = df[df['docno'] == doc_id]['text'].values[0]
doc_dict

{'CACM-2876': 'intentional resolution of privacy protection in database systems traditionally privacy protection in database systems is understood to be the control over what information a given user can get from a database this paper is concerned with another independent dimension of privacy protection the control over what a user is allowed to do with a piece of information supplied to him by the database the ability to condition the supply of information on its intended use is called here intentional resolution of privacy protection the practical importance of intentional resolution is demonstrated by several examples and its realization is discussed it is shown that intentional resolution can be achieved but that it involves a radical change from the traditional approach to the process of user-database interaction in particular it appears to be necessary for the database to impose a certain amount of control over the internal behavior of users programs which interact with it a mode

In [236]:
import string
def remove_punctuation_and_numbers(input_text):
    exclude = set(string.punctuation + string.digits)
    cleaned_text = ''.join(char for char in input_text if char not in exclude)
    return cleaned_text

def remove_specialcharacters(text):
    no_special = "".join([c for c in text if c not in string.punctuation])
    return no_special

def clean_text(text):
    text = text.lower()
    text = remove_punctuation_and_numbers(text)
    text = remove_specialcharacters(text)
    return text

In [237]:
import csv
import pandas as pd 
import numpy as np

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer

In [238]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(text):
    text = [word for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

df['text'] = df['text'].apply(lambda x: remove_stopwords(x.split()))
df['text']

0       intentional resolution privacy protection data...
1       48-bit pseudo-random number generator new 48-b...
2       soviet cybernetics computer article records ob...
3       toward understanding data structures paper pre...
4       algorithm 410 partial sorting m1 cacm may 1971...
                              ...                        
3199    trie memory cacm september 1960 fredkin e ca60...
3200    permanent function square matrix ii algorithm ...
3201    maintenance opportunity list class-teacher tim...
3202    reduction symmetric bandmatrix triple diagonal...
3203    generation partitions part-count form algorith...
Name: text, Length: 3204, dtype: object

In [239]:
# Tokenization 
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text)

tokenized_docs= df['text'].apply(lambda x: tokenize(x))
tokenized_docs

[nltk_data] Error loading punkt: <urlopen error [Errno 32] Broken
[nltk_data]     pipe>


0       [intentional, resolution, privacy, protection,...
1       [48-bit, pseudo-random, number, generator, new...
2       [soviet, cybernetics, computer, article, recor...
3       [toward, understanding, data, structures, pape...
4       [algorithm, 410, partial, sorting, m1, cacm, m...
                              ...                        
3199    [trie, memory, cacm, september, 1960, fredkin,...
3200    [permanent, function, square, matrix, ii, algo...
3201    [maintenance, opportunity, list, class-teacher...
3202    [reduction, symmetric, bandmatrix, triple, dia...
3203    [generation, partitions, part-count, form, alg...
Name: text, Length: 3204, dtype: object

In [240]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

lemmatized_text = df['text'][:1000].apply(lambda x: lemmatize_text(x))
lemmatized_text

[nltk_data] Error loading wordnet: Remote end closed connection
[nltk_data]     without response


0      i n t e n t i o n a l   r e s o l u t i o n   ...
1      4 8 - b i t   p s e u d o - r a n d o m   n u ...
2      s o v i e t   c y b e r n e t i c s   c o m p ...
3      t o w a r d   u n d e r s t a n d i n g   d a ...
4      a l g o r i t h m   4 1 0   p a r t i a l   s ...
                             ...                        
995    e x p o n e n t i a l   i n t e g r a l   a l ...
996    c o m p l e t e   e l l i p t i c   i n t e g ...
997    n o t e   t r i p l e - p r e c i s i o n   f ...
998    n u m e r i c a l   s o l u t i o n   n o n l ...
999    s e l f - s t a b i l i z i n g   s y s t e m ...
Name: text, Length: 1000, dtype: object

In [241]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_text(text):
    text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    return text

stemmed_text = df['text'].apply(lambda x: stem_text(x))
stemmed_text


0       i n t e n t i o n a l   r e s o l u t i o n   ...
1       4 8 - b i t   p s e u d o - r a n d o m   n u ...
2       s o v i e t   c y b e r n e t i c s   c o m p ...
3       t o w a r d   u n d e r s t a n d i n g   d a ...
4       a l g o r i t h m   4 1 0   p a r t i a l   s ...
                              ...                        
3199    t r i e   m e m o r y   c a c m   s e p t e m ...
3200    p e r m a n e n t   f u n c t i o n   s q u a ...
3201    m a i n t e n a n c e   o p p o r t u n i t y ...
3202    r e d u c t i o n   s y m m e t r i c   b a n ...
3203    g e n e r a t i o n   p a r t i t i o n s   p ...
Name: text, Length: 3204, dtype: object

In [242]:
with open('./PSEUDO_RELEVANCE_FEEDBACK 2/query.txt', 'r') as f:
    queries = f.readlines()
queries = [query.strip() for query in queries]
queries

['what articles exist which deal with tss time sharing system  an operating system for ibm computers',
 'i am interested in articles written either by prieve or udo pooch prieve b pooch u',
 'intermediate languages used in construction of multi-targeted compilers tcoll',
 'i m interested in mechanisms for communicating between disjoint processes possibly but not exclusively in a distributed environment i would rather see descriptions of complete mechanisms with or without implementations as opposed to theoretical work on the abstract problem remote procedure calls and message-passing are examples of my interests',
 'i d like papers on design and implementation of editing interfaces window-managers command interpreters etc the essential issues are human interface design with views on improvements to user efficiency effectiveness and satisfaction',
 'interested in articles on robotics motion planning particularly the geometric and combinatorial aspects we are not interested in the dynami

In [243]:
# Convert the queries to a dataframe with 2 columns: query_id and query_text
query_df = pd.DataFrame()
query_df['query'] = [clean_text(query) for query in queries]
query_df['query_id'] = [i+1 for i in range(len(queries))]
query_df

Unnamed: 0,query,query_id
0,what articles exist which deal with tss time s...,1
1,i am interested in articles written either by ...,2
2,intermediate languages used in construction of...,3
3,i m interested in mechanisms for communicating...,4
4,i d like papers on design and implementation o...,5
...,...,...
59,hardware and software relating to database man...,60
60,information retrieval articles by gerard salto...,61
61,results relating parallel complexity theory bo...,62
62,algorithms for parallel computation and especi...,63


In [244]:
# Convert the cacm.rel.txt file to a dataframe with 3 columns: query_id, doc_id and relevance

with open('./PSEUDO_RELEVANCE_FEEDBACK 2/cacm.rel.txt', 'r') as f:
    rels = f.readlines()

rels = [rel.strip() for rel in rels]
rels
for i in range(len(rels)):
    rels[i] = rels[i].split()
rels

# Convert the rels list to a dataframe
rel_df = pd.DataFrame(rels)
rels_df = pd.DataFrame()
rels_df['query_id'] = rel_df[0]
rels_df['Q0'] = rel_df[1]
rels_df['doc_id'] = rel_df[2]
rels_df['relevance'] = rel_df[3]
rels_df

Unnamed: 0,query_id,Q0,doc_id,relevance
0,1,Q0,CACM-1410,1
1,1,Q0,CACM-1572,1
2,1,Q0,CACM-1605,1
3,1,Q0,CACM-2020,1
4,1,Q0,CACM-2358,1
...,...,...,...,...
791,63,Q0,CACM-2714,1
792,63,Q0,CACM-2973,1
793,63,Q0,CACM-3075,1
794,63,Q0,CACM-3156,1


In [245]:
def answer_passage_retrieval(query, documents, top_n):
    query = clean_text(query)
    documents['text'] = documents['text'].apply(clean_text)
    tokenized_corpus = [doc.split(" ") for doc in documents['text']]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    doc_scores = list(enumerate(doc_scores))
    doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)
    doc_scores = doc_scores[:top_n]
    print(doc_scores)
    doc_ids = [str(doc_score[0]) for doc_score in doc_scores]
    doc_scores = [str(doc_score[1]) for doc_score in doc_scores]
    return doc_ids, doc_scores

In [246]:
answer_passage_retrieval('where are wind power installations located', df, 10)

[(2424, 12.726876963153472), (556, 7.606404129379686), (81, 7.065989216970452), (3033, 6.771587886080988), (855, 6.647375647758653), (1234, 6.585470082300587), (818, 6.48910101262403), (1186, 6.354541723869781), (2983, 6.354541723869781), (1425, 6.310920225030784)]


(['2424', '556', '81', '3033', '855', '1234', '818', '1186', '2983', '1425'],
 ['12.726876963153472',
  '7.606404129379686',
  '7.065989216970452',
  '6.771587886080988',
  '6.647375647758653',
  '6.585470082300587',
  '6.48910101262403',
  '6.354541723869781',
  '6.354541723869781',
  '6.310920225030784'])

In [247]:
doc_ids_list = []
doc_scores_list = []
for query in queries:
    doc_ids, doc_scores = answer_passage_retrieval(query, df, 10)
    doc_ids_list.append(doc_ids)
    doc_scores_list.append(doc_scores)

[(2077, 22.539189585537244), (2821, 21.699833941219886), (2564, 20.823839541533356), (1758, 20.272586233494053), (944, 20.225483093872636), (975, 20.06630788607871), (536, 19.84906366629255), (3103, 18.97144785358931), (2362, 18.634185623287724), (442, 18.28373097017426)]
[(2505, 19.726112676123048), (2918, 13.692514708384126), (2584, 10.09313914087927), (846, 9.635302714279678), (1010, 9.568888560770173), (1929, 9.148160060918242), (3194, 9.11595393338394), (1262, 9.058863090951894), (1201, 8.53804831775673), (2988, 8.326675386243068)]
[(1669, 8.77701740548292), (2005, 8.664380215076804), (2989, 8.541198525114762), (795, 8.506367934281576), (1098, 7.932638230748457), (2664, 7.915953428314216), (261, 7.8154635830054815), (1581, 7.743316742479998), (2623, 7.67192532446228), (659, 7.320863020480486)]
[(593, 27.171137473229926), (2479, 23.057851415428544), (3042, 22.5088970526574), (1169, 20.39614706417374), (1899, 20.29773836261787), (2049, 19.834960128256597), (3194, 18.23190786676788),

In [248]:
# Create a dataframe with 2 columns: query_id and doc_id
doc_ids_df = pd.DataFrame()
doc_ids_df['query_id'] = query_df['query_id']
doc_ids_df['doc_ids'] = doc_ids_list
doc_ids_df

Unnamed: 0,query_id,doc_ids
0,1,"[2077, 2821, 2564, 1758, 944, 975, 536, 3103, ..."
1,2,"[2505, 2918, 2584, 846, 1010, 1929, 3194, 1262..."
2,3,"[1669, 2005, 2989, 795, 1098, 2664, 261, 1581,..."
3,4,"[593, 2479, 3042, 1169, 1899, 2049, 3194, 1262..."
4,5,"[2122, 2964, 968, 1996, 1884, 1211, 1881, 975,..."
...,...,...
59,60,"[1713, 1400, 2746, 2098, 1431, 2409, 831, 601,..."
60,61,"[2522, 239, 2549, 1705, 1460, 861, 1393, 2945,..."
61,62,"[3130, 2237, 1159, 725, 646, 2465, 568, 630, 2..."
62,63,"[682, 2849, 567, 182, 2727, 1884, 70, 630, 193..."


In [249]:
# Convert the rels_df dataframe by removing the Q0 column and grouping by query_id
rels_df = rels_df.drop(columns=['Q0'])
rels_df = rels_df.groupby('query_id').agg(list)
rels_df.sort_values(by=['query_id'], inplace=True)
rels_df

Unnamed: 0_level_0,doc_id,relevance
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[CACM-1410, CACM-1572, CACM-1605, CACM-2020, C...","[1, 1, 1, 1, 1]"
10,"[CACM-46, CACM-141, CACM-392, CACM-950, CACM-1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
11,"[CACM-1043, CACM-1188, CACM-1306, CACM-1358, C...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
12,"[CACM-1523, CACM-2080, CACM-2246, CACM-2629, C...","[1, 1, 1, 1, 1]"
13,"[CACM-115, CACM-1223, CACM-1231, CACM-1551, CA...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
14,"[CACM-74, CACM-117, CACM-232, CACM-776, CACM-8...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15,"[CACM-1231, CACM-1551, CACM-1613, CACM-1947, C...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
16,"[CACM-1746, CACM-1749, CACM-1828, CACM-1854, C...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
17,"[CACM-115, CACM-405, CACM-1134, CACM-1223, CAC...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
18,"[CACM-1158, CACM-1215, CACM-1262, CACM-1471, C...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [250]:
# Total number of relevant documents for each query
rels_df['relevant_docs_count'] = rels_df['relevance'].apply(lambda x: len(x))
# Drop the relevance column
rels_df = rels_df.drop(columns=['relevance'])
rels_df

Unnamed: 0_level_0,doc_id,relevant_docs_count
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[CACM-1410, CACM-1572, CACM-1605, CACM-2020, C...",5
10,"[CACM-46, CACM-141, CACM-392, CACM-950, CACM-1...",35
11,"[CACM-1043, CACM-1188, CACM-1306, CACM-1358, C...",19
12,"[CACM-1523, CACM-2080, CACM-2246, CACM-2629, C...",5
13,"[CACM-115, CACM-1223, CACM-1231, CACM-1551, CA...",11
14,"[CACM-74, CACM-117, CACM-232, CACM-776, CACM-8...",44
15,"[CACM-1231, CACM-1551, CACM-1613, CACM-1947, C...",10
16,"[CACM-1746, CACM-1749, CACM-1828, CACM-1854, C...",17
17,"[CACM-115, CACM-405, CACM-1134, CACM-1223, CAC...",16
18,"[CACM-1158, CACM-1215, CACM-1262, CACM-1471, C...",11


In [251]:
# Run the answer_passage_retrieval function for each query and store the results in a dataframe
for i in range(len(query_df)):
    query = query_df['query'][i]
    doc_ids, doc_scores = answer_passage_retrieval(query, df, 10)
    doc_ids_df['doc_ids'][i] = doc_ids
doc_ids_df


[(2077, 22.539189585537244), (2821, 21.699833941219886), (2564, 20.823839541533356), (1758, 20.272586233494053), (944, 20.225483093872636), (975, 20.06630788607871), (536, 19.84906366629255), (3103, 18.97144785358931), (2362, 18.634185623287724), (442, 18.28373097017426)]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doc_ids_df['doc_ids'][i] = doc_ids


[(2505, 19.726112676123048), (2918, 13.692514708384126), (2584, 10.09313914087927), (846, 9.635302714279678), (1010, 9.568888560770173), (1929, 9.148160060918242), (3194, 9.11595393338394), (1262, 9.058863090951894), (1201, 8.53804831775673), (2988, 8.326675386243068)]
[(1669, 8.77701740548292), (2005, 8.664380215076804), (2989, 8.541198525114762), (795, 8.506367934281576), (1098, 7.932638230748457), (2664, 7.915953428314216), (261, 7.8154635830054815), (1581, 7.743316742479998), (2623, 7.67192532446228), (659, 7.320863020480486)]
[(593, 27.171137473229926), (2479, 23.057851415428544), (3042, 22.5088970526574), (1169, 20.39614706417374), (1899, 20.29773836261787), (2049, 19.834960128256597), (3194, 18.23190786676788), (1262, 18.117726181903787), (2993, 17.876854114606584), (3056, 17.871436573717585)]
[(2122, 26.58621178451237), (2964, 25.602101117797645), (968, 22.96114364073947), (1996, 19.374587086820263), (1884, 18.369521645943692), (1211, 18.3271221062949), (1881, 18.21040468372555

Unnamed: 0,query_id,doc_ids
0,1,"[2077, 2821, 2564, 1758, 944, 975, 536, 3103, ..."
1,2,"[2505, 2918, 2584, 846, 1010, 1929, 3194, 1262..."
2,3,"[1669, 2005, 2989, 795, 1098, 2664, 261, 1581,..."
3,4,"[593, 2479, 3042, 1169, 1899, 2049, 3194, 1262..."
4,5,"[2122, 2964, 968, 1996, 1884, 1211, 1881, 975,..."
...,...,...
59,60,"[1713, 1400, 2746, 2098, 1431, 2409, 831, 601,..."
60,61,"[2522, 239, 2549, 1705, 1460, 861, 1393, 2945,..."
61,62,"[3130, 2237, 1159, 725, 646, 2465, 568, 630, 2..."
62,63,"[682, 2849, 567, 182, 2727, 1884, 70, 630, 193..."


In [252]:
rel_doc_ids=[]
for i in range(len(rels_df)):
    # Strip the CACM- prefix from the doc_id and append
    rel_doc_ids.append([doc_id[6:] for doc_id in rels_df['doc_id'][i]])
rel_doc_ids
rels_df['doc_id'] = rel_doc_ids
rels_df

Unnamed: 0_level_0,doc_id,relevant_docs_count
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[410, 572, 605, 020, 358]",5
10,"[6, 41, 92, 50, 158, 198, 262, 380, 471, 601, ...",35
11,"[043, 188, 306, 358, 396, 491, 923, 246, 316, ...",19
12,"[523, 080, 246, 629, 127]",5
13,"[15, 223, 231, 551, 625, 795, 807, 947, 495, 5...",11
14,"[4, 17, 32, 76, 27, 50, 51, 52, 54, 55, 56, 57...",44
15,"[231, 551, 613, 947, 263, 495, 598, 685, 701, ...",10
16,"[746, 749, 828, 854, 960, 070, 114, 342, 376, ...",17
17,"[15, 05, 134, 223, 231, 535, 551, 613, 807, 93...",16
18,"[158, 215, 262, 471, 613, 811, 060, 175, 413, ...",11


In [253]:
# Convert the groupby object to a dataframe
rels_df = rels_df.reset_index()
rels_df


Unnamed: 0,query_id,doc_id,relevant_docs_count
0,1,"[410, 572, 605, 020, 358]",5
1,10,"[6, 41, 92, 50, 158, 198, 262, 380, 471, 601, ...",35
2,11,"[043, 188, 306, 358, 396, 491, 923, 246, 316, ...",19
3,12,"[523, 080, 246, 629, 127]",5
4,13,"[15, 223, 231, 551, 625, 795, 807, 947, 495, 5...",11
5,14,"[4, 17, 32, 76, 27, 50, 51, 52, 54, 55, 56, 57...",44
6,15,"[231, 551, 613, 947, 263, 495, 598, 685, 701, ...",10
7,16,"[746, 749, 828, 854, 960, 070, 114, 342, 376, ...",17
8,17,"[15, 05, 134, 223, 231, 535, 551, 613, 807, 93...",16
9,18,"[158, 215, 262, 471, 613, 811, 060, 175, 413, ...",11


In [254]:
rels_df

Unnamed: 0,query_id,doc_id,relevant_docs_count
0,1,"[410, 572, 605, 020, 358]",5
1,10,"[6, 41, 92, 50, 158, 198, 262, 380, 471, 601, ...",35
2,11,"[043, 188, 306, 358, 396, 491, 923, 246, 316, ...",19
3,12,"[523, 080, 246, 629, 127]",5
4,13,"[15, 223, 231, 551, 625, 795, 807, 947, 495, 5...",11
5,14,"[4, 17, 32, 76, 27, 50, 51, 52, 54, 55, 56, 57...",44
6,15,"[231, 551, 613, 947, 263, 495, 598, 685, 701, ...",10
7,16,"[746, 749, 828, 854, 960, 070, 114, 342, 376, ...",17
8,17,"[15, 05, 134, 223, 231, 535, 551, 613, 807, 93...",16
9,18,"[158, 215, 262, 471, 613, 811, 060, 175, 413, ...",11


In [255]:
doc_ids_df

Unnamed: 0,query_id,doc_ids
0,1,"[2077, 2821, 2564, 1758, 944, 975, 536, 3103, ..."
1,2,"[2505, 2918, 2584, 846, 1010, 1929, 3194, 1262..."
2,3,"[1669, 2005, 2989, 795, 1098, 2664, 261, 1581,..."
3,4,"[593, 2479, 3042, 1169, 1899, 2049, 3194, 1262..."
4,5,"[2122, 2964, 968, 1996, 1884, 1211, 1881, 975,..."
...,...,...
59,60,"[1713, 1400, 2746, 2098, 1431, 2409, 831, 601,..."
60,61,"[2522, 239, 2549, 1705, 1460, 861, 1393, 2945,..."
61,62,"[3130, 2237, 1159, 725, 646, 2465, 568, 630, 2..."
62,63,"[682, 2849, 567, 182, 2727, 1884, 70, 630, 193..."


In [256]:
# Calculate the precision for each query
def calculate_precision(query_id):
    doc_ids = doc_ids_df[doc_ids_df['query_id'] == query_id]['doc_ids'].values[0]
    rel_doc_ids = rels_df[query_id]['doc_id'].values[0]
    precision = len(set(doc_ids).intersection(set(rel_doc_ids))) / len(doc_ids)
    return precision

In [257]:
doc_ids = doc_ids_df[doc_ids_df['query_id'] == 1]['doc_ids'].values[0]
doc_ids

['2077', '2821', '2564', '1758', '944', '975', '536', '3103', '2362', '442']

In [258]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def answer_passage_retrieval_ql(query, documents, top_n):
    query = clean_text(query)
    documents['text'] = documents['text'].apply(clean_text)
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(documents['text']))
    tfidf = tfidf.toarray()
    query_vec = vectorizer.transform([query]).toarray()
    scores = np.dot(query_vec, tfidf.T)
    scores = scores[0]
    scores = list(enumerate(scores))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[:top_n]
    doc_ids = [str(score[0]) for score in scores]
    scores = [str(score[1]) for score in scores]
    return doc_ids, scores


In [259]:
tf_idf_scores=[]
tf_idf_docs_ids=[]
for query in queries:
    doc_ids, doc_scores = answer_passage_retrieval_ql(query, df, 10)
    tf_idf_scores.append(doc_scores)
    tf_idf_docs_ids.append(doc_ids)

# Create a dataframe with 2 columns: query_id and doc_id
tf_idf_doc_ids_df = pd.DataFrame()
tf_idf_doc_ids_df['query_id'] = query_df['query_id']
tf_idf_doc_ids_df['doc_ids'] = tf_idf_docs_ids
tf_idf_doc_ids_df

Unnamed: 0,query_id,doc_ids
0,1,"[1974, 2507, 1906, 2077, 3103, 536, 621, 2779,..."
1,2,"[1780, 2918, 2505, 1929, 2887, 1767, 2077, 136..."
2,3,"[1451, 1174, 956, 785, 1785, 2417, 2117, 2813,..."
3,4,"[593, 335, 3056, 2862, 3042, 792, 1711, 34, 20..."
4,5,"[1665, 2964, 1211, 1996, 1881, 1709, 2216, 240..."
...,...,...
59,60,"[1713, 1400, 2746, 2085, 2812, 831, 145, 1431,..."
60,61,"[192, 1705, 2522, 1488, 1502, 1393, 2378, 239,..."
61,62,"[3130, 2237, 725, 1159, 2021, 364, 196, 1352, ..."
62,63,"[700, 2849, 182, 2289, 1352, 1933, 70, 1267, 2..."


In [260]:
def calculate_average_precision(query, documents, top_n):
    doc_ids, doc_scores = answer_passage_retrieval(query, documents, top_n)
    relevant_docs = pd.read_csv('./PSEUDO_RELEVANCE_FEEDBACK 2/relevant_docs.csv')
    relevant_docs = relevant_docs[relevant_docs['query'] == query]
    relevant_docs = relevant_docs['doc_id'].values
    print(relevant_docs)
    precision = []
    for i in range(1, top_n+1):
        if doc_ids[i-1] in relevant_docs:
            precision.append(len(precision)/(i))
    return np.mean(precision)