# In [None]:
import os
import re
from stemming.porter2 import stem
import glob
import string
import math
 
class Rcv1Doc:
    """Bag-of-words record for a single document.

    Term counts and the running length are filled in incrementally through
    :meth:`add_term`.
    """

    def __init__(self, docID):
        # Identifier supplied by the caller, stored unchanged.
        self.docID = docID
        # Maps each stemmed term to the number of times it was added.
        self.terms = {}
        # Total number of add_term calls, i.e. counted term occurrences.
        self.doc_len = 0

    def get_doc_id(self):
        """Return the identifier given at construction."""
        return self.docID

    def add_term(self, term):
        """Stem ``term`` and record one more occurrence of the result."""
        key = self.stem_term(term)
        self.terms[key] = self.terms.get(key, 0) + 1
        self.doc_len += 1

    def stem_term(self, term):
        """Return ``term`` reduced by the module-level ``stem`` function."""
        return stem(term)

def parse_rcv1v2(stop_words, inputpath):
    """Build a term-frequency collection from the XML files in ``inputpath``.

    For every ``*.xml`` file, the ``itemid`` attribute on the line starting
    with ``<newsitem `` is used as the document ID.  The lines after the one
    starting with ``<text>`` and before the one starting with ``</text>`` are
    tokenized: ``<p>``/``</p>`` markers and digits are removed, punctuation is
    turned into spaces, and each token is lower-cased and stemmed.  A token is
    counted when its stem is longer than two characters and is not in
    ``stop_words``.

    Args:
        stop_words: Iterable of terms to ignore (compared against the stem).
        inputpath: Directory that holds the XML files.

    Returns:
        dict mapping document ID to its ``Rcv1Doc``.  A document is present
        only if at least one term was counted for it.

    Side effects:
        Changes the process working directory to ``inputpath`` and prints the
        ID and term frequencies of every file in which an ID was found.
    """
    collection = {}
    ignored = set(stop_words)  # constant-time membership test per token
    # Build the translation tables once rather than once per line.
    drop_digits = str.maketrans('', '', string.digits)
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    os.chdir(inputpath)
    for file_ in glob.glob("*.xml"):
        docID = None
        in_text = False
        # The context manager closes the file even when the loop exits early
        # at </text>; the bare open() used previously leaked the handle.
        with open(file_) as xml_file:
            for line in xml_file:
                line = line.strip()
                if not in_text:
                    if line.startswith("<newsitem "):
                        for part in line.split():
                            if part.startswith("itemid="):
                                docID = part.split("=")[1].split("\"")[1]
                                break
                    if line.startswith("<text>"):
                        in_text = True
                    continue
                if line.startswith("</text>"):
                    break
                if not docID:
                    continue  # no ID to attribute these terms to

                line = line.replace("<p>", "").replace("</p>", "")
                line = line.translate(drop_digits).translate(punct_to_space)
                for token in line.split():
                    token = token.lower()
                    stemmed = stem(token)
                    if len(stemmed) <= 2 or stemmed in ignored:
                        continue
                    if docID not in collection:
                        collection[docID] = Rcv1Doc(docID)
                    # add_term stems its argument itself, so hand it the
                    # unstemmed token.  Passing the stem would stem twice,
                    # which may alter the term further and make it differ
                    # from query terms, which are stemmed only once.
                    collection[docID].add_term(token)

        if docID:
            # A file whose text yields no countable term has no entry in
            # `collection`; report empty frequencies instead of raising
            # KeyError.
            doc = collection.get(docID)
            print("Document ID:", docID)
            print("Term Frequencies:", doc.terms if doc is not None else {})

    return collection
 
def parse_query(query0, stop_words):
    """Convert a query string into a ``{stemmed term: frequency}`` dict.

    Punctuation is replaced by spaces (not deleted) so that a hyphenated
    query such as "British-Fashion" yields the two tokens "british" and
    "fashion", matching how document text is tokenized elsewhere in this
    file.  Deleting the punctuation would merge them into one token that no
    document term can equal.

    Args:
        query0: Raw query text.
        stop_words: Container of lower-case words to ignore; membership is
            tested on the lower-cased word before stemming.

    Returns:
        dict mapping each stemmed query term to its number of occurrences.
        Tokens that are stop words or contain non-alphabetic characters are
        skipped.
    """
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    query_terms = {}
    for word in query0.translate(punct_to_space).lower().split():
        # split() never yields empty strings, so no emptiness check is needed.
        if word in stop_words or not word.isalpha():
            continue
        stemmed_word = stem(word)
        query_terms[stemmed_word] = query_terms.get(stemmed_word, 0) + 1
    return query_terms
 
def read_stop_words(file_path):
    """Load a comma-separated stop-word list from ``file_path``.

    Whitespace around each entry (including the newline that usually ends the
    file) is stripped and empty entries are dropped; otherwise an entry such
    as ``"of\\n"`` or ``" the"`` would never equal a token.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list of stop words in file order.
    """
    with open(file_path, 'r') as file:
        raw = file.read()
    stripped = (piece.strip() for piece in raw.split(','))
    return [word for word in stripped if word]

def avg_length(coll):
    """Return the mean ``doc_len`` over the documents in ``coll``.

    Args:
        coll: Either a sized iterable of document objects or a dict whose
            values are document objects (the shape ``parse_rcv1v2`` returns).
            Iterating a dict directly would yield its keys, which have no
            ``doc_len``, so dict values are used instead.

    Returns:
        The average document length, or 0 for an empty collection.
    """
    docs = coll.values() if isinstance(coll, dict) else coll
    count = len(docs)
    if count == 0:
        return 0
    return sum(doc.doc_len for doc in docs) / count

def my_bm25(coll, q, df, stop_words=None):
    """Score every document in ``coll`` against query ``q`` with BM25.

    For each query term i that occurs in document D the contribution is::

        log( ((r_i + 0.5) / (R - r_i + 0.5))
             / ((n_i - r_i + 0.5) / (N - n_i - R + r_i + 0.5)) )
        * ((k1 + 1) * f_i)  / (K + f_i)
        * ((k2 + 1) * qf_i) / (k2 + qf_i)

    where ``K = k1 * ((1 - b) + b * dl / avdl)``, ``f_i`` is the frequency of
    the term in D, ``qf_i`` its frequency in the query, ``dl`` the document
    length and ``avdl`` the average document length.  No relevance
    information is used (``R = r_i = 0``); ``k1 = 1.2``, ``k2 = 100``,
    ``b = 0.75``; the logarithm is natural.  Scores can be negative for terms
    that occur in more than half of the documents.

    Args:
        coll: Sequence of document objects, or a dict whose values are
            document objects.
        q: Raw query string; parsed with ``parse_query``.
        df: dict mapping term to the number of documents containing it.
        stop_words: Words to drop from the query.  When omitted, a
            module-level ``stop_words`` is used if one exists, otherwise no
            stop words are removed.

    Returns:
        dict mapping document ID to its BM25 score; documents that share no
        term with the query score 0.0.
    """
    docs = list(coll.values()) if isinstance(coll, dict) else coll
    if stop_words is None:
        # Earlier versions read a module-level `stop_words`; honour it when it
        # is defined but do not fail with NameError when it is not.
        stop_words = globals().get("stop_words", ())

    N = len(docs)
    avg_len = avg_length(docs)
    k1 = 1.2
    k2 = 100
    b = 0.75
    R = 0    # no relevance judgements available
    r_i = 0

    query_terms = parse_query(q, stop_words)

    bm25_scores = {}
    for doc in docs:
        dl = doc.doc_len
        score = 0.0

        for term, qfi in query_terms.items():
            # f_i is the term's frequency in this document, not the document
            # length; the length only enters through the normaliser K.
            fi = doc.terms.get(term, 0)
            if not fi:
                continue
            ni = df.get(term, 0)
            K = k1 * ((1 - b) + b * dl / avg_len)
            relevance_weight = ((r_i + 0.5) / (R - r_i + 0.5)) / (
                (ni - r_i + 0.5) / (N - ni - R + r_i + 0.5)
            )
            doc_tf_weight = (k1 + 1) * fi / (K + fi)
            query_tf_weight = (k2 + 1) * qfi / (k2 + qfi)
            # The logarithm applies to the relevance/IDF factor only.
            score += math.log(relevance_weight) * doc_tf_weight * query_tf_weight

        bm25_scores[doc.get_doc_id()] = score

    return bm25_scores

def main(inputpath=None, output_file="Sairam_Panneerselvam_Q3.txt"):
    """Index the XML documents, rank them for fixed queries, write a report.

    Each ``*.xml`` file in ``inputpath`` becomes one ``Rcv1Doc`` whose ID is
    the file name without extension.  The text between the first ``<text>``
    and the first ``</text>`` is stripped of digits, has punctuation replaced
    by spaces, and every token longer than two characters is lower-cased and
    added to the document.  Tokens are lower-cased because query terms are
    lower-cased by ``parse_query``; without it capitalised words in the
    documents could never match a query term.

    The report lists, for every query, the BM25 score of every document, and
    then the six highest-scoring documents per query.

    Args:
        inputpath: Directory with the XML files.  Defaults to the original
            hard-coded data directory.
        output_file: Report file name.  A relative name is resolved against
            ``inputpath`` because the working directory is changed to it.

    Side effects:
        Changes the process working directory and writes ``output_file``.
    """
    if inputpath is None:
        inputpath = r'C:\Users\0703s\OneDrive - Queensland University of Technology\QUT\IFN 647 TEXT WEB MEDIA ANALYTICS\Assignment 1\RCV1v2'
    # Resolve before chdir so a relative path is not applied twice by glob.
    inputpath = os.path.abspath(inputpath)

    drop_digits = str.maketrans('', '', string.digits)
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    start_tag = '<text>'
    end_tag = '</text>'

    Index = []
    os.chdir(inputpath)

    for file_path in glob.glob(os.path.join(inputpath, '*.xml')):
        with open(file_path, 'r', encoding='utf-8') as xml_file:
            xml_content = xml_file.read()

        docid = os.path.splitext(os.path.basename(file_path))[0]

        start_index = xml_content.find(start_tag)
        end_index = xml_content.find(end_tag)
        if start_index == -1 or end_index == -1:
            continue  # no text section: the file is not indexed

        text_content = xml_content[start_index + len(start_tag):end_index]
        text_content = text_content.translate(drop_digits).translate(punct_to_space)

        doc = Rcv1Doc(docid)
        for term in text_content.split():
            if len(term) > 2:
                doc.add_term(term.lower())
        Index.append(doc)

    # Document frequency: number of documents each term occurs in.
    df = {}
    for doc in Index:
        for term in doc.terms:
            df[term] = df.get(term, 0) + 1

    # One lookup table instead of scanning Index for every output line.
    doc_lengths = {doc.get_doc_id(): doc.doc_len for doc in Index}

    queries = ["The British-Fashion Awards", "Rocket attacks", "Broadcast Fashion Awards", "stock market"]
    # Score each query once; both report sections reuse the result.
    scored_queries = [(query, my_bm25(Index, query, df)) for query in queries]

    with open(output_file, 'w') as f:

        for query, bm25_scores in scored_queries:        # Ranking results for all queries
            print(f"The query is: {query}", file=f)
            print("The following are the BM25 score for each document:", file=f)
            for docid, score in bm25_scores.items():
                print(f"Document ID: {docid}, Doc Length: {doc_lengths[docid]} -- BM25 Score: {score}", file=f)
            print("", file=f)

        for query, bm25_scores in scored_queries:         # Top 6 results for each query
            print(f"For query '{query}', the top-6 possible relevant documents are:", file=f)
            sorted_docs_top6 = sorted(bm25_scores.items(), key=lambda x: x[1], reverse=True)[:6]
            for docid, score in sorted_docs_top6:
                print(f"Document ID: {docid}, Doc Length: {doc_lengths[docid]} -- BM25 Score: {score}", file=f)
            print("", file=f)
            
# Run the indexing and ranking pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
