# In [None]:
import os
import re
from stemming.porter2 import stem
import glob
import string
import math
 
class Rcv1Doc:
    """Bag-of-words record for a single document.

    Attributes:
        docID: identifier passed to the constructor.
        terms: dict mapping a stemmed term to the number of times it was added.
        doc_len: total number of ``add_term`` calls (repeats included).
    """

    def __init__(self, docID):
        """Create an empty document identified by *docID*."""
        self.docID = docID
        self.terms = {}
        self.doc_len = 0

    def get_doc_id(self):
        """Return the identifier this document was created with."""
        return self.docID

    def add_term(self, term):
        """Stem *term*, increment its count in ``terms`` and grow ``doc_len``."""
        key = self.stem_term(term)
        self.terms[key] = self.terms.get(key, 0) + 1
        self.doc_len += 1

    def stem_term(self, term):
        """Return *term* passed through the module-level ``stem`` function."""
        return stem(term)

def parse_rcv1v2(stop_words, inputpath):
    """Parse every ``*.xml`` file in *inputpath* into ``Rcv1Doc`` objects.

    For each file the ``itemid`` attribute of the ``<newsitem ...>`` line is
    used as the document id, and the lines between ``<text>`` and ``</text>``
    are tokenised: ``<p>`` tags and digits are removed, punctuation becomes
    whitespace, tokens are lower-cased and stemmed, and stems of length <= 2
    or contained in *stop_words* are discarded.

    Args:
        stop_words: container supporting ``in`` with the terms to ignore.
        inputpath: directory holding the XML files.

    Returns:
        dict mapping document id -> ``Rcv1Doc``.  A document whose id was
        found is always present, even if none of its terms survived filtering.

    Side effects:
        Changes the process working directory to *inputpath* and prints the
        id and term frequencies of every parsed document.
    """
    strip_digits = str.maketrans('', '', string.digits)
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    collection = {}
    # Kept from the original implementation: callers may rely on the working
    # directory having been switched to the input folder.
    os.chdir(inputpath)
    for file_ in glob.glob("*.xml"):
        docID = None
        in_text = False
        # ``with`` guarantees the handle is closed, including on the early
        # ``break`` at ``</text>``.
        with open(file_) as xml_file:
            for line in xml_file:
                line = line.strip()
                if not in_text:
                    if line.startswith("<newsitem "):
                        for part in line.split():
                            if part.startswith("itemid="):
                                docID = part.split("=")[1].split("\"")[1]
                                break
                        # Register the document as soon as its id is known so
                        # that a document without usable terms still has an
                        # entry (the final print used to raise KeyError).
                        if docID and docID not in collection:
                            collection[docID] = Rcv1Doc(docID)
                    if line.startswith("<text>"):
                        in_text = True
                    continue
                if line.startswith("</text>"):
                    break
                if not docID:
                    # Without an id there is nowhere to store the terms.
                    continue
                doc = collection[docID]
                line = line.replace("<p>", "").replace("</p>", "")
                line = line.translate(strip_digits).translate(punct_to_space)
                for token in line.split():
                    lowered = token.lower()
                    stemmed = stem(lowered)
                    if len(stemmed) > 2 and stemmed not in stop_words:
                        # add_term() stems its argument itself, so hand it the
                        # unstemmed token; passing the stem would stem twice.
                        doc.add_term(lowered)

        if docID:
            print("Document ID:", docID)
            print("Term Frequencies:", collection[docID].terms)

    return collection
 
def parse_query(query0, stop_words):
    """Turn a free-text query into a ``{stemmed term: frequency}`` dict.

    The query is normalised the same way the document text is when it is
    indexed: digits are removed, every punctuation character is replaced by a
    space (so ``Great-West`` yields ``great`` and ``west`` rather than the
    unmatchable ``greatwest``), tokens are lower-cased and stemmed, and stems
    of length <= 2 are dropped because such terms are never indexed.

    Args:
        query0: the raw query string.
        stop_words: container supporting ``in`` with the terms to ignore.

    Returns:
        dict mapping each surviving stemmed term to its count in the query.
    """
    strip_digits = str.maketrans('', '', string.digits)
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    query_terms = {}
    for token in query0.translate(strip_digits).translate(punct_to_space).split():
        word = token.lower()
        # Skip stop words before stemming, and anything that still contains
        # non-alphabetic characters after the clean-up above.
        if word in stop_words or not word.isalpha():
            continue
        stemmed_word = stem(word)
        # Mirror the index-side filter: short stems and stems that are stop
        # words never occur in a document vector.
        if len(stemmed_word) > 2 and stemmed_word not in stop_words:
            query_terms[stemmed_word] = query_terms.get(stemmed_word, 0) + 1
    return query_terms
 
def read_stop_words(file_path):
    """Read a comma-separated stop-word file and return the words as a list.

    Surrounding whitespace (including the file's trailing newline) is removed
    from every entry and empty entries are dropped, so the last word of the
    file matches correctly and an empty file yields an empty list.

    Args:
        file_path: path of the comma-separated stop-word file.

    Returns:
        list of stop words in file order.
    """
    with open(file_path, 'r') as file:
        raw_words = file.read().split(',')
    return [word.strip() for word in raw_words if word.strip()]
def my_df(coll):
    """Compute the document frequency of every term in a collection.

    Args:
        coll: iterable of documents; each document is a mapping
            ``term -> frequency`` (only the keys are used) or any iterable
            of terms.

    Returns:
        dict mapping term -> number of documents containing it.  Keys appear
        in first-seen order, so the result is reproducible from run to run.
    """
    df = {}
    for doc in coll:
        # dict.fromkeys de-duplicates while preserving order; the previous
        # set() made the key order depend on string-hash randomisation, which
        # leaked into the tie order of any stable sort applied to the result.
        for term in dict.fromkeys(doc):
            df[term] = df.get(term, 0) + 1
    return df
 
def my_tfidf(doc, df, ndocs):
    """Return the length-normalised TF*IDF weight of every term in *doc*.

    The unnormalised weight of a term with in-document frequency ``f`` and
    document frequency ``n`` is ``(log(f) + 1) * log(ndocs / n)`` using the
    natural logarithm; the vector is then divided by its Euclidean length.
    The previous implementation omitted both logarithms.

    Args:
        doc: mapping ``term -> frequency`` for one document.
        df: mapping ``term -> document frequency`` for the collection.
        ndocs: number of documents in the collection.

    Returns:
        dict mapping each term of *doc* to its normalised weight.  Terms that
        are missing from *df* (or have a non-positive frequency) get weight 0.
        If every weight is 0 the vector is returned unnormalised.
    """
    tfidf_weights = {}
    sum_of_squares = 0.0
    for term, fik in doc.items():
        nk = df.get(term, 0)  # document frequency of the term
        if fik > 0 and nk > 0 and ndocs > 0:
            weight = (math.log(fik) + 1) * math.log(ndocs / nk)
        else:
            # log() is undefined for these inputs; treat the term as absent.
            weight = 0.0
        tfidf_weights[term] = weight
        sum_of_squares += weight ** 2

    # Normalise to unit length so long documents are not favoured.
    if sum_of_squares > 0:
        normalization_factor = math.sqrt(sum_of_squares)
        for term in tfidf_weights:
            tfidf_weights[term] /= normalization_factor

    return tfidf_weights
 
def _index_rcv1_file(path, stop_words):
    """Return ``(docid, term_counts)`` for a single XML file.

    ``docid`` is the ``itemid`` attribute of the ``<newsitem ...>`` line (or
    ``None`` if no such attribute was seen).  ``term_counts`` maps each
    stemmed term found between the ``<text>`` and ``</text>`` lines to its
    frequency; stems of length <= 2 or contained in *stop_words* are skipped.
    """
    strip_digits = str.maketrans('', '', string.digits)
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    docid = None
    doc_terms = {}
    in_text = False
    with open(path) as xml_file:
        for line in xml_file:
            line = line.strip()
            if not in_text:
                if line.startswith("<newsitem "):
                    for part in line.split():
                        if part.startswith("itemid="):
                            docid = part.split("=")[1].split("\"")[1]
                            break
                if line.startswith("<text>"):
                    in_text = True
                # Header lines, including the <text> tag line itself, are not
                # content.  Tokenising the tag line used to add the term
                # "text" to every document.
                continue
            if line.startswith("</text>"):
                break
            line = line.replace("<p>", "").replace("</p>", "")
            line = line.translate(strip_digits).translate(punct_to_space)
            for token in line.split():
                term = stem(token.lower())
                if len(term) > 2 and term not in stop_words:
                    doc_terms[term] = doc_terms.get(term, 0) + 1
    return docid, doc_terms


def main(inputpath=r'C:\Users\0703s\Downloads', output_file="TF_IDF.txt",
         stopwords_file='common-english-words.txt'):
    """Index the XML documents in *inputpath* and write a TF*IDF report.

    The report written to *output_file* contains the document frequency of
    every term, the top 20 TF*IDF terms of each document that has more than
    20 distinct terms, and a ranking of all documents for three fixed queries.

    Args:
        inputpath: directory holding the ``*.xml`` documents.
        output_file: report path; a relative path is resolved against
            *inputpath* because the working directory is switched to it.
        stopwords_file: comma-separated stop-word file; a relative path is
            resolved against the working directory at call time.
    """
    # Loaded before the chdir below, exactly as before.
    stop_words = set(read_stop_words(stopwords_file))

    Index = {}  # docid -> {term: frequency}
    os.chdir(inputpath)
    for file_ in glob.glob("*.xml"):
        docid, doc_terms = _index_rcv1_file(file_, stop_words)
        if docid is not None:
            Index[docid] = doc_terms

    df = my_df(Index.values())
    ndocs = len(Index)
    # Each document vector is needed for the report and for every query, so
    # compute it once instead of once per (document, query) pair.
    doc_weights = {docid: my_tfidf(doc, df, ndocs) for docid, doc in Index.items()}

    query_titles = ["ISRAEL: 15 Palestinians, two Israelis killed in clashes.",
                    "CANADA: Great-West Life tops Royal Bank bid for London Ins.",
                    "UK: Britain's Channel 5 to broadcast Fashion Awards."]

    with open(output_file, 'w') as f:
        f.write(f"There are {ndocs} documents in this data set and contain {len(df)} terms.\n")
        f.write("The following are the terms’ document-frequency:\n")
        for term, freq in sorted(df.items(), key=lambda x: x[1], reverse=True):
            f.write(f"{term} : {freq}\n")
        f.write("\n")

        for docid, doc in Index.items():
            if len(doc) > 20:
                top_terms = sorted(doc_weights[docid].items(), key=lambda x: x[1], reverse=True)[:20]
                f.write(f"Document {docid} contains {len(doc)} terms\n")
                for term, weight in top_terms:
                    f.write(f"{term}: {weight}\n")
                f.write("\n")

        for title in query_titles:
            query = parse_query(title, stop_words)
            # Score = dot product of the document vector and the raw query
            # term frequencies.
            ranking_result = {
                docid: sum(weights.get(term, 0) * qtf for term, qtf in query.items())
                for docid, weights in doc_weights.items()
            }
            sorted_docs = sorted(ranking_result.items(), key=lambda x: x[1], reverse=True)
            f.write(f"The Ranking Result for query: {title}\n")
            for docid, score in sorted_docs:
                f.write(f"{docid} : {score}\n")
                
# Run the indexing/ranking pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()