In [272]:
# jupyter nbconvert --to script assignment1Ander.ipynb


In [273]:
!pip install pandas nltk scikit-learn



In [274]:
import pandas as pd
from typing import Tuple, List, Dict, Union
import nltk

In [275]:
dtypes = {
    'ArticleId': 'int32',
    'Text': 'str',
    'Category': 'category'
}
data_train = pd.read_csv('data/BBC News Train.csv', dtype=dtypes, encoding='utf-8')
data_test = pd.read_csv('data/BBC News Test.csv', dtype={'ArticleId': 'int32', 'Text': 'str'}, encoding='utf-8')
data_test_solution = pd.read_csv('data/BBC News Sample Solution.csv', dtype={'ArticleId': 'int32', 'Category': 'category'}, encoding='utf-8')
data_test['Category'] = data_test_solution['Category']
data = pd.concat([data_train, data_test])
data.head(10)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
5,1582,howard truanted to play snooker conservative...,politics
6,651,wales silent on grand slam talk rhys williams ...,sport
7,1797,french honour for director parker british film...,entertainment
8,2034,car giant hit by mercedes slump a slump in pro...,business
9,1866,fockers fuel festive film chart comedy meet th...,entertainment


In [276]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2225 entries, 0 to 734
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   ArticleId  2225 non-null   int32   
 1   Text       2225 non-null   object  
 2   Category   2225 non-null   category
dtypes: category(1), int32(1), object(1)
memory usage: 45.8+ KB


In [277]:
data.Category.value_counts()

Category
sport            493
business         483
politics         421
entertainment    420
tech             408
Name: count, dtype: int64

## Preprocessing

### Lowercasing

In [278]:
def lowercase_data(data: pd.DataFrame) -> pd.DataFrame:
    new_data = data.copy()
    new_data['Text'] = new_data['Text'].str.lower()
    return new_data

### Punctuation Removal

In [279]:
from nltk.tokenize import RegexpTokenizer

def remove_punctuation(data: pd.DataFrame) -> pd.DataFrame:
    tokenizer = RegexpTokenizer(r'[a-z]+')
    new_data = data.copy()
    new_data['Text'] = new_data['Text'].apply(tokenizer.tokenize)
    return new_data

### Stopwords Removal

In [280]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(data: pd.DataFrame) -> pd.DataFrame:
    stop_words = set(stopwords.words('english'))
    new_data = data.copy()
    new_data['Text'] = new_data['Text'].apply(lambda x: [word for word in tuple(x) if word not in stop_words and len(word) > 3])
    return new_data

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


### Lemmatization

In [281]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
def lemmatize_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, WordNetLemmatizer]:
    lemmatizer = WordNetLemmatizer()
    new_data = data.copy()
    new_data['Text'] = new_data['Text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x]))
    return new_data, lemmatizer

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


### Stemming

In [282]:
from nltk.stem import PorterStemmer

nltk.download('punkt')
def stem_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, PorterStemmer]:
    stemmer = PorterStemmer()
    new_data = data.copy()
    new_data['Text'] = new_data['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x]))
    return new_data, stemmer

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


### Compute tf-idf

In [283]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf(data: pd.DataFrame, *, min_df: float = 0.05, max_df: float = 0.1, stop_words_language: str = 'english', max_features: int = 100) -> Tuple[TfidfVectorizer, pd.DataFrame]:
    # Drop rows with NaN values in the 'Text' column
    data = data.dropna(subset=['Text'])
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,  stop_words=stop_words_language, max_features=max_features)
    
    X = vectorizer.fit_transform(data['Text'])
    return vectorizer, X

In [284]:
def data_preprocessing(data: Union[pd.DataFrame, List]) -> pd.DataFrame:
    if isinstance(data, list):
        data = pd.DataFrame(data, columns=['Text'])
    data = lowercase_data(data)
    data = remove_punctuation(data)
    data = remove_stopwords(data)
    return data

In [285]:
def lemmatize_or_stem_data(data: pd.DataFrame, lemmatize: bool = True) -> Tuple[pd.DataFrame, WordNetLemmatizer]:
    if lemmatize:
        return lemmatize_data(data)
    return stem_data(data)

In [286]:
data = data_preprocessing(data)
data, lem_stem_vectorizer = lemmatize_or_stem_data(data, lemmatize=True)
# data_stemmed = stem_data(data)

In [287]:
#user_interests = {
#     1: ['politics', 'soccer'],
#     2: ['music', 'films'],
#     3: ['cars', 'politics'],
#     4: ['soccer']
# }

In [288]:
user_interests = {
    1: ['politics'],
    2: ['entertainment'],
    3: ['sport'],
    4: ['tech'],
    5: ['business']
}

In [289]:
from sklearn.metrics.pairwise import cosine_similarity

vectorizer, document_vectors = get_tfidf(data, min_df=0.0, max_df=0.05, stop_words_language='english', max_features=1000)

In [290]:
def flatten(xss):
    return [x for xs in xss for x in xs]

def preprocess_user_interests(user_interests: Dict[int, List[str]]) -> Dict[int, List[str]]:
    interests_processed = {}
    for user_id, interests in user_interests.items():
         interests_processed[user_id] = flatten(data_preprocessing(interests)['Text'].to_list())
    return interests_processed

In [291]:
num_documents_to_retrieve = 10  # Number of documents to retrieve for each user
user_relevant_documents: Dict[int, List[Tuple[int, float]]] = {}
categories = data['Category'].unique()

for user_id, interests in preprocess_user_interests(user_interests).items():
    relevant_documents: List[Tuple[int, float]] = []
    for interest in interests:
        interest2 = lem_stem_vectorizer.lemmatize(interest)
        query_vector = vectorizer.transform([interest2])
        similarity_scores = cosine_similarity(query_vector, document_vectors)[0]
        top_documents_indices = similarity_scores.argsort()[-num_documents_to_retrieve:][::-1]
        top_documents: List[Tuple[int, float]] = [(data.iloc[i]['ArticleId'], similarity_scores[i]) for i in top_documents_indices]
        relevant_documents.extend(top_documents)
    
    # Sort the relevant documents by their similarity scores
    relevant_documents.sort(key=lambda x: x[1], reverse=True)
    
    user_relevant_documents[user_id] = relevant_documents

In [292]:
user_relevant_documents

{1: [(1792, 0.7408856831837601),
  (642, 0.5592710769276793),
  (825, 0.4823601107731745),
  (553, 0.45191177920047976),
  (613, 0.3417742921961316),
  (2211, 0.31446432399649277),
  (14, 0.31098090160057457),
  (86, 0.31098090160057457),
  (882, 0.2891740022375368),
  (1104, 0.2709810314485538)],
 2: [(985, 0.6096620994463217),
  (84, 0.41090477570057116),
  (1147, 0.38513858369268944),
  (326, 0.32485876883885145),
  (1090, 0.3076501360110294),
  (716, 0.29229709963955747),
  (332, 0.23938478253891698),
  (992, 0.23801186843872937),
  (1841, 0.23024466644340688),
  (1821, 0.2275906502547173)],
 3: [(471, 0.0),
  (1280, 0.0),
  (1555, 0.0),
  (1234, 0.0),
  (1874, 0.0),
  (231, 0.0),
  (1995, 0.0),
  (1994, 0.0),
  (1860, 0.0),
  (245, 0.0)],
 4: [(1174, 0.4188148218582689),
  (1805, 0.3862573014139449),
  (2218, 0.25303704426275203),
  (645, 0.25303704426275203),
  (1241, 0.25035470952674327),
  (626, 0.24638118212209764),
  (631, 0.23318543779605272),
  (2168, 0.20515731929925923),


solucionar esto, devuelve lo mismo para sport y para business

In [293]:
user_relevant_documents[5] == user_relevant_documents[3]

True

In [294]:
categories

['business', 'tech', 'politics', 'sport', 'entertainment']
Categories (5, object): ['business', 'entertainment', 'politics', 'sport', 'tech']

User 1 Interests: ['politics']
     Document: court halt mark morrison album Similarity Score: 0.5719763
     Document: confusion high definition crit Similarity Score: 0.55491936
     Document: ethiopia crop production ethio Similarity Score: 0.5127802
     Document: detention ruling urged governm Similarity Score: 0.5117885
     Document: santy worm make unwelcome visi Similarity Score: 0.50219214
     Document: ukip candidate suspended euros Similarity Score: 0.5007821
     Document: bosvelt optimistic deal manche Similarity Score: 0.49390337
     Document: adriano chelsea link rejected  Similarity Score: 0.49390337
     Document: copy protection strengthened d Similarity Score: 0.49235088
     Document: italy ireland moment magic bri Similarity Score: 0.481332

In [297]:
# Print or use user_relevant_documents as needed
for user_id, relevant_documents in user_relevant_documents.items():
    print(f"User {user_id} relevant documents:")
    for i, (document_id, document_text, similarity_score) in enumerate(relevant_documents, start=1):
        print(f"{i}. Document ID: {document_id}\nDocument Text: {document_text}\nSimilarity Score: {similarity_score:.4f}\n")

User 1 relevant documents:
1. Document ID: 1792

Similarity Score: 0.7409

2. Document ID: 642

Similarity Score: 0.5593

3. Document ID: 825

Similarity Score: 0.4824

4. Document ID: 553

Similarity Score: 0.4519

5. Document ID: 613

Similarity Score: 0.3418

6. Document ID: 2211

Similarity Score: 0.3145

7. Document ID: 14

Similarity Score: 0.3110

8. Document ID: 86

Similarity Score: 0.3110

9. Document ID: 882

Similarity Score: 0.2892

10. Document ID: 1104

Similarity Score: 0.2710

User 2 relevant documents:
1. Document ID: 985

Similarity Score: 0.6097

2. Document ID: 84

Similarity Score: 0.4109

3. Document ID: 1147

Similarity Score: 0.3851

4. Document ID: 326

Similarity Score: 0.3249

5. Document ID: 1090

Similarity Score: 0.3077

6. Document ID: 716

Similarity Score: 0.2923

7. Document ID: 332

Similarity Score: 0.2394

8. Document ID: 992

Similarity Score: 0.2380

9. Document ID: 1841

Similarity Score: 0.2302

10. Document ID: 1821

Similarity Score: 0.2276



In [None]:
# Print or use user_relevant_documents as needed
for user_id, relevant_documents in user_relevant_documents.items():
    print(f"User {user_id} relevant documents:")
    for i, (document_id, document_text, similarity_score) in enumerate(relevant_documents, 1):
        print(f"{i}. Document ID: {document_id}\nDocument Text: {document_text}\nSimilarity Score: {similarity_score:.4f}\n")

In [None]:
# from evaltools import evaluate

evaluar cin la funcion del profesor

In [None]:
import numpy as np
# from evaltools import evaluate, generate_relevance_array
Q = []
R = []
def get_category(doc_id):
    return data.loc[data['ArticleId'] == doc_id, 'Category'].values[0]

for user_id, interests in user_interests.items():
    print(user_id)
    print(interests)
    # Transform the user's interests into a query vector
    article_ids = [doc_id for doc_id, _, _ in user_relevant_documents[user_id]]
    print(article_ids)
    Q.append(article_ids)
    
    # Extract the ids of the relevant documents for the user
    R.append([1 if any(interest == get_category(doc_id) for interest in interests) else -1 for doc_id, _, _ in user_relevant_documents[user_id]])
    # R.append([1 if interests[0] == get_category(doc_id) else -1 for doc_id, _, _ in user_relevant_documents[user_id]])

In [None]:
for i in range(len(Q)):
    print(len(Q[i]), len(R[i]))
    # IF ANY VALUE IS NAN, THE TEST WILL FAIL
    if len(Q[i]) != len(R[i]):
        print(f"Mismatch in lengths at index {i}: len(Q[{i}]) = {len(Q[i])}, len(R[{i}]) = {len(R[i])}")

In [None]:
def evaluate(ex,Q,R):
    nq=len(Q)
    nd=len(Q[0])
    R_=np.array(R)
    R_=.5*(R_+1)
    Prec_tot=[]
    Rec_tot=[]    
    
    def compute_PR(print_screen=True):
        Prec_tot=[]
        Rec_tot=[]        
        if print_screen:
            print('Precision and Recall at k for k=1,...,%d' % nd)      
        for q in range(nq):
            q1 = q + 1
            r = R_[q,:]
            if print_screen:
                print('\tQuery %d' % q1)
            Prec_q=[]
            Rec_q=[]
            for k in range(nd):
                k1 = k + 1
                Prec=np.sum(r[:k1])/k1
                Rec=np.sum(r[:k1])/np.sum(r)    
                # if np.sum(r) == 0:
                #     Prec = 0
                #     Rec = 0
                # else:
                #     Prec = np.sum(r[:k1]) / k1
                #     Rec = np.sum(r[:k1]) / np.sum(r)
                if print_screen:                    
                    print('\t\tP(%d)=%d/%d=%.2f,\tR(%d)=%d/%d=%.2f' % (k1, np.sum(r[:k1]), k1, Prec, k1, np.sum(r[:k1]), np.sum(r), Rec))
                Prec_q.append(Prec)
                Rec_q.append(Rec)
            Prec_tot.append(Prec_q)
            Rec_tot.append(Rec_q)
        Prec_tot = np.array(Prec_tot)
        Rec_tot = np.array(Rec_tot)
        return Prec_tot, Rec_tot

    def compute_TPFP(TP_rate=None):
        TP_tot=[]        
        FP_tot=[]        
        print('TP_rate and FP_rate at k for k=1,...,%d'%nd)      
        for q in range(nq):
            q1=q+1
            r=R_[q,:]
            nr=1-r
            print('\tQuery %d'%q1)
            TP_q=[]
            FP_q=[]
            for k in range(nd):
                k1=k+1
                TP=np.sum(r[:k1])/np.sum(r)                
                FP=np.sum(nr[:k1])/np.sum(nr)
                
                print('\t\tTP_rate(%d)=R(%d)=%d/%d=%.2f\t FP_rate(%d)=%d/%d=%.2f\t'\
                     %(k1, k1, np.sum(r[:k1]),np.sum(r),TP, k1,np.sum(nr[:k1]),np.sum(nr),FP))
                TP_q.append(TP)
                FP_q.append(FP)
            TP_tot.append(TP_q)
            FP_tot.append(FP_q)
        TP_tot=np.array(TP_tot)
        FP_tot=np.array(FP_tot)
        return TP_tot, FP_tot        
    if ex=='prec_rec' or ex=='all':        
        Prec_tot, Rec_tot=compute_PR()
        print('\n Draw the Precision-Recall curve for each query')  
        # print(f"Precision: {Prec_tot}, Recall: {Rec_tot}")
        for q in range(nq):
            q1=q+1
            print('\tQuery %d'%q1)            
            plt.figure()
            Rec_q=Rec_tot[q,:]
            Prec_q=Prec_tot[q,:]
            # print(f"Rec_q: {Rec_q}, rec_q: {Prec_q}")
            plt.scatter(np.array(Rec_q), np.array(Prec_q))
            plt.plot(np.array(Rec_q), np.array(Prec_q),label='Precision-Recall curve')            
            plt.xlim([-0.05,1.05]); plt.ylim([-0.05,1.05])
            plt.xlabel('Recall'); plt.ylabel('Precision')
            R_int=np.hstack([0,Rec_q,1])
            # print(f"R_int: {R_int}")
            P_int=np.zeros(R_int.size)
            # print(f"P_int: {P_int}")
            for i_r in range(R_int.size-1):
                r=R_int[i_r]
                if i_r!=0 and R_int[i_r+1]==r:
                    P_int[i_r]=np.max(Prec_q[i_r-1:])    
                else:
                    P_int[i_r]=np.max(Prec_q[i_r:])            
            plt.plot(R_int,P_int,color='r',label='Interpolated PR curve')
            plt.legend(loc='lower left')
            plt.show()
    if ex=='r-prec' or ex=='all':        
        if Prec_tot.size == 0:
            Prec_tot, Rec_tot=compute_PR()
        print('\n Determine R-precision for each query') 
        for q in range(nq):            
            Rec_q=Rec_tot[q,:]
            Prec_q=Prec_tot[q,:]
            r=int(np.sum(R_[q]))
            q1=q+1
            print('\tQuery %d'%q1)
            print('\t\tNumber of relevant documents: %d --> P(%d)=%.2f'%(r,r,Prec_q[r-1]))
    if ex=='map' or ex=='all':        
        if Prec_tot.size == 0:
            Prec_tot, Rec_tot=compute_PR()
        print('\n Calculate the Mean Average Precision')
        APs=[]
        for q in range(nq):            
            Prec_q=Prec_tot[q,:]            
            r=int(np.sum(R_[q]))
            q1=q+1
            print('\tQuery %d'%q1)
            str_formula='1/%d '%r
            rs=np.where(R_[q]==1)[0]+1
            str_formula+='{' + ' + '.join(['P(%d)'%rs_ for rs_ in rs]) + '}'
            AP=np.mean(Prec_q[np.where(R_[q]==1)])            
            print('\t\tAP=%s=%.2f'%(str_formula, AP))
            APs.append(AP)
        APstring='1/%d {'%nq
        APstring+= ' + '.join(['AP_%d'%(q+1) for q in range(nq)]) 
        APstring+='}=1/%d {'%nq
        APstring+= ' + '.join(['%.2f'%(AP) for AP in APs]) 
        APstring+='}'        
        print('\tMAP=%s=%.2f'%(APstring, np.mean(np.array(APs))))
    if ex=='roc' or ex=='all' or ex=='auc':
        TP_tot, FP_tot=compute_TPFP()    
        print('\n Draw the ROC curve for each query')  
        for q in range(nq):
            q1=q+1
            print('\tQuery %d'%q1)            
            plt.figure()
            TP_q=TP_tot[q,:]
            FP_q=FP_tot[q,:]
            plt.scatter(np.array(FP_q), np.array(TP_q))
            TP_q_=np.hstack([0,TP_q,1])
            FP_q_=np.hstack([0,FP_q,1])
            plt.plot(np.array(FP_q_), np.array(TP_q_),label='ROC curve')            
            plt.xlim([-0.05,1.05]); plt.ylim([-0.05,1.05])
            plt.xlabel('FP rate'); plt.ylabel('TP rate')
            plt.show()
            if ex=='auc' or ex=='all':
                AUC=[]
                for i_x in range(TP_q_.size-1):
                    delta_x=FP_q_[i_x+1]-FP_q_[i_x]
                    base=TP_q_[i_x+1]+TP_q_[i_x]
                    AUC.append(base*delta_x/2)
                AUC=np.array(AUC)
                AUC=AUC[AUC>0]
                string_AUC=' + '.join(['%.2f'%auc for auc in AUC])
                if string_AUC!='':
                    string_AUC+=' = '    
                print('\tAUC = %s %.2f\n\n'%(string_AUC, np.sum(AUC)))            
    if ex=='clear':
        return

In [None]:
import matplotlib.pyplot as plt
evaluate('all', Q, R)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Datos de ejemplo (debes reemplazarlos con tus propios datos)
documents = data['Text']

# Tokenizar los documentos y crear objetos TaggedDocument
tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(documents)]
print(len(tagged_data))
# Entrenar el modelo Doc2Vec

model = Doc2Vec(window=5, min_count=1, workers=4, epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

document_embeddings = [model.dv[str(i)] for i in range(len(documents))]

# Process user queries
for user_id, interests in user_interests.items():
#     interests preprocesamiento
    # Aggregate word embeddings for the user's interests to generate the query embedding
    interests = lem_stem_vectorizer.lemmatize(''.join(interests))
    #     interests = stemmer.stem(''.join(interests))
#     interests = stemmer.(interests)
    query_embedding = model.infer_vector([interests])
    
    cosine_similarities = cosine_similarity([query_embedding], document_embeddings)[0]

    top_k=10
    top_documents_indices = cosine_similarities.argsort()[-top_k:][::-1]
    top_documents = [(documents[i], cosine_similarities[i]) for i in range(len(top_documents_indices))]
    
    
    # Store relevant documents for evaluation
    user_relevant_documents[user_id] = top_documents
    
    # Print or process relevant documents
    print("User", user_id, "Interests:", interests)
    for doc, score in top_documents:
        print("Document:", doc, "Similarity Score:", score)

# Using word embeddings

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api

# Load pre-trained word embeddings
word_embeddings_model = api.load("word2vec-google-news-300")


     Document: munster switched spain munster heineken quarter final biarritz april switched real sociedad paseo anoeta stadium sebastian real ground hold whereas parc sport aguilera biarritz capacity irish province given least ticket decision move difficult considered fan primary objective said biarritz chairman marcel martin hope rewarded huge crowd behaving best rugby tradition match first heineken fixture played spain expected attract biggest ever attendance rugby match country ulster last irish team play paseo anoeta stadium faced euskarians side season tour Similarity Score: 0.5979421


In [None]:
for user_id, interests in user_interests.items():
    print(interests)

VERSION NUEVA PARA PROBAR LO DE EVALUATE

In [None]:
# Function to generate document embeddings
def generate_document_embeddings(documents, word_embeddings_model):
    document_embeddings = []
    for doc in documents:
        # Aggregate word embeddings (e.g., by averaging)
        words = doc.split()
        embeddings = [word_embeddings_model[word] for word in words if word in word_embeddings_model]
        if embeddings:
            doc_embedding = np.mean(embeddings, axis=0)  # Average embeddings
            document_embeddings.append(doc_embedding)
    return np.array(document_embeddings)

# Function to retrieve relevant documents for a query
def retrieve_documents(query_embedding, document_embeddings, documents, document_ids):
    similarity_scores = cosine_similarity(query_embedding.reshape(1, -1), document_embeddings)[0]
    sorted_documents_indices = similarity_scores.argsort()[::-1]
    sorted_documents = [(document_ids[i], documents[i], similarity_scores[i]) for i in sorted_documents_indices]
    return sorted_documents

In [None]:
# Example documents 
documents = data['Text']
document_ids = data['ArticleId']
# Generate document embeddings
document_embeddings = generate_document_embeddings(documents, word_embeddings_model)

# Process user queries
for user_id, interests in user_interests.items():
    # Aggregate word embeddings for the user's interests to generate the query embedding
    query_embedding = np.mean([word_embeddings_model[word] for interest in interests for word in interest.split() if word in word_embeddings_model], axis=0)
    
    # Retrieve relevant documents for the user's query
    relevant_documents = retrieve_documents(query_embedding, document_embeddings, documents, document_ids)

    # Store relevant documents for evaluation
    user_relevant_documents[user_id] = relevant_documents
    
    # Print or process relevant documents
    print("User", user_id, "Interests:", interests)
    for doc_id, doc, score in relevant_documents:
        print("     Document ID:", doc_id, "Document:", doc, "Similarity Score:", score)

In [None]:
import numpy as np
Q = []
R = []
def get_category(doc_id):
    return data.loc[data['ArticleId'] == doc_id, 'Category'].values[0]

for user_id, interests in user_interests.items():
    print(user_id)
    print(interests)
    # Transform the user's interests into a query vector
    article_ids = [doc_id for doc_id, _, _ in user_relevant_documents[user_id]]
    print(article_ids)
    Q.append(article_ids)
    
    # Extract the ids of the relevant documents for the user
    R.append([1 if any(interest == get_category(doc_id) for interest in interests) else -1 for doc_id, _, _ in user_relevant_documents[user_id]])
    # R.append([1 if interests[0] == get_category(doc_id) else -1 for doc_id, _, _ in user_relevant_documents[user_id]])

In [None]:
for i in range(len(Q)):
    if len(Q[i]) != len(R[i]):
        print(f"Length mismatch at index {i}: len(Q[i]) = {len(Q[i])}, len(R[i]) = {len(R[i])}")

In [None]:
print(len(Q))
print(len(R))

In [None]:

# Now you can call the evaluate function
evaluate('all', Q, R)