In [1]:
# jupyter nbconvert --to script assignment1Ander.ipynb


In [2]:
!pip install pandas nltk scikit-learn



In [5]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


In [6]:
dtypes = {
    'ArticleId': 'int32',
    'Text': 'str',
    'Category': 'category'
}
data_train = pd.read_csv('data/BBC News Train.csv', dtype=dtypes, encoding='utf-8', index_col='ArticleId')
data_test = pd.read_csv('data/BBC News Test.csv', dtype={'ArticleId': 'int32', 'Text': 'str'}, encoding='utf-8', index_col='ArticleId')
data_test_solution = pd.read_csv('data/BBC News Sample Solution.csv', dtype={'ArticleId': 'int32', 'Category': 'category'}, encoding='utf-8', index_col='ArticleId')
data_test['Category'] = data_test_solution['Category']
corpus = pd.concat([data_train, data_test])
corpus.head(10)

Unnamed: 0_level_0,Text,Category
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1
1833,worldcom ex-boss launches defence lawyers defe...,business
154,german business confidence slides german busin...,business
1101,bbc poll indicates economic gloom citizens in ...,business
1976,lifestyle governs mobile choice faster bett...,tech
917,enron bosses in $168m payout eighteen former e...,business
1582,howard truanted to play snooker conservative...,politics
651,wales silent on grand slam talk rhys williams ...,sport
1797,french honour for director parker british film...,entertainment
2034,car giant hit by mercedes slump a slump in pro...,business
1866,fockers fuel festive film chart comedy meet th...,entertainment


In [7]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2225 entries, 1833 to 471
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Text      2225 non-null   object  
 1   Category  2225 non-null   category
dtypes: category(1), object(1)
memory usage: 37.1+ KB


In [8]:
corpus.Category.value_counts()

sport            493
business         483
politics         421
entertainment    420
tech             408
Name: Category, dtype: int64

## Preprocessing

### Lowercasing

In [9]:
def lowercase_data(data: pd.DataFrame):
    new_data = data.copy()
    new_data = new_data.str.lower()
    return new_data

### Punctuation Removal

In [10]:
from nltk.tokenize import RegexpTokenizer
def remove_punctuation(data: pd.DataFrame):
    tokenizer = RegexpTokenizer(r'[a-z]+')
    new_data = data.copy()
    new_data = new_data.apply(tokenizer.tokenize)
    return new_data, tokenizer

### Stopwords Removal

In [11]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(data: pd.DataFrame):
    stop_words = set(stopwords.words('english'))
    new_data = data.copy()
    new_data = new_data.apply(lambda x: [word for word in tuple(x) if word not in stop_words and len(word) > 3])
    return new_data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Irune\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
def lemmatize_data(data: pd.DataFrame):
    lemmatizer = WordNetLemmatizer()
    new_data = data.copy()
    new_data = new_data.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x]))
    return new_data, lemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Irune\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Stemming

In [13]:
from nltk.stem import PorterStemmer

nltk.download('punkt')
def stem_data(data: pd.DataFrame):
    stemmer = PorterStemmer()
    new_data = data.copy()
    new_data = new_data.apply(lambda x: ' '.join([stemmer.stem(word) for word in x]))
    return new_data, stemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Irune\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
corpus.columns

Index(['Text', 'Category'], dtype='object')

In [14]:
def preprocessing(corpus):
    data = corpus.copy()
    data = lowercase_data(data)
    data, punctuationRemover = remove_punctuation(data)
    data = remove_stopwords(data)
    
    return data

In [15]:
data = preprocessing(corpus['Text'])

data, lemmatizer = lemmatize_data(data)
# data_stemmed, stemmer = stem_data(data)

# Using tf-idf

4 different users: the first one being interested in politics and soccer, the second in music and films, the third in cars and politics and the fourth in soccer alone.

In [15]:
user_interests = {
    1: ['politics', 'soccer'],
    2: ['music', 'films'],
    3: ['cars', 'politics'],
    4: ['soccer']
}

In [16]:
user_interests = {
    1: ['kennedy'],
    2: ['entertainment'],
    3: ['sports'],
    4: ['tech'],
    5: ['business']
}

In [17]:
data

Unnamed: 0_level_0,Text,Category
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1
1833,worldcom bos launch defence lawyer defending f...,business
154,german business confidence slide german busine...,business
1101,poll indicates economic gloom citizen majority...,business
1976,lifestyle governs mobile choice faster better ...,tech
917,enron boss payout eighteen former enron direct...,business
...,...,...
1923,probe alitalia state european commission offic...,sport
373,play grammy award show irish rock band play li...,tech
1704,sport betting rule spotlight group peer called...,business
206,alfa romeo engine fiat stop making cylinder pe...,entertainment


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all documents into a single list
all_documents = data

# Vectorize documents using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.05,  stop_words='english', max_features=1000)
document_vectors = vectorizer.fit_transform(all_documents)
len(vectorizer.vocabulary_)

1000

In [19]:


num_documents_to_retrieve = 10  # Number of documents to retrieve for each user
user_relevant_documents = {}
categories = corpus['Category'].unique()

for user_id, interests in user_interests.items():
    relevant_documents = []
    interests = preprocessing(pd.Series(interests))
    for interest in interests:
        interests = lemmatizer.lemmatize(''.join(interest))
    #     interests = stemmer.stem(''.join(interest))
        query_vector = vectorizer.transform(interest)
        similarity_scores = cosine_similarity(query_vector, document_vectors)[0]
        top_documents_indices = similarity_scores.argsort()[-num_documents_to_retrieve:][::-1]
        top_documents = [(corpus.iloc[i]['Text'], similarity_scores[i]) for i in top_documents_indices]
        relevant_documents.extend(top_documents)
    
    # Sort the relevant documents by their similarity scores and select the top 10
    relevant_documents.sort(key=lambda x: x[1], reverse=True)
    top_10_documents = relevant_documents[:num_documents_to_retrieve]
    
    user_relevant_documents[user_id] = top_10_documents

In [136]:
categories

['business', 'tech', 'politics', 'sport', 'entertainment']
Categories (5, object): ['business', 'entertainment', 'politics', 'sport', 'tech']

User 1 Interests: ['politics']
     Document: court halt mark morrison album Similarity Score: 0.5719763
     Document: confusion high definition crit Similarity Score: 0.55491936
     Document: ethiopia crop production ethio Similarity Score: 0.5127802
     Document: detention ruling urged governm Similarity Score: 0.5117885
     Document: santy worm make unwelcome visi Similarity Score: 0.50219214
     Document: ukip candidate suspended euros Similarity Score: 0.5007821
     Document: bosvelt optimistic deal manche Similarity Score: 0.49390337
     Document: adriano chelsea link rejected  Similarity Score: 0.49390337
     Document: copy protection strengthened d Similarity Score: 0.49235088
     Document: italy ireland moment magic bri Similarity Score: 0.481332

In [137]:
# Print or use user_relevant_documents as needed
for user_id, relevant_documents in user_relevant_documents.items():
    print(f"User {user_id} relevant documents:")
    for i, (document, similarity_score) in enumerate(relevant_documents, 1):
        print(f"{i}. Similarity Score: {similarity_score:.4f}\n {document}\n")


User 1 relevant documents:
1. Similarity Score: 0.6906
 dems unveil election slogan liberal democrat present real alternative forthcoming general election campaign charles kennedy said unveiling slogan party spring conference said glass ceiling ambition told delegate labour abused public trust tory failed oppose response conservative insisted party understood forgotten majority speaking harrogate kennedy said people want credible principled political party offer different vision britain liberal democrat stood iraq said also provided strong opposition government plan card anti terror measure taxation said voted conservative good vote wasted people needed party listening concern party prepared stand party said prime minister responding claim tory party chairman liam said like labour dems soft crime support higher tax oppose controlled immigration support giving europe control life kennedy also outlined party plan impose income rate earnings year money would used help policy abolishing un

In [138]:
# Print or use user_relevant_documents as needed
for user_id, relevant_documents in user_relevant_documents.items():
    print(f"User {user_id} relevant documents:")
    for i, (document, similarity_score) in enumerate(relevant_documents, 1):
        print(f"{i}. Similarity Score: {similarity_score:.4f}\n {document}\n")


User 1 relevant documents:
1. Similarity Score: 0.6906
 dems unveil election slogan liberal democrat present real alternative forthcoming general election campaign charles kennedy said unveiling slogan party spring conference said glass ceiling ambition told delegate labour abused public trust tory failed oppose response conservative insisted party understood forgotten majority speaking harrogate kennedy said people want credible principled political party offer different vision britain liberal democrat stood iraq said also provided strong opposition government plan card anti terror measure taxation said voted conservative good vote wasted people needed party listening concern party prepared stand party said prime minister responding claim tory party chairman liam said like labour dems soft crime support higher tax oppose controlled immigration support giving europe control life kennedy also outlined party plan impose income rate earnings year money would used help policy abolishing un

In [24]:
from evaltools import evaluate, gen_eval_dataset

TypeError: 'type' object is not subscriptable

evaluar cin la funcion del profesor

In [25]:
# Now, let's evaluate the recommended documents for each user
for user_id, relevant_documents in user_relevant_documents.items():
    # Extracting query and relevance judgements for evaluation
    Q, R = gen_eval_dataset(len(relevant_documents), num_documents_to_retrieve)
    
    # Evaluate the recommended documents
    evaluate('all', Q, R)

NameError: name 'gen_eval_dataset' is not defined

# Using word embeddings

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Datos de ejemplo (debes reemplazarlos con tus propios datos)
documents = all_documents

# Tokenizar los documentos y crear objetos TaggedDocument
tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(documents)]
# Entrenar el modelo Doc2Vec

model = Doc2Vec(window=5, min_count=1, workers=4, epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

document_embeddings = [model.docvecs[str(i)] for i in range(len(documents))]

# Process user queries
for user_id, interests in user_interests.items():
#     interests preprocesamiento
    # Aggregate word embeddings for the user's interests to generate the query embedding
    interests = lemmatizer.lemmatize(''.join(interests))
    #     interests = stemmer.stem(''.join(interests))
#     interests = stemmer.(interests)
    query_embedding = model.infer_vector([interests])
    
    cosine_similarities = cosine_similarity([query_embedding], document_embeddings)[0]

    top_k=10
    top_documents_indices = cosine_similarities.argsort()[-top_k:][::-1]
    top_documents = [(documents[i], similarity_scores[i]) for i in top_documents_indices]
    
    
    # Store relevant documents for evaluation
    user_relevant_documents[user_id] = top_documents
    
    # Print or process relevant documents
    print("User", user_id, "Interests:", interests)
    for doc, score in relevant_documents:
        print("Document:", doc, "Similarity Score:", score)

# WORD EMBERDDINGS

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api

# Load pre-trained word embeddings
word_embeddings_model = api.load("word2vec-google-news-300")


In [26]:
# Function to generate document embeddings
def generate_document_embeddings(documents, word_embeddings_model):
    document_embeddings = []
    for doc in documents:
        # Aggregate word embeddings (e.g., by averaging)
        words = doc.split()
        embeddings = [word_embeddings_model[word] for word in words if word in word_embeddings_model]
        if embeddings:
            doc_embedding = np.mean(embeddings, axis=0)  # Average embeddings
            document_embeddings.append(doc_embedding)
    return np.array(document_embeddings)

# Function to retrieve relevant documents for a query
def retrieve_documents(query_embedding, document_embeddings, documents, top_k=10):
    similarity_scores = cosine_similarity(query_embedding.reshape(1, -1), document_embeddings)[0]
    top_documents_indices = similarity_scores.argsort()[-top_k:][::-1]
    top_documents = [(documents[i], similarity_scores[i]) for i in top_documents_indices]
    return top_documents

In [28]:

# Example documents 
documents = data['Text']
# Generate document embeddings
document_embeddings = generate_document_embeddings(documents, word_embeddings_model)

# Process user queries
for user_id, interests in user_interests.items():
    # Aggregate word embeddings for the user's interests to generate the query embedding
    query_embedding = np.mean([word_embeddings_model[word] for interest in interests for word in interest.split() if word in word_embeddings_model], axis=0)
    
    # Retrieve relevant documents for the user's query
    relevant_documents = retrieve_documents(query_embedding, document_embeddings, documents)

    # Store relevant documents for evaluation
    user_relevant_documents[user_id] = relevant_documents
    
    # Print or process relevant documents
    print("User", user_id, "Interests:", interests)
    for doc, score in relevant_documents:
        print("     Document:", doc, "Similarity Score:", score)
    

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2225,) + inhomogeneous part.

     Document: munster switched spain munster heineken quarter final biarritz april switched real sociedad paseo anoeta stadium sebastian real ground hold whereas parc sport aguilera biarritz capacity irish province given least ticket decision move difficult considered fan primary objective said biarritz chairman marcel martin hope rewarded huge crowd behaving best rugby tradition match first heineken fixture played spain expected attract biggest ever attendance rugby match country ulster last irish team play paseo anoeta stadium faced euskarians side season tour Similarity Score: 0.5979421


In [None]:
user_relevant_documents

In [None]:
for user_id, interests in user_interests.items():
    print(interests)

In [None]:
from evaltools import evaluate
# Initialize Q and R
Q = []
R = []

# Process user queries
for user_id, interests in user_interests.items():
    # Aggregate word embeddings for the user's interests to generate the query embedding
    query_embedding = np.mean([word_embeddings_model[word] for interest in interests for word in interest.split() if word in word_embeddings_model], axis=0)
    
    # Retrieve relevant documents for the user's query
    relevant_documents = retrieve_documents(query_embedding, document_embeddings, documents)

    # Store relevant documents for evaluation
    user_relevant_documents[user_id] = relevant_documents
    
    # Add user's interests to Q
    Q.append(interests)
    
    # Add relevant documents to R
    R.append([score for doc, score in relevant_documents])
# Now you can call the evaluate function
evaluate('all', Q, R)