In [1]:
import re
import os
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from collections import defaultdict

In [2]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [3]:
#REad the stop words list
stop_words_paht='data/reuters/stopwords'
stop_words=[]
with open(stop_words_paht,'r') as file:
    content=file.read().split('\n')
    stop_words.append(content)
    

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
data_directory = 'data/reuters/training/'
filenames=os.listdir(data_directory)
filenames.sort(key=lambda x: int(x))

In [8]:
word_counts = defaultdict(int)
wset = set()
nltk.download('stopwords')
stop_words2 = set(stopwords.words('english'))
stem_docs = {}
real_doc={}

def load_data2():
    for filename in filenames:
        with open(os.path.join(data_directory, filename), 'r') as file:
            content = file.read()
            real_doc[filename]=content
            content = normalize_text(content)
            sdoc = []
            for word in content.lower().translate(str.maketrans('', '', string.punctuation)).split(" "):
                if word not in stop_words[0] or word not in stop_words2:
                    word = lemmatizer.lemmatize(word)
                    if word not in stop_words[0] or word not in stop_words2:
                        wset.add(word)
                        word_counts[word] += 1
                        sdoc.append(word) 
            sdoc_concatenates=" ".join(sdoc)
            stem_docs[filename] = sdoc_concatenates  
                                                 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jeffersonc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
load_data2()
# Crear listas para almacenar los datos del DataFrame
filenames_list = []
original_text_list = []
stemmed_text_list = []

# Iterar sobre cada nombre de archivo y sus correspondientes textos originales y procesados
for filename, original_text in real_doc.items():
    filenames_list.append(filename)
    original_text_list.append(original_text)
    stemmed_text_list.append(stem_docs[filename])

# Crear el DataFrame
df = pd.DataFrame({
    'filename': filenames_list,
    'original_text': original_text_list,
    'stemmed_text': stemmed_text_list
})
df

Unnamed: 0,filename,original_text,stemmed_text
0,1,BAHIA COCOA REVIEW\n Showers continued throug...,bahia cocoa review shower continued throughout...
1,5,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESER...,national average price farmerowned reserve u a...
2,6,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS\...,argentine grainoilseed registration argentine ...
3,9,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT...,champion product ltch approves stock split cha...
4,10,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,computer terminal system ltcpml completes sale...
...,...,...,...
7764,14779,BANK OF JAPAN INTERVENES SOON AFTER TOKYO OPEN...,bank japan intervenes soon tokyo opening bank ...
7765,14783,JAPAN RUBBER STOCKS FALL IN MARCH\n Japan's r...,japan rubber stock fall march japan rubber sto...
7766,14785,SOUTH KOREAN WON FIXED AT 25-MONTH HIGH\n THE...,south korean won fixed month high bank korea s...
7767,14805,NIPPON MINING LOWERS COPPER PRICE\n Nippon Mi...,nippon mining lower copper price nippon mining...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Creamos una instancia de CountVectorizer para BoW
count_vectorizer = CountVectorizer(binary=True)
tfidf_vectorizer = TfidfVectorizer()
# Aplicar Bag of Words (BoW)
bow_matrix = count_vectorizer.fit_transform(df['stemmed_text'])
bow_feature_names = count_vectorizer.get_feature_names_out()
# Aplicar TF-IDF 
tfidf_matrix = tfidf_vectorizer.fit_transform(df['stemmed_text']) 
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [12]:
count_vectorizer.vocabulary_

{'bahia': 1653,
 'cocoa': 3711,
 'review': 20937,
 'shower': 22134,
 'continued': 4289,
 'throughout': 24122,
 'week': 25723,
 'zone': 26335,
 'alleviating': 637,
 'drought': 6023,
 'since': 22269,
 'early': 6156,
 'january': 10561,
 'improving': 9805,
 'prospect': 19593,
 'coming': 3846,
 'temporao': 23903,
 'although': 722,
 'normal': 17431,
 'humidity': 9518,
 'level': 11444,
 'restored': 20833,
 'comissaria': 3847,
 'smith': 22447,
 'said': 21380,
 'weekly': 25728,
 'dry': 6035,
 'period': 18633,
 'mean': 16038,
 'late': 11252,
 'year': 26204,
 'arrival': 1239,
 'ended': 6522,
 'february': 7307,
 'bag': 1647,
 'kilo': 10936,
 'making': 15653,
 'cumulative': 4765,
 'total': 24326,
 'season': 21691,
 'mln': 16521,
 'stage': 22861,
 'last': 11243,
 'seems': 21763,
 'delivered': 5170,
 'earlier': 6154,
 'consignment': 4181,
 'included': 9853,
 'figure': 7434,
 'still': 23063,
 'doubt': 5918,
 'much': 16790,
 'old': 17787,
 'crop': 4672,
 'available': 1540,
 'harvesting': 8982,
 'practi

In [13]:
query = "income"  # Ejemplo de consulta
query_bow = count_vectorizer.transform([query])  #vectorizamos al query 

In [15]:
def similitud_jaccard(query, text):
    interseccion = np.sum(np.logical_and(query, text))
    union = np.sum(np.logical_or(query, text))
    jaccard_score = interseccion / union if union != 0 else 0.0
    return jaccard_score

import time
start=time.time()
jaccard_similarities1 = []
for idx in range(bow_matrix.shape[0]):
    
    a=query_bow.toarray().squeeze()
    b=bow_matrix[idx].toarray().squeeze()
    similarity=similitud_jaccard(a,b)

    jaccard_similarities1.append(similarity)
end=time.time()
print(end-start)




1.3969545364379883


In [16]:
sorted_indices1 = np.argsort(jaccard_similarities1)[::-1]

In [17]:
i=0
for idx in sorted_indices1:
    filename=df['filename'].iloc[idx]
    texto = df['stemmed_text'].iloc[idx]
    print(f"Texto #{filename} - Similitud Jaccard: {jaccard_similarities1[idx]}")
    print(texto)
    print("-----------------------------------------------------")
    if i>5:
        break
    else:
        i+=1

Texto #7098 - Similitud Jaccard: 0.16666666666666666
u personal income rose pct february spending pct u personal income rose pct february spending pct
-----------------------------------------------------
Texto #8016 - Similitud Jaccard: 0.08333333333333333
montgomery street income ltmts monthly dividend mthly div ct v ct pay april record april
-----------------------------------------------------
Texto #508 - Similitud Jaccard: 0.07692307692307693
ltfranklin california taxfree income fundpayout mthly div ct v ct prior pay march record march two
-----------------------------------------------------
Texto #512 - Similitud Jaccard: 0.07142857142857142
ltfranklin federal taxfree income fund payout mthly div ct v ct prior pay march record march two
-----------------------------------------------------
Texto #513 - Similitud Jaccard: 0.06666666666666667
ltfranklin new york taxfree income fund payout mthly div ct v ct prior pay march record march two
-----------------------------------------

In [None]:
query_tfidf =tfidf_vectorizer.transform([query]) #vectorizamos a tf-idf la query

# Cosine similarity using numpy
def cosine_sim(a,b):
    if np.linalg.norm(a)*np.linalg.norm(b)>0.0:        
        return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    else:
        return 0.0
    
cosine_distances_list = []
for idx in range(bow_matrix.shape[0]):
    distance=cosine_sim(bow_matrix[idx].toarray().squeeze(),query_tfidf.toarray().squeeze())
    
    cosine_distances_list.append(distance)
    
    
sorted_indices_cos = np.argsort(cosine_distances_list)[::-1]


i=0
print("Query: ",query)
for idx2 in sorted_indices_cos:
    if cosine_distances_list[idx2]>0.0:
        filename=df['filename'].iloc[idx2]
        texto = df['stemmed_text'].iloc[idx2]
        print(f"Texto #{filename} - Distancia coseno: {cosine_distances_list[idx2]}")
        print(texto)
        print("-----------------------------------------------------")
    else:
        break
   

In [19]:
archivo_resultados = 'data/reuters/cats.txt'

consultas_resultados = {}

with open(archivo_resultados, 'r') as file:
    for line in file:
        line = line.strip()
        
        if line.startswith('training'):
            parts = line.split()
            archivo = parts[0]
            archivo= archivo.split('/')[1].strip().split()[0]
            consulta = ' '.join(parts[1:])
            if consulta in consultas_resultados:
                consultas_resultados[consulta].add(archivo)
            else:
                consultas_resultados[consulta] = {archivo}
                


In [21]:


def query_to_vectorBow(query):
    return count_vectorizer.transform([query])
    
    
def similitud_jaccard(query, text):
    interseccion = np.sum(np.logical_and(query, text))
    union = np.sum(np.logical_or(query, text))
    jaccard_score = interseccion / union if union != 0 else 0.0
    return jaccard_score


def calculate_jaccard(query):
    #start=time.time()
    jaccard_similarities1 = []
    best_titles_jaccard=[]
    query_bow=query_to_vectorBow(query)
    for idx in range(bow_matrix.shape[0]):        
        a=query_bow.toarray().squeeze()
        b=bow_matrix[idx].toarray().squeeze()
        similarity=similitud_jaccard(a,b)

        jaccard_similarities1.append(similarity)
    sorted_indices = np.argsort(jaccard_similarities1)[::-1]
    for idx in sorted_indices:
        if jaccard_similarities1[idx]>0.0:
            filename=df['filename'].iloc[idx]
            best_titles_jaccard.append(filename)
    return best_titles_jaccard
        
    #end=time.time()
    #print(end-start)
    

def get_true_positives(predicted, truth):
    true_positives = 0
    for value in predicted:
        if value in truth:
            true_positives += 1

    #print("True positives", true_positives)
    return true_positives


def get_false_negatives(predicted, truth):
    set_predicted = set(predicted)
    set_verdaderos = set(truth)
    false_negatives_list = list(set_verdaderos-set_predicted)
    false_negatives = len(false_negatives_list)

    #print("False negatives", false_negatives)
    return false_negatives


def get_false_positive(predicted, truth):
    set_predicted = set(predicted)
    set_verdaderos = set(truth)
    false_positives_list = list(set_predicted-set_verdaderos)
    false_positives = len(false_positives_list)
    #print("False positives", false_positives)

    return false_positives


In [23]:
#Para jaccard usando BoW
recall_results=[]
precision_results=[]
f1_score_results=[]
lemmatizer = WordNetLemmatizer()
for key in consultas_resultados:
    #print("query: ", key)
    truth=consultas_resultados[key]
    key_lematized=lemmatizer.lemmatize(key)
    #print("key_lematized: ", key_lematized)
    best_titles_jaccard=calculate_jaccard(key_lematized)
    true_positives=get_true_positives(best_titles_jaccard,truth)
    false_negatives=get_false_negatives(best_titles_jaccard,truth)
    false_positives=get_false_positive(best_titles_jaccard,truth)
    if true_positives + false_negatives == 0:
        recall = 0  
    else:
        recall = true_positives / (true_positives + false_negatives)
    
    recall_results.append(recall)
    
    if true_positives + false_positives == 0:
        precision = 0  
    else:
        precision = true_positives / (true_positives + false_positives)
    
    precision_results.append(precision)
    
    if ((2*true_positives)+false_negatives+false_positives)==0:
        f1_score=0
    else:
        f1_score=(2*true_positives)/((2*true_positives)+false_negatives+false_positives)
    f1_score_results.append(f1_score)
    

promedio_recall=sum(recall_results)/len(recall_results)
promedio_precision=sum(precision_results)/len(precision_results)    
promedio_f1_score=sum(f1_score_results)/len(f1_score_results)

print(f"Promedio de Recall jaccard: {promedio_recall}")
print(f"Promedio de Precisión jaccard: {promedio_precision}")
print(f"Promedio f1 score jaccard: {promedio_f1_score}")    

Promedio de Recall jaccard: 0.8378790622708628
Promedio de Precisión jaccard: 0.03297630019929637
Promedio f1 score jaccard: 0.04514864039827736


In [24]:
# Cosine similarity using numpy
def cosine_sim(a,b):
    if np.linalg.norm(a)*np.linalg.norm(b)>0.0:        
        return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    else:
        return 0.0
    
    
def calculate_cosine(query):
    #start=time.time()
    distances = []
    best_titles_cosine=[]
    query_bow=query_to_vectorBow(query)
    for idx in range(bow_matrix.shape[0]):        
        a=query_bow.toarray().squeeze()
        b=bow_matrix[idx].toarray().squeeze()
        similarity=cosine_sim(a,b)

        distances.append(similarity)
    sorted_indices = np.argsort(jaccard_similarities1)[::-1]
    for idx in sorted_indices:
        if distances[idx]>0.0:
            filename=df['filename'].iloc[idx]
            best_titles_cosine.append(filename)
    return best_titles_cosine

In [25]:
#Para TF-IDF con cosine
recall_results=[]
precision_results=[]
f1_score_results=[]
lemmatizer = WordNetLemmatizer()
for key in consultas_resultados:
    truth=consultas_resultados[key]
    key_lematized=lemmatizer.lemmatize(key)
    best_titles=calculate_cosine(key_lematized)
    true_positives=get_true_positives(best_titles,truth)
    false_negatives=get_false_negatives(best_titles,truth)
    false_positives=get_false_positive(best_titles,truth)
    if true_positives + false_negatives == 0:
        recall = 0  
    else:
        recall = true_positives / (true_positives + false_negatives)
    
    recall_results.append(recall)
    
    if true_positives + false_positives == 0:
        precision = 0  
    else:
        precision = true_positives / (true_positives + false_positives)
    
    precision_results.append(precision)
    
    if ((2*true_positives)+false_negatives+false_positives)==0:
        f1_score=0
    else:
        f1_score=(2*true_positives)/((2*true_positives)+false_negatives+false_positives)
    f1_score_results.append(f1_score)
    

promedio_recall=sum(recall_results)/len(recall_results)
promedio_precision=sum(precision_results)/len(precision_results)    
promedio_f1_score=sum(f1_score_results)/len(f1_score_results)

print(f"Promedio de Recall cosine: {promedio_recall}")
print(f"Promedio de Precisión cosine: {promedio_precision}")
print(f"Promedio f1 score cosine: {promedio_f1_score}")
    

Promedio de Recall cosine: 0.8378790622708628
Promedio de Precisión cosine: 0.03297630019929637
Promedio f1 score cosine: 0.04514864039827736
