## Realizamos todas las importaciones necesarias

In [32]:
import os
import shutil
import re
import os
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import spacy
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score

### 1. Primero vamos a transformar todos nuestros archivos con la extension .txt para poder manipularlos

In [2]:
# Specify the path to the folder you want to process
src_folder = 'reuters/training'
# Specify the path to the destination folder
dest_folder = 'reuters/training_txt'

# Create the destination folder if it doesn't exist
os.makedirs(dest_folder, exist_ok=True)

# Iterate over all the files in the specified folder
for filename in os.listdir(src_folder):
    # Get the full path of the file
    src_file_path = os.path.join(src_folder, filename)
    # Define the destination file path with .txt extension
    dest_file_path = os.path.join(dest_folder, f"{filename}.txt")
    # Copy the file to the destination folder with .txt extension
    shutil.copy(src_file_path, dest_file_path)
print("Fixed file format")

Fixed file format


### 2. Ahora vamos a eliminar las stop words de nuestra carpeta ya con la extension correcta

In [3]:
# Load stop words from the stop words file
stop_words_file = 'reuters/stopwords.txt'
with open(stop_words_file, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().split())

# Specify the path to the source folder and the destination folder
src_folder = 'reuters/training_txt'
dest_folder = 'reuters/training_stop_words_txt'

# Create the destination folder if it doesn't exist
os.makedirs(dest_folder, exist_ok=True)

# Function to read file with different encodings
def read_file_with_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()

# Iterate over all the files in the specified folder
for filename in os.listdir(src_folder):
    # Get the full path of the source file
    src_file_path = os.path.join(src_folder, filename)
    
    # Read the content of the file using the function
    content = read_file_with_encodings(src_file_path)
        
    # Remove stop words from the content
    cleaned_content = ' '.join([word for word in content.split() if word.lower() not in stop_words])
        
    # Define the destination file path
    dest_file_path = os.path.join(dest_folder, filename)
        
    # Write the cleaned content to the destination file
    with open(dest_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
        
print("Stop words removal and file saving completed.")

Stop words removal and file saving completed.


### 3. El siguiente paso es eliminar los caracteres especiales

In [4]:
# Directories
input_directory = 'reuters/training_stop_words_txt'
output_directory = 'reuters/final_txt'

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Function to clean text
def clean_text(text):
    # Remove special characters using regex
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    return cleaned_text

# Process each file in the input directory
for filename in os.listdir(input_directory):
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()
        cleaned_content = clean_text(content)
    
    # Write the cleaned content to a new file in the output directory
    with open(os.path.join(output_directory, filename), 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_content)

print("All files have been processed and cleaned.")


All files have been processed and cleaned.


### 4. Stematizar y lematizar 

In [5]:
# Cargar el modelo de lenguaje de spaCy
nlp = spacy.load('en_core_web_sm')

# Inicializar el stemmer
stemmer = snowballstemmer.stemmer('english')

In [7]:
# Función para lematizar y stematizar texto
def preprocess_text(text):
    doc = nlp(text)
    stemmed_tokens = [stemmer.stemWord(token.text) for token in doc]
    lemmatized_tokens = [token.lemma_ for token in doc]
    return ' '.join(stemmed_tokens), ' '.join(lemmatized_tokens)


# Directorios de entrada y salida
input_dir = 'reuters/final_txt'
output_dir_stemmed = 'final/Stemmed'
output_dir_lemmatized = 'final/lemmatized'

# Crear directorios de salida si no existen
os.makedirs(output_dir_stemmed, exist_ok=True)
os.makedirs(output_dir_lemmatized, exist_ok=True)

# Procesar cada archivo en el directorio de entrada
for filename in os.listdir(input_dir):
    if filename.endswith('.txt'):
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            stemmed_text, lemmatized_text = preprocess_text(text)
        
        # Guardar texto stematizado
        with open(os.path.join(output_dir_stemmed, filename), 'w', encoding='utf-8') as file:
            file.write(stemmed_text)
        
        # Guardar texto lematizado
        with open(os.path.join(output_dir_lemmatized, filename), 'w', encoding='utf-8') as file:
            file.write(lemmatized_text)

print("Procesamiento completado.")


Procesamiento completado.


## 5. Leer archivos limpios

Ahora que ya tenemos las dos carpetas limpias vamos a leer cada una y guardar para poder hacer BoW y TF-IDF

### 5.1 Carpeta de Lemmatized

In [9]:
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Directorio de la carpeta que contiene los archivos .txt
src_folder = 'final/lemmatized'

# Función para leer archivos con diferentes codificaciones
def read_file_with_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except:
            continue
    raise ValueError(f"Unable to read the file {file_path} with available encodings.")

# Leer todos los archivos .txt en la carpeta
documents_lemmatized = []
filenames_lemmatized = []
for filename in os.listdir(src_folder):
    if filename.endswith('.txt'):
        file_path = os.path.join(src_folder, filename)
        content = read_file_with_encodings(file_path)
        documents_lemmatized.append(content)
        filenames_lemmatized.append(filename)

# Verificar que se hayan leído los archivos
print(f"Archivos leídos: {len(documents_lemmatized)}")


Archivos leídos: 7769


### 5.2 Carpeta de Stemmed

In [11]:
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Directorio de la carpeta que contiene los archivos .txt
src_folder = 'final/Stemmed'

# Función para leer archivos con diferentes codificaciones
def read_file_with_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except:
            continue
    raise ValueError(f"Unable to read the file {file_path} with available encodings.")

# Leer todos los archivos .txt en la carpeta
documents_stemmed = []
filenames_stemmed = []
for filename in os.listdir(src_folder):
    if filename.endswith('.txt'):
        file_path = os.path.join(src_folder, filename)
        content = read_file_with_encodings(file_path)
        documents_stemmed.append(content)
        filenames_stemmed.append(filename)

# Verificar que se hayan leído los archivos
print(f"Archivos leídos: {len(documents_stemmed)}")


Archivos leídos: 7769


## 6.Vectorización utilizando Bag of Words

In [13]:
# Función para vectorizar textos utilizando Bag of Words y TF-IDF
def Bag_of_Words(texts):
    # Vectorización usando Bag of Words
    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(texts)

    return X_bow, vectorizer

### 6.1 Utilizando la carpeta Lemmatized

In [26]:
# Vectorizar los documentos
X_bow_lemmatized, bow_vectorizer_lemmatized = Bag_of_Words(documents_lemmatized)

# Ver los resultados de Bag of Words
print("Bag of Words (BoW) Lemmatized:")
print(X_bow_lemmatized.toarray())
print("Caracteristicas de BoW Lemmatized:", bow_vectorizer_lemmatized.get_feature_names_out())


Bag of Words (BoW) Lemmatized:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Caracteristicas de BoW Lemmatized: ['000' '0006913' '0006916' ... 'zuyuan' 'zverev' 'zzzz']


### 6.2 Utilizando la carpeta Stemmed

In [27]:
# Vectorizar los documentos
X_bow_stemmed, bow_vectorizer_stemmed = Bag_of_Words(documents_stemmed)

# Ver los resultados de Bag of Words
print("Bag of Words (BoW) Stemmed:")
print(X_bow_stemmed.toarray())
print("Caracteristicas de BoW Stemmed:", bow_vectorizer_stemmed.get_feature_names_out())

Bag of Words (BoW) Stemmed:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Caracteristicas de BoW Stemmed: ['000' '0006913' '0006916' ... 'zuyuan' 'zverev' 'zzzz']


## 7.Vectorización utilizando TF-IDF

In [19]:
# Función para vectorizar textos utilizando Bag of Words y TF-IDF
def TF_IDF(texts):
    # Vectorización usando TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    X_tfidf = tfidf_vectorizer.fit_transform(texts)
    
    return X_tfidf, tfidf_vectorizer

### 7.1 Utilizando la carpeta Lemmatized

In [28]:
# Vectorizar los documentos
X_tfidf_lemmatized, tfidf_vectorizer_lemmatized = TF_IDF(documents_lemmatized)

# Ver los resultados de TF-IDF
print("TF-IDF Lemmatized:")
print(X_tfidf_lemmatized.toarray())
print("Caracteristicas de TF-IDF Lemmatized:", tfidf_vectorizer_lemmatized.get_feature_names_out())

TF-IDF Lemmatized:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Caracteristicas de TF-IDF Lemmatized: ['000' '0006913' '0006916' ... 'zuyuan' 'zverev' 'zzzz']


### 7.1 Utilizando la carpeta Stemmed

In [29]:
# Vectorizar los documentos
X_tfidf_Stemmed, tfidf_vectorizer_Stemmed = TF_IDF(documents_stemmed)

# Ver los resultados de TF-IDF
print("TF-IDF Stemmed:")
print(X_tfidf_Stemmed.toarray())
print("Caracteristicas de TF-IDF Stemmed:", tfidf_vectorizer_Stemmed.get_feature_names_out())

TF-IDF Stemmed:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Caracteristicas de TF-IDF Stemmed: ['000' '0006913' '0006916' ... 'zuyuan' 'zverev' 'zzzz']


## 8. Ahora todo vamos a transformar a 4 dataframes

In [30]:
## Bag of Word
df_bow_lemmatized = pd.DataFrame(X_bow_lemmatized.toarray(), columns=bow_vectorizer_lemmatized.get_feature_names_out(), index=filenames_lemmatized)
df_bow_stemmed = pd.DataFrame(X_bow_stemmed.toarray(), columns=bow_vectorizer_stemmed.get_feature_names_out(), index=filenames_stemmed)

# TF_IDF
df_tfidf_lemmatized = pd.DataFrame(X_tfidf_lemmatized.toarray(), columns=tfidf_vectorizer_lemmatized.get_feature_names_out(), index=filenames_lemmatized)
df_tfidf_stemmed = pd.DataFrame(X_tfidf_Stemmed.toarray(), columns=tfidf_vectorizer_Stemmed.get_feature_names_out(), index=filenames_stemmed)


## 9. Query

In [43]:
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(documents_lemmatized)

In [44]:
query = "NIPPON MINING LOWERS COPPER PRICE Nippon Mining Co Ltd said it lowered its selling price for electrolytic copper by 10,000 yen per tonne to 260,000, effective immediately."

query_vector = vectorizer_bow.transform([query])
print(query_vector)

  (0, 0)	2
  (0, 166)	1
  (0, 4788)	1
  (0, 14852)	1
  (0, 15793)	1
  (0, 16453)	2
  (0, 18279)	1
  (0, 18357)	1
  (0, 19659)	1
  (0, 21682)	1
  (0, 22413)	1
  (0, 22432)	1
  (0, 23768)	1
  (0, 24697)	1
  (0, 28442)	2
  (0, 29268)	2
  (0, 30667)	1
  (0, 31444)	2
  (0, 33323)	1
  (0, 33772)	1
  (0, 36133)	1
  (0, 36192)	1
  (0, 38123)	1


## 10. Jaccard con Bag of Words

### 10.1 Para la carpete Lemmatized

In [49]:
# Calcular coeficiente de Jaccard entre los conjuntos
jaccard_coefficient_lemmatized = jaccard_score(X_bow, query_vector)

ValueError: Found input variables with inconsistent numbers of samples: [7769, 1]

4. Creamos el Bag of Words con CountVectorizer de sklearn

In [None]:
# Step 1: Read all .txt files from the directory
directory = 'reuters/final_txt'

filenames = []
all_sentences = []

for filename in os.listdir(directory):
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()
        all_sentences.append(content)
        filenames.append(filename)


print(all_sentences)

# Step 3: Use CountVectorizer to vectorize the text data
# Each row represents a document.
# Each column represents a unique token (word) from the corpus.
# Each entry in the matrix represents the count of the token in the corresponding document.
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(all_sentences)

terms_bow = vectorizer_bow.get_feature_names_out()

#
# print(all_sentences)

# Step 4: Print the resulting bag-of-words array
X_bow = X_bow.toarray()

## Cosine Similarity based on the BoW

In [None]:

# Calculate cosine similarity scores
cosine_sim_scores = cosine_similarity(X_bow, query_vector)

# Create a DataFrame to store document filenames and their similarity scores
similarity_df = pd.DataFrame({'Filename': filenames, 'Cosine_Similarity': cosine_sim_scores.flatten()})

# Sort documents based on similarity scores
similarity_df = similarity_df.sort_values(by='Cosine_Similarity', ascending=False)

# Display the ranked documents
print(similarity_df)


# Print the content of the top three ranked documents
top_ten_filenames = similarity_df.head(10)['Filename'].tolist()

original_directory = 'reuters/test_txt'


for filename in top_ten_filenames:
    with open(os.path.join(original_directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()
        print(f"Document: {filename}")
        print(content)
        print("\n")

5. Vectorizamos los textos usando TFIDF

In [None]:
# Each row represents a document.
# Each column represents a unique token (word) from the corpus.
# Each entry in the matrix represents the TF-IDF score of the token in the corresponding document.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(all_sentences)


# Print the resulting TF-IDF array
print(X_tfidf.toarray())


# Get feature names (terms)
terms = vectorizer_tfidf.get_feature_names_out()

print(terms)

In [None]:
# Convert to DataFrame for readability
tfidf_df = pd.DataFrame(X_tfidf.toarray(), index=filenames, columns=terms)

# Display the TF-IDF array
print("TF-IDF Array:")

tfidf_df