## Realizamos todas las importaciones necesarias

In [45]:
import os
import shutil
import re
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import normalize
from sklearn.metrics import jaccard_score

### 1. Primero vamos a transformar todos nuestros archivos con la extension .txt para poder manipularlos

In [1]:
# Specify the path to the folder you want to process
src_folder = 'reuters/test'
# Specify the path to the destination folder
dest_folder = 'reuters/test_txt'

# Create the destination folder if it doesn't exist
os.makedirs(dest_folder, exist_ok=True)

# Iterate over all the files in the specified folder
for filename in os.listdir(src_folder):
    # Get the full path of the file
    src_file_path = os.path.join(src_folder, filename)
        
    # Define the destination file path with .txt extension
    dest_file_path = os.path.join(dest_folder, f"{filename}.txt")
    # Copy the file to the destination folder with .txt extension
    shutil.copy(src_file_path, dest_file_path)
print("Fixed file format")

Fixed file format


2. Ahora vamos a eliminar las stop words de nuestra carpeta ya con la extension correcta

In [6]:
# Load stop words from the stop words file
stop_words_file = 'reuters/stopwords.txt'
with open(stop_words_file, 'r', encoding='utf-8') as file:
    stop_words = set(file.read().split())

# Specify the path to the source folder and the destination folder
src_folder = 'reuters/test_txt'
dest_folder = 'reuters/cleaned_txt'

# Create the destination folder if it doesn't exist
os.makedirs(dest_folder, exist_ok=True)

# Function to read file with different encodings
def read_file_with_encodings(file_path):
    encodings = ['utf-8', 'latin-1', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise UnicodeDecodeError(f"Failed to decode {file_path} with available encodings")

# Iterate over all the files in the specified folder
for filename in os.listdir(src_folder):
    # Get the full path of the source file
    src_file_path = os.path.join(src_folder, filename)
    
    # Read the content of the file using the function
    try:
        content = read_file_with_encodings(src_file_path)
    except UnicodeDecodeError as e:
        print(e)
        continue
        
    # Remove stop words from the content
    cleaned_content = ' '.join([word for word in content.split() if word.lower() not in stop_words])
        
    # Define the destination file path
    dest_file_path = os.path.join(dest_folder, filename)
        
    # Write the cleaned content to the destination file
    with open(dest_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
        
print("Stop words removal and file saving completed.")

Stop words removal and file saving completed.


3. El siguiente paso es eliminar los caracteres especiales

In [5]:
# Directories
input_directory = 'reuters/cleaned_txt'
output_directory = 'reuters/final_txt'

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Function to clean text
def clean_text(text):
    # Remove special characters using regex
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    return cleaned_text

# Process each file in the input directory
for filename in os.listdir(input_directory):
    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()
        cleaned_content = clean_text(content)
    
    # Write the cleaned content to a new file in the output directory
    with open(os.path.join(output_directory, filename), 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_content)

print("All files have been processed and cleaned.")


All files have been processed and cleaned.


4. Creamos el Bag of Words con CountVectorizer de sklearn

In [14]:
# Step 1: Read all .txt files from the directory
directory = 'reuters/final_txt'

filenames = []
all_sentences = []

for filename in os.listdir(directory):
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()
        all_sentences.append(content)
        filenames.append(filename)


print(all_sentences)

# Step 3: Use CountVectorizer to vectorize the text data
# Each row represents a document.
# Each column represents a unique token (word) from the corpus.
# Each entry in the matrix represents the count of the token in the corresponding document.
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(all_sentences)

terms_bow = vectorizer_bow.get_feature_names_out()

#
# print(all_sentences)

# Step 4: Print the resulting bag-of-words array
X_bow = X_bow.toarray()



## Cosine Similarity based on the BoW

In [37]:
# Vectorize the query
query = "1988 prominent local businessman said AR Sendi Sunday Ugandas Ministry Industry supports plan build factory roast grind pack local coffee export Construction start December factory ready October Marketing Ministry authorised Coffee Marketing Board supply company ltUnipack 24000 tonnes beans year processing export Sendi told reporters negotiated 697 mln French franc loan Banque Industrielle dAfrique Oriental Paris Uganda worlds largest coffee producer expects produce 200000 tonnes year Market sources roasted coffee exports benefit Ugandas economy included 238 mln 60 kg bag export quota assigned country latest International Coffee Agreement addition roasted coffee substantially higher unroasted beans"
query_vector = vectorizer_bow.transform([query]).toarray()

print(query_vector)

[[0 0 0 ... 0 0 0]]


In [38]:

# Calculate cosine similarity scores
cosine_sim_scores = cosine_similarity(X_bow, query_vector)

# Create a DataFrame to store document filenames and their similarity scores
similarity_df = pd.DataFrame({'Filename': filenames, 'Cosine_Similarity': cosine_sim_scores.flatten()})

# Sort documents based on similarity scores
similarity_df = similarity_df.sort_values(by='Cosine_Similarity', ascending=False)

# Display the ranked documents
print(similarity_df)


# Print the content of the top three ranked documents
top_ten_filenames = similarity_df.head(10)['Filename'].tolist()

original_directory = 'reuters/test_txt'


for filename in top_ten_filenames:
    with open(os.path.join(original_directory, filename), 'r', encoding='utf-8') as file:
        content = file.read()
        print(f"Document: {filename}")
        print(content)
        print("\n")

       Filename  Cosine_Similarity
351   21567.txt           0.934558
68    16097.txt           0.437057
2563  19387.txt           0.333894
2548  19807.txt           0.314690
1788  15737.txt           0.312185
...         ...                ...
1471  16437.txt           0.000000
1472  19704.txt           0.000000
1484  16406.txt           0.000000
1490  19906.txt           0.000000
0     15603.txt           0.000000

[3019 rows x 2 columns]
Document: 21567.txt
UGANDA PLANS TO EXPORT ROASTED COFFEE TO EUROPE
  Uganda plans to export roasted coffee to
  Europe by the end of 1988, a prominent local businessman said.
      A.R. Sendi said on Sunday that Uganda's Ministry of
  Industry supports his plan to build a factory to roast, grind
  and pack local coffee for export. Construction will start in
  December and the factory should be ready by next October.
      He said the Marketing Ministry has authorised the Coffee
  Marketing Board to supply his company &lt;Unipack> with 24,000
  tonn

5. Vectorizamos los textos usando TFIDF

In [4]:
# Each row represents a document.
# Each column represents a unique token (word) from the corpus.
# Each entry in the matrix represents the TF-IDF score of the token in the corresponding document.
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(all_sentences)


# Print the resulting TF-IDF array
print(X_tfidf.toarray())


# Get feature names (terms)
terms = vectorizer_tfidf.get_feature_names_out()

print(terms)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['0007200' '0008' '001' ... 'zuheir' 'zurich' 'zwermann']


In [5]:
# Convert to DataFrame for readability
tfidf_df = pd.DataFrame(X_tfidf.toarray(), index=filenames, columns=terms)

# Display the TF-IDF array
print("TF-IDF Array:")

tfidf_df

TF-IDF Array:


Unnamed: 0,0007200,0008,001,002,0027,005,0057,00667,007,008,...,zoete,zollinger,zondervan,zondervanltzondo,zone,zones,zoran,zuheir,zurich,zwermann
15603.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15617.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15171.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16478.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19993.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16461.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16307.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20879.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19020.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
