In [43]:
import json
import re

# Read the raw corpus as text so it can be sanitised before JSON parsing.
with open('corpus.json', 'r', encoding='utf-8') as file:
    content = file.read()

# Strip C0 (\x00-\x1F) and DEL/C1 (\x7F-\x9F) control characters.
# Note: this range includes tab, carriage return and newline, so those are
# removed everywhere in the text, including inside string values.
cleaned_content = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', content)

try:
    data = json.loads(cleaned_content)
except json.JSONDecodeError as e:
    print(f"Having trouble loading JSON: {e}")

    # e.pos is the character offset in cleaned_content where parsing failed;
    # show up to 50 characters on either side to help locate the bad input.
    error_location = e.pos
    start = max(0, error_location - 50)
    end = min(len(cleaned_content), error_location + 50)
    print(f"Context around the error: '{cleaned_content[start:end]}'")

    # Re-raise after printing the diagnostics: swallowing the error would leave
    # `data` undefined (or stale from a previous run) for the cells that use it.
    raise
else:
    print("JSON loaded successfully!")

JSON loaded successfully!


In [None]:
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assumes `data` has already been loaded by an earlier cell.
tokenized_articles = list(map(lambda entry: entry.get('artigo_tokenizado', []), data))
text_articles = list(map(' '.join, tokenized_articles))

# NOTE(review): both vectorizers below receive the space-joined text and apply
# their own default tokenization, so their vocabularies may differ from the
# one-hot vocabulary built directly from the raw tokens — confirm this is intended.

# 1. Bag of Words (BoW)
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(text_articles)
vocabulary_bow = bow_vectorizer.get_feature_names_out()
word_to_index_bow = dict(zip(vocabulary_bow, range(len(vocabulary_bow))))

# 2. TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_articles)
vocabulary_tfidf = tfidf_vectorizer.get_feature_names_out()
word_to_index_tfidf = dict(zip(vocabulary_tfidf, range(len(vocabulary_tfidf))))

# 3. One-Hot: one identity-matrix row per distinct token, in sorted order.
vocabulary_onehot = sorted(set().union(*tokenized_articles))
word_to_index_onehot = dict(zip(vocabulary_onehot, range(len(vocabulary_onehot))))
identity_matrix = np.identity(len(vocabulary_onehot))

# Helper function for retrieving the top-N most similar tokens
def get_top_similar_tokens(matrix, vocabulary, word_to_index, token, top_n=10):
    """Return the tokens most cosine-similar to ``token``.

    Each token is represented by its column of ``matrix``: the matrix is
    transposed so that row ``i`` is the vector of the token with index ``i``,
    and that row is compared against every token vector.

    Args:
        matrix: 2-D array or sparse matrix whose columns are indexed by
            ``word_to_index``. It must support ``.T``, row indexing and
            ``reshape``, and be accepted by ``cosine_similarity``.
        vocabulary: Sequence mapping a column index back to its token.
        word_to_index: Mapping from token to its column index in ``matrix``.
        token: The query token.
        top_n: Maximum number of neighbours to return. ``None`` returns every
            other token; zero or a negative value returns an empty list.

    Returns:
        Up to ``top_n`` tokens ordered by decreasing cosine similarity to
        ``token``, never including ``token`` itself. An empty list is returned
        when ``token`` is not in ``word_to_index``.
    """
    if token not in word_to_index:
        return []
    if top_n is not None and top_n <= 0:
        # A negative slice bound would otherwise return nearly the whole vocabulary.
        return []

    # Transposing works for both dense arrays and sparse matrices; no
    # densification is needed because cosine_similarity accepts either.
    word_vectors = matrix.T
    idx = word_to_index[token]
    vec = word_vectors[idx].reshape(1, -1)
    sims = cosine_similarity(vec, word_vectors)[0]

    # Indices ordered by decreasing similarity.
    sorted_idx = np.argsort(sims)[::-1]

    # Only the first top_n + 1 ranked entries can contribute (the extra one
    # covers the query token, which is filtered out below), so avoid walking
    # the entire vocabulary in Python on every call.
    limit = None if top_n is None else top_n + 1
    similar = [vocabulary[i] for i in sorted_idx[:limit] if i != idx]
    return similar[:top_n]


# Collect the top-10 most similar tokens for every token, per representation.
top_similar_bow = dict(
    (tok, get_top_similar_tokens(bow_matrix, vocabulary_bow, word_to_index_bow, tok))
    for tok in vocabulary_bow
)
top_similar_tfidf = dict(
    (tok, get_top_similar_tokens(tfidf_matrix, vocabulary_tfidf, word_to_index_tfidf, tok))
    for tok in vocabulary_tfidf
)
# The one-hot variant is currently disabled:
# top_similar_onehot = {tok: get_top_similar_tokens(identity_matrix, vocabulary_onehot, word_to_index_onehot, tok)
#                        for tok in vocabulary_onehot}


# Neighbour lists grouped by representation name.
overall_similarity = dict(
    bow=top_similar_bow,
    tfidf=top_similar_tfidf,
    # onehot=top_similar_onehot
)

overall_similarity

{'bow': {'aaai': ['groups',
   'links',
   'generator',
   'vs',
   'explicit',
   'bleu',
   'relied',
   'steer',
   'transformed',
   'finetuned'],
  'aae': ['kyriazi',
   'leahy',
   'hwu',
   'lavaud',
   'sudhakar',
   'suffer',
   'alpaca',
   'evoking',
   'vechtomova',
   'neo'],
  'aaron': ['suggestionignored',
   'molina',
   'owner',
   'studio',
   'stupid',
   'pallets',
   'aids',
   'carrie',
   'facepe',
   'captures'],
  'abacha': ['centered',
   'handled',
   'oncologists',
   'lems',
   'profiles',
   'xxl',
   'industries',
   'cal',
   'aforementioned',
   'phenotypes'],
  'abadi': ['softmax',
   'stillwell',
   'steinkraus',
   'carl',
   'marketing',
   'algorithmic',
   'carolina',
   'square',
   'martí',
   'categorizing'],
  'abadie': ['subtasks',
   'martinet',
   'hambro',
   'computationally',
   'pouget',
   'farley',
   'nash',
   'fake',
   'adjustable',
   'mirza'],
  'abbas': ['analytics',
   'equipment',
   'management',
   'goals',
   'powered',
  

In [59]:
# Dump each representation's complete vocabulary under its own banner.
# The BoW and TF-IDF vocabularies are converted with .tolist() before printing.
print("\n=== Vocabulary: BoW ===", vocabulary_bow.tolist(), sep="\n")

print("\n=== Vocabulary: TF-IDF ===", vocabulary_tfidf.tolist(), sep="\n")

print("\n=== Vocabulary: One-Hot ===", vocabulary_onehot, sep="\n")


=== Vocabulary: BoW ===

=== Vocabulary: TF-IDF ===

=== Vocabulary: One-Hot ===
