#Eric Azevedo de Oliveira

#AS02: Representação Textual


In [None]:
import re
from collections import defaultdict
from pprint import pprint

import nltk
import numpy as np
import pandas as pd
import scipy.sparse as sp
import sklearn
import spacy
from gensim.models import Word2Vec
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Load the training split of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
# C holds the category names only (not the posts); these short strings are
# the texts that the cells below tokenize and vectorize.
C = list(newsgroups_train.target_names)
# Bare expression: displays the list when run as a notebook cell.
C

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Toy corpus of three short sentences.
# NOTE(review): T is not referenced anywhere else in the visible file; the
# recorded 3x3 co-occurrence output further down suggests it was used as the
# corpus in an earlier run -- confirm whether it is still needed.
T =['The who is the band!',
    'who is the band?',
    'The band who plays the who.']

In [None]:
def get_tokens(text):
    """Split *text* into lowercase word tokens.

    Every character that is not a regex word character (letter, digit or
    underscore) is treated as a separator, so punctuation such as ".", "-"
    or "!" never ends up inside a token.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance (duplicates kept).
        The list is empty when *text* contains no word characters.
    """
    # Raw string: in a plain literal the backslash-w is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    tokens = re.sub(r"[^\w]", " ", text).split()
    return [w.lower() for w in tokens]


def tokenize(texts):
    """Build the vocabulary of a collection of texts.

    Args:
        texts: Iterable of strings; each one is tokenized with get_tokens().

    Returns:
        A list of the distinct tokens found across all texts, sorted in
        ascending order.
    """
    vocabulary = set()
    for document in texts:
        # The set discards repeated tokens as they arrive.
        vocabulary.update(get_tokens(document))
    return sorted(vocabulary)


# Vocabulary: the sorted, de-duplicated lowercase tokens of the category names in C.
V = tokenize(C)


#One-Hot Encoding

In [None]:
# One-hot encoding: write one line per category name to 20News_01.txt in the
# form "<name> = <vector>", where the vector has one slot per word of V and a
# 1 in every slot whose word occurs in the name.

# Column of each vocabulary word, so a token is located with one dict lookup
# instead of scanning V for every token.
word_index = {word: j for j, word in enumerate(V)}

with open("20News_01.txt", "w") as f:
    # The loop variable is called `name`, not `text`: at module level a
    # variable named `text` would rebind the `text` module imported from
    # sklearn.feature_extraction at the top of the file.
    for name in C:
        words = get_tokens(name)
        bag_vector = np.zeros(len(V))
        for w in words:
            j = word_index.get(w)
            if j is not None:
                bag_vector[j] = 1
        f.write("{0} = {1}\n".format(name, bag_vector))

#Count Vectors

In [None]:
def preprocessCorpus(corpus):
    """Lowercase every document and strip punctuation from it.

    A punctuation mark (one of ! ? . , ; : -) is removed only when there is
    no digit immediately before it and no digit immediately after it, so
    marks that touch a number (for example the dot in "3.14") are kept.

    Note that a matched mark is deleted, not replaced by a space: words that
    were separated only by punctuation are fused ("alt.atheism" becomes
    "altatheism").

    Args:
        corpus: Iterable of strings.

    Returns:
        A new list with one cleaned, lowercase string per input document.
    """
    # Compiled once and reused for every document. Calling the pattern's own
    # sub() also avoids passing `count` positionally to re.sub(), which is
    # deprecated in recent Python versions.
    regex = re.compile(r"(?<!\d)[\!\?.,;:-](?!\d)")
    return [regex.sub("", doc.lower()) for doc in corpus]


In [None]:
# Cleaned category names; `corpus` is the input of the vectorizer,
# co-occurrence and spaCy cells below.
corpus = preprocessCorpus(C)
print(corpus)

['altatheism', 'compgraphics', 'composmswindowsmisc', 'compsysibmpchardware', 'compsysmachardware', 'compwindowsx', 'miscforsale', 'recautos', 'recmotorcycles', 'recsportbaseball', 'recsporthockey', 'scicrypt', 'scielectronics', 'scimed', 'scispace', 'socreligionchristian', 'talkpoliticsguns', 'talkpoliticsmideast', 'talkpoliticsmisc', 'talkreligionmisc']


In [None]:
# Count vectors: fit a CountVectorizer on `corpus`, lay the counts out as a
# table with one column per learned term, and dump the table to 20News_02.txt.
vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=doc_term_matrix.toarray(), columns=terms)
with open("20News_02.txt", "w") as f:
    f.write(df.to_string() + "\n")

#TF-IDF

In [None]:
# TF-IDF: re-weight the counts held in doc_term_matrix and dump the resulting
# table (one column per entry of `terms`) to 20News_03.txt.
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(doc_term_matrix)
# toarray() instead of the `.A` shorthand: it matches the count-vector cell
# and works on both scipy sparse matrices and sparse arrays.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=terms)
with open("20News_03.txt", "w") as f:
    print(df.to_string(), file=f)

#n-grams (2-grams)

In [None]:
# 2-gram counts: same pipeline as the count-vector cell, but the features are
# pairs of consecutive words. The table is dumped to 20News_04.txt.
# This rebinds `vectorizer` and `doc_term_matrix`; cells that run after this
# one see the 2-gram matrix, not the single-word one.
# NOTE(review): the corpus printed earlier holds one fused token per document
# (e.g. 'altatheism'), so no word pair exists; CountVectorizer is expected to
# raise an "empty vocabulary" ValueError here -- confirm and, if so, feed
# this cell a corpus whose words are still separated.
vectorizer = CountVectorizer(ngram_range=(2, 2))
doc_term_matrix = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names_out()
# toarray() instead of the `.A` shorthand: it matches the count-vector cell
# and works on both scipy sparse matrices and sparse arrays.
df = pd.DataFrame(doc_term_matrix.toarray(), columns=vocabulary)
with open("20News_04.txt", 'w') as f:
    f.write(df.to_string())

#Co-occurrence Vectors (Context Window = 1)

In [None]:
# Term-by-term co-occurrence counts.
# doc_term_matrix comes from CountVectorizer.fit_transform, i.e. it has one
# row per document and one column per term. X.T @ X is therefore
# (terms x terms): entry (i, j) adds up, over all documents, the product of
# the counts of term i and term j. The operand order matters: X @ X.T would
# produce a (documents x documents) overlap matrix instead.
# `@` is used because it always means matrix product, whereas `*` is
# element-wise on scipy sparse arrays.
co_occurrence_matrix = doc_term_matrix.T @ doc_term_matrix
print(co_occurrence_matrix.todense())

[[4 3 2]
 [3 3 1]
 [2 1 5]]


In [None]:
# Row-normalise the co-occurrence matrix: every row is divided by its own
# diagonal entry. `sp` is scipy.sparse (imported at the top of the file).
diagonal = co_occurrence_matrix.diagonal().astype(float)
# A zero diagonal entry gets scale factor 0 instead of 1/0 = inf, so the
# corresponding row stays all-zero rather than turning into inf/nan.
inverse_diagonal = np.divide(
    1.0, diagonal, out=np.zeros_like(diagonal), where=diagonal != 0
)
g = sp.diags(inverse_diagonal)
# Left-multiplying by a diagonal matrix scales row i by g[i, i].
co_occurrence_matrix_norm = g @ co_occurrence_matrix

In [None]:
def co_ocurrence(sentences, window_size):
    """Count how often two words appear close to each other.

    Each sentence is lowercased and split on whitespace. Every token is
    paired with the up-to-`window_size` tokens that follow it, and each such
    pair is counted once, regardless of the order of the two words.

    Args:
        sentences: Iterable of strings.
        window_size: How many following tokens are paired with each token.

    Returns:
        A square, symmetric pandas DataFrame of int16 counts whose rows and
        columns are both labelled with the sorted vocabulary.
    """
    pair_counts = defaultdict(int)
    seen = set()

    for sentence in sentences:
        tokens = sentence.lower().split()
        seen.update(tokens)
        for pos, current in enumerate(tokens):
            for neighbour in tokens[pos + 1:pos + window_size + 1]:
                # Sorting the pair makes (a, b) and (b, a) share one counter.
                pair_counts[tuple(sorted((neighbour, current)))] += 1

    labels = sorted(seen)
    size = len(labels)
    table = pd.DataFrame(data=np.zeros((size, size), dtype=np.int16),
                         index=labels,
                         columns=labels)

    # Mirror every count across the diagonal to keep the table symmetric.
    for (first, second), count in pair_counts.items():
        table.at[first, second] = count
        table.at[second, first] = count

    return table

In [None]:
# Window-based co-occurrence table for `corpus`; head() previews the first rows.
# NOTE(review): the section heading says "Context Window = 1" but the call
# passes window_size=2 -- confirm which one is intended.
df=co_ocurrence(corpus,2)
df.head()

Unnamed: 0,altatheism,compgraphics,composmswindowsmisc,compsysibmpchardware,compsysmachardware,compwindowsx,miscforsale,recautos,recmotorcycles,recsportbaseball,recsporthockey,scicrypt,scielectronics,scimed,scispace,socreligionchristian,talkpoliticsguns,talkpoliticsmideast,talkpoliticsmisc,talkreligionmisc
altatheism,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
compgraphics,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
composmswindowsmisc,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
compsysibmpchardware,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
compsysmachardware,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Dump the table currently bound to `df` (the co-occurrence table, as the
# cells are ordered in this file) to 20News_05.txt.
with open("20News_05.txt", 'w') as f:
    f.write(df.to_string())

#Word2Vec

In [None]:
# One dense vector per document of `corpus`, taken from the spaCy pipeline's
# Doc.vector, written to 20News_06.txt as one line of space-separated numbers
# per document.
# NOTE(review): the section is titled "Word2Vec", but the vectors come from
# spaCy; the gensim Word2Vec import is not used in the visible code --
# confirm which embedding is intended.
nlp = spacy.load('en_core_web_sm')
bag_vector = [nlp(sentence).vector for sentence in corpus]
with open('20News_06.txt', 'w') as f:
    for vec in bag_vector:
        vec_str = ' '.join(str(component) for component in vec)
        f.write(vec_str + "\n")