# 1) Importar bibliotecas

In [81]:
# Separar seções do dataset para treinar os modelos
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Obter uma representação vetorial a partir de um texto
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Redução de dimensionalidade
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Mostrar árvore de decisões
from sklearn.tree import plot_tree

# Mostrar visualmente a matriz de confusão
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Modelos a serem treinados
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [82]:
import matplotlib.pyplot as plt
import pandas as pd

# Gerar a imagem contendo as palavras mais frequentes
from wordcloud import WordCloud

# Operações para baixar o dataset
from zipfile import ZipFile
from os import remove

# Filtrar stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from string import punctuation
# Fetch the NLTK resources this notebook needs: the stopword lists and the
# Punkt tokenizer models used by word_tokenize. Recent NLTK releases load the
# tokenizer from 'punkt_tab' instead of the pickled 'punkt' package, so both
# are requested to keep the notebook runnable across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
	download(nltk_resource)

# JSON
from json import dumps, loads

# Arquivos
from os import system
from os.path import isfile, exists

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Paulo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Paulo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 2) Criar funções auxiliares

## 2.1) Filtrar stopwords

In [83]:
# English stopwords plus every ASCII punctuation character (extend() adds the
# characters of `punctuation` one by one).
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(punctuation)

# Hashed copy used only for membership tests. STOPWORDS itself stays a list
# because it is also passed to TfidfVectorizer(stop_words=...).
_STOPWORDS_LOOKUP = frozenset(STOPWORDS)

# TODO: apply NER (named-entity recognition)

def filter_stopwords(word):
	"""Lowercase a text and drop stopword and punctuation tokens.

	Args:
		word: Text to clean. Despite the name it is tokenized with
			word_tokenize, so it may be a whole sentence or document.

	Returns:
		The remaining tokens, lowercased and joined by single spaces.
	"""
	# Lowercase each token once, then filter with an O(1) set lookup instead
	# of scanning the STOPWORDS list for every token.
	lowered_tokens = (token.lower() for token in word_tokenize(word))
	return ' '.join(token for token in lowered_tokens if token not in _STOPWORDS_LOOKUP)

## 2.2) Representação Vetorial

In [84]:
# Shared TF-IDF vectorizer. TFIDF() calls fit_transform on it, so it is
# re-fitted on every call.
vectorizer = TfidfVectorizer(
  lowercase=True,
  # max_features=50,
  stop_words=STOPWORDS
)

def TFIDF(word_list):
  """Build a dense TF-IDF table for a collection of texts.

  Args:
    word_list: Iterable of text documents.

  Returns:
    A DataFrame with one row per document and one column per vocabulary
    term, holding the TF-IDF weights.
  """
  weights = vectorizer.fit_transform(word_list)
  vocabulary = vectorizer.get_feature_names_out()
  return pd.DataFrame(weights.toarray(), columns=vocabulary)

In [85]:
def BERT(word_list):
	"""Placeholder for a BERT-based representation.

	For now this only returns a copy of the input, obtained through the
	input's own ``copy()`` method; no embedding is computed.
	"""
	# TODO: import and apply BERT to the copied data (tensorflow)
	return word_list.copy()

In [86]:
# Shared bag-of-words vectorizer. BAG_OF_WORDS() calls fit_transform on it,
# so it is re-fitted on every call.
count_vectorizer = CountVectorizer(lowercase=True)

def BAG_OF_WORDS(word_list: pd.Series):
	"""Build a dense term-count table for a collection of texts.

	Args:
		word_list: Series of text documents.

	Returns:
		A DataFrame with one row per document and one column per vocabulary
		term, holding the term counts.
	"""
	counts = count_vectorizer.fit_transform(word_list)
	vocabulary = count_vectorizer.get_feature_names_out()
	return pd.DataFrame(counts.toarray(), columns=vocabulary)

	
# BAG_OF_WORDS(df_dados['title'].head(5) + df_dados['title'].head(5))

## 2.3) Wordcloud

In [87]:
# Shared word-cloud renderer: 800x800 canvas on a white background, with
# collocations disabled.
wordcloud = WordCloud(
	# stopwords=STOPWORDS,
	width=800,
	height=800,
	background_color='white',
	collocations=False
)

def show_wordcloud(wordlist):
	"""Render a word cloud for a collection of texts.

	Args:
		wordlist: Iterable of strings; they are joined with single spaces
			into one text before the cloud is generated.
	"""
	cloud = wordcloud.generate_from_text(' '.join(wordlist))

	fig, ax = plt.subplots(figsize=(15, 15))
	ax.imshow(cloud)
	ax.axis('off')
	fig.tight_layout()
	plt.show()

# 3) Preparar dados

## 3.1) Importar/Extrair dataset do arquivo .zip do kaggle

In [88]:
# Download and unpack the Kaggle archive only when the CSV is not already here.
if not isfile('topic_classifier.csv'):
	try:
		# NOTE(review): this is a pre-signed URL (X-Goog-Date=20220426,
		# X-Goog-Expires=259199 seconds), so it has most likely expired and
		# must be regenerated from Kaggle before this branch can succeed.
		# --fail makes curl exit non-zero on an HTTP error instead of saving
		# the error page as out.zip.
		exit_status = system('''curl --silent --fail -o out.zip "https://storage.googleapis.com/kaggle-data-sets/1115257/1873557/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220426%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220426T174557Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=0953e29a79852b0e242947618441064722538d9bd68fe6e8594ca7a7cd04a61959bb9a2aa8063a11b3edf02fcd690cdcf8be20428fc62251eedcf7cfbf9be23b6acd553e6cb9c96726a1750590013717b8ce194276693310dd318ae558b83d210e0123cdf96a320bd47843c2a0056608f9c07be4d1db8e06acdf3c32dfe201ef0df9503cbc91f8a711e4172f1e3904e0afd5ddee490f622c5af2e0f7f4166bcb17a00adec125995b8cfc0bf957cb0ab61dab36d701e7d3b1745dbd69b856fd79e19b97ba9e4d26526c6e6ce764119eef4a1ef7df305af6ceeb46b2849fefeff7c3d4d928a4a984596cb3361ae31f80ceedc054a5506b25ad8698f9e1f3435075"''')
		if exit_status != 0:
			raise RuntimeError(f'Dataset download failed: curl exited with status {exit_status}')

		# The archive is handled in Python: os.system() runs shell commands,
		# so it cannot execute ZipFile/remove calls.
		with ZipFile('out.zip', mode='r') as archive:
			archive.extractall()
	finally:
		# Remove the temporary archive whether or not extraction succeeded.
		if exists('out.zip'):
			remove('out.zip')

In [89]:
dataset = pd.read_csv('topic_classifier.csv')
# Shuffle all rows (frac=1). The fixed seed keeps the row order identical
# across kernel restarts so later results can be reproduced.
df_dados = dataset.sample(frac=1, random_state=42)
df_dados

Unnamed: 0,title,c1body,Topic
2803,RT by @tweetsauce: If we took all the gold tha...,If we took all the gold that has ever been min...,Science
1249,Sentdex Channel Update,Long time no video! https://nnfs.io Channel me...,Software Development
3919,"""Having a brain is like having a maniac walk t...",[link] [comments],Sprituality
3899,My one and only gripe with meditation advice.....,"Everyone always says to meditate everyday, and...",Sprituality
3038,first time i tried calling him stupid joke nam...,[link] [comments],Entertainment
...,...,...,...
634,PS5 Users Find DualSense Hidden Features...,Clip from Lew Later (iPhone 13 Gets Thicker......,News
693,Live breathwork session 31 october,"Reconnect and recreate. A live movement, breat...",Wellness
5696,When Historical Fiction Is a Crime (2020),Comments,News
4176,Beautiful and Pure Human Goodness.,[link] [comments],Entertainment


## 3.2) Filtrar stopwords

In [90]:
# Clean both text columns. Missing values are replaced by an empty string
# first: str(NaN) is 'nan', which would otherwise survive the stopword filter
# and end up as a spurious token in the features.
for coluna in ('title', 'c1body'):
	df_dados[coluna] = df_dados[coluna].fillna('').astype(str).apply(filter_stopwords)

## 3.3) Separar variáveis de features e target

In [91]:
# Features: every column except the topic label.
X = df_dados.drop('Topic', axis='columns')
# Target: the topic label of each row.
y_true = df_dados['Topic']

## 3.4) Salvar notícias em arquivos separados

In [92]:
# Create the output folder only when it is missing, so re-running the cell
# does not emit a "already exists" error from mkdir.
if not exists('news'):
	system('mkdir news')

# Write one JSON file per topic, skipping topics that were already exported.
for topico in y_true.unique():
	caminho = f'news/{topico}.json'
	if exists(caminho):
		continue

	noticias = df_dados[df_dados['Topic'] == topico]

	with open(caminho, mode='w') as f:
		# NOTE(review): to_json() already returns a JSON string, so dumps()
		# encodes it a second time and readers must call loads() twice. Kept
		# unchanged to stay compatible with files that were already written.
		f.write(dumps(noticias.to_json()))

A subdirectory or file news already exists.


Exception: uga

# 4) Visualizações

## 4.1) Redução de dimensionalidade com representações vetoriais

In [None]:
# Topic names in sorted order; the plotting loop pairs noticias[i] with cores[i].
noticias = sorted(y_true.unique())
# Plotting colours, one per topic.
cores = [
	'red', 'orange', 'green', 'blue',
	'purple', 'gray', 'brown', 'cyan',
]

In [None]:
def representacao_vetorial(x, metodo='tfidf'):
	"""Turn a collection of texts into a numeric table.

	Args:
		x: Collection of text documents.
		metodo: 'tfidf' (default, the previous hard-coded behaviour) to use
			TFIDF(), or 'bow' to use BAG_OF_WORDS().

	Returns:
		The DataFrame produced by the selected vectorization helper.

	Raises:
		ValueError: If ``metodo`` is not one of the supported names.
	"""
	if metodo == 'tfidf':
		return TFIDF(x)
	if metodo == 'bow':
		return BAG_OF_WORDS(x)
	raise ValueError(f"metodo must be 'tfidf' or 'bow', got {metodo!r}")

def reducao_dimensionalidade(x, metodo='tsne', random_state=None):
	"""Project a feature table onto two dimensions.

	Args:
		x: Numeric feature table with one row per sample.
		metodo: 'tsne' (default, the previous hard-coded behaviour) or 'pca'.
		random_state: Optional seed forwarded to the reducer. The default
			None keeps the previous unseeded behaviour.

	Returns:
		The two-component result of the reducer's fit_transform.

	Raises:
		ValueError: If ``metodo`` is not one of the supported names.
	"""
	if metodo == 'tsne':
		redutor = TSNE(n_components=2, init='random', learning_rate='auto', random_state=random_state)
	elif metodo == 'pca':
		redutor = PCA(n_components=2, random_state=random_state)
	else:
		raise ValueError(f"metodo must be 'tsne' or 'pca', got {metodo!r}")
	return redutor.fit_transform(x)

In [None]:
# Vectorize and embed ALL titles together. Fitting the vectorizer and the
# reducer once per topic would give every topic its own vocabulary and its
# own 2-D coordinate system, so the coloured clouds could not be compared.
dados_representacao_vetorial = representacao_vetorial(df_dados['title'])
# dados_representacao_vetorial = representacao_vetorial(df_dados['c1body'])
# dados_representacao_vetorial = representacao_vetorial(df_dados['title'] + ' ' + df_dados['c1body'])

# Reduce dimensionality; the result has one 2-D row per row of df_dados.
dados_reducao_dimensionalidade = reducao_dimensionalidade(dados_representacao_vetorial)

# Topic labels as a plain array, so the masks below select rows by position
# (df_dados keeps its shuffled index, the embedding has none).
topicos = df_dados['Topic'].to_numpy()

plt.figure(figsize=(20, 10))

for i, noticia in enumerate(noticias):
	mascara = topicos == noticia
	plt.scatter(
		dados_reducao_dimensionalidade[mascara, 0],
		dados_reducao_dimensionalidade[mascara, 1],
		# Wrap around so more topics than colours does not raise IndexError.
		color=cores[i % len(cores)],
		alpha=0.5,
		label=noticia
	)

# The title names TF-IDF because representacao_vetorial() uses it by default.
plt.title('TSNE + TFIDF - Títulos')
plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.legend()
plt.show()

## 4.2) Distribuição dos dados por tópico

In [None]:
# Number of rows per topic, drawn as a bar chart.
y_true_plot_data = y_true.value_counts()
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(y_true_plot_data.index, y_true_plot_data.to_numpy())
plt.show()

# Sugestões
# SMOTE para oversampling
# 1 classificador para cada target
# Rodar vários modelos

## 4.3) Wordcloud

In [None]:
# Word clouds for titles, bodies, and both combined.
show_wordcloud(X['title'])
show_wordcloud(X['c1body'])
# Join title and body with a space; plain concatenation would fuse the last
# word of the title with the first word of the body.
show_wordcloud(X['title'] + ' ' + X['c1body'])

# 5) Separação de dados para treino/teste

In [None]:
# Vector representation of the titles, used as the model features below.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before the
# train/test split, so the held-out rows influence the vocabulary and term
# weights. Consider fitting on the training rows only.
df_representacao_vetorial = representacao_vetorial(X['title'])
# df_representacao_vetorial = representacao_vetorial(X['c1body'])
# df_representacao_vetorial = representacao_vetorial(X['title'] + X['c1body'])

In [None]:
# Hold out 30% of the rows for testing. The fixed seed makes the split, and
# therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
	df_representacao_vetorial,
	y_true,
	test_size=0.3,
	random_state=42
)

# Report the size of each partition.
for nome_particao, particao in (
	('X_train', X_train),
	('y_train', y_train),
	('X_test', X_test),
	('y_test', y_test),
):
	print(nome_particao, len(particao))

In [None]:
def model(nome='logistic_regression'):
	"""Create a fresh, unfitted classifier.

	Args:
		nome: 'logistic_regression' (default, the previous hard-coded
			behaviour), 'random_forest' or 'decision_tree'.

	Returns:
		A new estimator instance built with its default parameters.

	Raises:
		ValueError: If ``nome`` is not one of the supported names.
	"""
	if nome == 'logistic_regression':
		return LogisticRegression()
	if nome == 'random_forest':
		return RandomForestClassifier()
	if nome == 'decision_tree':
		return DecisionTreeClassifier()
	raise ValueError(
		"nome must be 'logistic_regression', 'random_forest' or "
		f"'decision_tree', got {nome!r}"
	)

## 5.1) Treino e teste previamente separados

In [None]:
# Build a fresh estimator and train it on the training partition.
m = model()
m.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted estimator on the held-out partition via its score() method.
score = m.score(X=X_test, y=y_test)
score

## 5.2) Cross-Validation

In [None]:
# 10-fold cross-validation of a fresh estimator over the full feature table.
scores = cross_val_score(
	estimator=model(),
	X=df_representacao_vetorial,
	y=y_true,
	cv=10
)
scores

KeyboardInterrupt: 

## 5.3) Matriz de confusão

In [None]:
# Out-of-fold predictions for every row, later compared against y_true.
y_pred = cross_val_predict(
	estimator=model(),
	X=df_representacao_vetorial,
	y=y_true
)

In [None]:
# Use one explicit, sorted label list for both the matrix and the tick labels.
# confusion_matrix orders rows/columns by sorted label, whereas
# y_true.unique() is in order of first appearance, so passing the latter as
# display_labels attaches the wrong topic names to the cells.
rotulos = sorted(y_true.unique())
cm = confusion_matrix(y_true, y_pred, labels=rotulos)
f = ConfusionMatrixDisplay(
	cm,
	display_labels=rotulos
)
fig, ax = plt.subplots(figsize=(15, 15))
f.plot(ax=ax, xticks_rotation='vertical')
plt.title('Regressão Logística - TFIDF - Títulos')
plt.show()