In [62]:
import pandas as pd
import numpy as np
import re, sys, json
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Spellchecker e stopwords

In [63]:
from spellchecker import SpellChecker

spell_br = SpellChecker(language='pt', distance=1)
spell_en = SpellChecker(distance=1)

def spell_check(tokens, spell):
    return list(filter(None, [spell.correction(t) for t in tokens]))

In [64]:
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = stopwords.words('english')
stop_words_br = stopwords.words('portuguese')

[nltk_data] Downloading package punkt to /home/vagner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vagner/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Funcoes de pre-processamento para diferentes idiomas

In [90]:
def pre_process_en(txt):
    txt = txt.lower() #All to lower
    txt = re.sub(r'\W+', ' ', txt) #Remove special chars
    #tokens = word_tokenize(txt) #Tokenizing text
    tokens = txt.split(' ')
    tokens = [w for w in tokens if w not in stop_words] # Removing stopwords    
    tokens = [w for w in tokens if re.match(r'[a-z]+$', w) != None]
    tokens = spell_check(tokens, spell_en)
    
    txt = ' '.join(tokens)
    #txt = txt.replace("don't", "no")
    #txt = txt.replace("'", ' ')
    if len(txt) == 0:
        return None

    return txt

def pre_process_br(txt):
    txt = txt.lower() #All to lower
    txt = re.sub(r'\W+', ' ', txt) #Remove special chars
    tokens = word_tokenize(txt) #Tokenizing text
    tokens = [w for w in tokens if w not in stop_words_br] # Removing stopwords    
    tokens = [w for w in tokens if re.match(r'[a-z]+$', w) != None]
    tokens = spell_check(tokens, spell_br)
    
    txt = ' '.join(tokens)
    if len(txt) == 0:
        return None

    return txt

def train_test_val_split(df):
    train_df, temp_df = train_test_split(df, test_size=0.4, random_state=666)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=666)
    return train_df, val_df, test_df

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)   

# Pre-Processamento Dataset Reviews de Hoteis

In [93]:
# Loading
print('loading data')

data_dir = '../preprocessed_datasets'
dataset_dir = f'{data_dir}/hotels_reviews'

df = pd.read_csv(f'{dataset_dir}/Hotel_Reviews.csv')
df_train, df_val, df_test = train_test_val_split(df)
df_train['partition'] = 'train'
df_val['partition'] = 'val'
df_test['partition'] = 'test'

print('combining data')

# Combining
combined_df = pd.concat([df_train, df_val, df_test])[['reviews.text', 'partition', 'reviews.rating']].dropna()
combined_df['txt'] = combined_df['reviews.text'].apply(pre_process_en)
combined_df['label'] = combined_df['reviews.rating'].apply(int)
corpus_df = combined_df[['txt', 'partition', 'label']].dropna().reset_index(drop=True)
labels = sorted(combined_df['label'].unique())

print('saving data')

#Saving
corpus_df.to_csv(f'{dataset_dir}/corpus.tsv', sep='\t', index=False, header=False)
vocab = sorted({w for s in corpus_df.txt for w in s.split(' ')})
open(f'{dataset_dir}/vocabulary.txt', 'w+').write('\n'.join(vocab))

metadata = {
    "total_documents": len(corpus_df),
    "vocabulary_length": len(vocab),
    "preprocessing-info": [],
    "labels": labels,
    "total_labels": len(labels),
    "last-training-doc": np.where(corpus_df.partition == 'train')[0][-1],
    "last-validation-doc": np.where(corpus_df.partition == 'val')[0][-1]
}
meta_raw = json.dumps(metadata, cls=NpEncoder, indent=4)
open(f'{dataset_dir}/metadata.json', 'w+').write(meta_raw)

loading data
combining data
saving data


328

# Pre-Processamento Dataset FakeNewsBR

In [54]:
# Loading
data_dir = '../preprocessed_datasets'
dataset_dir = f'{data_dir}/BRNews'

df = pd.read_csv(f'{dataset_dir}/pre-processed.csv').reset_index(drop=True)
df

Unnamed: 0,index,label,preprocessed_news
0,0,fake,katia abreu diz vai colocar expulsao moldura n...
1,1,fake,ray peita bolsonaro conservador fake entrevist...
2,2,fake,reinaldo azevedo desmascarado policia federal ...
3,3,fake,relatorio assustador bndes mostra dinheiro pub...
4,4,fake,radialista americano fala sobre pt vendem ilus...
...,...,...,...
7195,7195,true,jornal britanico acao contra lula lava jato se...
7196,7196,true,temer diz acionou pf cade investigar aumentos ...
7197,7197,true,obstaculos politicos temer especialistas ouvid...
7198,7198,true,setembro boa noite aqui estao principais notic...


In [57]:
df_train, df_val, df_test = train_test_val_split(df)
df_train['partition'] = 'train'
df_val['partition'] = 'val'
df_test['partition'] = 'test'

# Combining
combined_df = pd.concat([df_train, df_val, df_test])[['preprocessed_news', 'partition', 'label']].dropna()
combined_df['txt'] = combined_df['preprocessed_news'].apply(pre_process_br)
#combined_df['label'] = combined_df['reviews.rating'].apply(int)
corpus_df = combined_df[['txt', 'partition', 'label']].dropna().reset_index(drop=True)
labels = sorted(combined_df['label'].unique())
corpus_df

Unnamed: 0,txt,partition,label
0,antagonista dilema usa agência estatal sustent...,train,fake
1,fausto acedo brando fatio julga governador acr...,train,true
2,dia teatro acessível celebrado vez pais tanta ...,train,true
3,anunciam única categoria posicionou contra lad...,train,fake
4,medico afirma lula síndrome amnésia entra pedi...,train,fake
...,...,...,...
7195,frente frente juiz moro intima lula depor próx...,test,fake
7196,lava jato completa quatro anos sentenças serio...,test,true
7197,faria magistrada rosário anula decisão juiz fe...,test,fake
7198,apresentadora na sofre ataque hotel escapa a o...,test,fake


In [58]:
#Saving
corpus_df.to_csv(f'{dataset_dir}/corpus.tsv', sep='\t', index=False, header=False)
vocab = sorted({w for s in corpus_df.txt for w in word_tokenize(s)})
open(f'{dataset_dir}/vocabulary.txt', 'w+').write('\n'.join(vocab))

metadata = {
    "total_documents": len(corpus_df),
    "vocabulary_length": len(vocab),
    "preprocessing-info": [],
    "labels": labels,
    "total_labels": len(labels),
    "last-training-doc": np.where(corpus_df.partition == 'train')[0][-1],
    "last-validation-doc": np.where(corpus_df.partition == 'val')[0][-1]
}
meta_raw = json.dumps(metadata, cls=NpEncoder, indent=4)
open(f'{dataset_dir}/metadata.json', 'w+').write(meta_raw)

234