In [1]:
import pandas as pd
import numpy as np
import re, sys, json
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [15]:
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = stopwords.words('english')
stop_words_br = stopwords.words('portuguese')

[nltk_data] Downloading package punkt to /home/vagner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vagner/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [21]:
def pre_process_en(txt):
    txt = txt.lower() #All to lower
    txt = re.sub(r'\W+', ' ', txt) #Remove special chars
    tokens = word_tokenize(txt) #Tokenizing text
    tokens = [w for w in tokens if w not in stop_words] # Removing stopwords    
    tokens = [w for w in tokens if re.match(r'[a-z]+$', w) != None]
    
    txt = ' '.join(tokens)
    if len(txt) == 0:
        return None

    return txt

def pre_process_br(txt):
    txt = txt.lower() #All to lower
    txt = re.sub(r'\W+', ' ', txt) #Remove special chars
    tokens = word_tokenize(txt) #Tokenizing text
    tokens = [w for w in tokens if w not in stop_words_br] # Removing stopwords    
    tokens = [w for w in tokens if re.match(r'[a-z]+$', w) != None]
    
    txt = ' '.join(tokens)
    if len(txt) == 0:
        return None

    return txt

def train_test_val_split(df):
    train_df, temp_df = train_test_split(df, test_size=0.4, random_state=666)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=666)
    return train_df, val_df, test_df

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)   

In [6]:
# Loading
data_dir = '../preprocessed_datasets'
dataset_dir = f'{data_dir}/hotels_reviews'

df = pd.read_csv(f'{dataset_dir}/Hotel_Reviews.csv')
df_train, df_val, df_test = train_test_val_split(df)
df_train['partition'] = 'train'
df_val['partition'] = 'val'
df_test['partition'] = 'test'

# Combining
combined_df = pd.concat([df_train, df_val, df_test])[['reviews.text', 'partition', 'reviews.rating']].dropna()
combined_df['txt'] = combined_df['reviews.text'].apply(pre_process_en)
combined_df['label'] = combined_df['reviews.rating'].apply(int)
corpus_df = combined_df[['txt', 'partition', 'label']].dropna().reset_index(drop=True)
labels = sorted(combined_df['label'].unique())

#Saving
corpus_df.to_csv(f'{dataset_dir}/corpus.tsv', sep='\t', index=False, header=False)
vocab = sorted({w for s in corpus_df.txt for w in s.split(' ')})
open(f'{dataset_dir}/vocabulary.txt', 'w+').write('\n'.join(vocab))

metadata = {
    "total_documents": len(corpus_df),
    "vocabulary_length": len(vocab),
    "preprocessing-info": [],
    "labels": labels,
    "total_labels": len(labels),
    "last-training-doc": np.where(corpus_df.partition == 'train')[0][-1],
    "last-validation-doc": np.where(corpus_df.partition == 'val')[0][-1]
}
meta_raw = json.dumps(metadata, cls=NpEncoder, indent=4)
open(f'{dataset_dir}/metadata.json', 'w+').write(meta_raw)

328

In [24]:
# Loading
data_dir = '../preprocessed_datasets'
dataset_dir = f'{data_dir}/BRNews'

df = pd.read_csv(f'{dataset_dir}/pre-processed.csv').reset_index(drop=True)[:5000]
df

Unnamed: 0,index,label,preprocessed_news
0,0,fake,katia abreu diz vai colocar expulsao moldura n...
1,1,fake,ray peita bolsonaro conservador fake entrevist...
2,2,fake,reinaldo azevedo desmascarado policia federal ...
3,3,fake,relatorio assustador bndes mostra dinheiro pub...
4,4,fake,radialista americano fala sobre pt vendem ilus...
...,...,...,...
4995,4995,true,cao acha drogas dentro fogao durante megaopera...
4996,4996,true,determinacao diminuicao vazao represa caconde ...
4997,4997,true,reforma previdencia governo pode suspender abo...
4998,4998,true,zuckerberg diz facebook vai atuar garantir int...


In [25]:
df_train, df_val, df_test = train_test_val_split(df)
df_train['partition'] = 'train'
df_val['partition'] = 'val'
df_test['partition'] = 'test'

# Combining
combined_df = pd.concat([df_train, df_val, df_test])[['preprocessed_news', 'partition', 'label']].dropna()
combined_df['txt'] = combined_df['preprocessed_news'].apply(pre_process)
#combined_df['label'] = combined_df['reviews.rating'].apply(int)
corpus_df = combined_df[['txt', 'partition', 'label']].dropna().reset_index(drop=True)
labels = sorted(combined_df['label'].unique())
corpus_df

Unnamed: 0,txt,partition,label
0,gol pagara r mi acidente transformou area casa...,train,true
1,apos demitida globo atriz desabafa golpe pense...,train,fake
2,suposto americano diz michael jackson assassin...,train,fake
3,completar quatro anos operacao lava jato curit...,train,true
4,presidente senado renan calheiros afirmou nest...,train,true
...,...,...,...
4995,farsa lewandowski ja sabia fatiamento impeachm...,test,fake
4996,imoveis senadores aecio zeze perrella bh sao a...,test,true
4997,dilma nega recebeu r milhoes odebrecht diz vai...,test,fake
4998,juiz federal sergio moro operacao lava jato se...,test,true


In [26]:
#Saving
corpus_df.to_csv(f'{dataset_dir}/corpus.tsv', sep='\t', index=False, header=False)
vocab = sorted({w for s in corpus_df.txt for w in s.split(' ')})
open(f'{dataset_dir}/vocabulary.txt', 'w+').write('\n'.join(vocab))

metadata = {
    "total_documents": len(corpus_df),
    "vocabulary_length": len(vocab),
    "preprocessing-info": [],
    "labels": labels,
    "total_labels": len(labels),
    "last-training-doc": np.where(corpus_df.partition == 'train')[0][-1],
    "last-validation-doc": np.where(corpus_df.partition == 'val')[0][-1]
}
meta_raw = json.dumps(metadata, cls=NpEncoder, indent=4)
open(f'{dataset_dir}/metadata.json', 'w+').write(meta_raw)

234