In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three ';'-separated splits of the first dataset.
df_train = pd.read_csv("dataset/train.csv", sep=";")
df_test = pd.read_csv("dataset/test.csv", sep=";")
df_eval = pd.read_csv("dataset/evaluation.csv", sep=";")

# Stack train, evaluation and test (in that order), drop every row that has a
# missing value in any column, then keep the "title" column as a NumPy array.
titles_first = np.array(
    pd.concat([df_train, df_eval, df_test])
    .dropna()
    .loc[:, "title"]
    .tolist()
)

In [3]:
# True.csv and Fake.csv are read with pandas' default separator.
df_true = pd.read_csv("dataset/True.csv")
df_fake = pd.read_csv("dataset/Fake.csv")

# Stack both frames (True first), drop every row that has a missing value in
# any column, then keep the "title" column as a NumPy array.
titles_second = np.array(
    pd.concat([df_true, df_fake])
    .dropna()
    .loc[:, "title"]
    .tolist()
)

In [4]:
# Join both title arrays end to end; titles_first comes before titles_second.
titles = np.concatenate([titles_first, titles_second])

In [5]:
# Display how many titles were collected after concatenating both arrays.
titles.shape

(85485,)

In [6]:
# Whitelist of characters that filter_chars keeps, grouped by kind.
# "<pad>" is a multi-character entry stored alongside the single characters.
filter_vocab = (
    set(" !\"#$%&'()*+,-./")
    | set("0123456789")
    | set(":;=?@")
    | set("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    | set("[]_")
    | set("abcdefghijklmnopqrstuvwxyz")
    | set("{}“”")
    | {"<pad>"}
)

In [7]:
def filter_chars(text: str) -> str:
    """Return ``text`` with every character not found in ``filter_vocab`` removed.

    The surviving characters keep their original order. ``filter_vocab`` is
    read from the notebook's global namespace at call time.
    """
    kept = (ch for ch in text if ch in filter_vocab)
    return "".join(kept)

In [8]:
# Strip non-whitelisted characters from every title, then show the array shape.
titles = np.array(list(map(filter_chars, titles)))
titles.shape

(85485,)

# Title filtering

In [9]:
import statistics

# Average number of characters per title.
statistics.mean(len(title) for title in titles)

77.67744048663508

In [10]:
# Minimum length, in characters, for a title to be treated as "long" by is_long_text.
TITLE_LEN: int = 100

In [11]:
def is_long_text(text: str) -> bool:
    """Tell whether ``text`` has at least ``TITLE_LEN`` characters."""
    return TITLE_LEN <= len(text)
# Element-wise variant that can be called on a whole array of strings.
is_long = np.vectorize(is_long_text)

In [12]:
# is_long(titles) yields one boolean per title; use it as a mask to keep only
# the titles with at least TITLE_LEN characters, then display how many remain.
long_titles = titles[is_long(titles)]
long_titles.shape

(11402,)

In [13]:
# Clear the interactive namespace without asking for confirmation (-f), so
# everything defined above (imports, data, helpers) must be re-created below.
%reset -f

# Text filtering

In [14]:
import pandas as pd
import numpy as np

In [15]:
# Read the three ';'-separated splits of the first dataset.
df_train = pd.read_csv("dataset/train.csv", sep=";")
df_test = pd.read_csv("dataset/test.csv", sep=";")
df_eval = pd.read_csv("dataset/evaluation.csv", sep=";")

# Stack train, evaluation and test (in that order), drop every row that has a
# missing value in any column, then keep the "text" column as a NumPy array.
text_first = np.array(
    pd.concat([df_train, df_eval, df_test])
    .dropna()
    .loc[:, "text"]
    .tolist()
)

In [16]:
def filter_chars(text: str) -> str:
    """Return ``text`` with every character not found in ``filter_vocab`` removed.

    The surviving characters keep their original order. ``filter_vocab`` is
    read from the notebook's global namespace at call time, so it only has to
    exist before this function is first called.
    """
    kept = (ch for ch in text if ch in filter_vocab)
    return "".join(kept)

In [17]:
import statistics

# Average number of characters per text.
statistics.mean(len(body) for body in text_first)

2482.4657402616604

In [18]:
# Character budget that shrink_text uses when cutting a text down to whole words.
MAX_TEXT_LEN: int = 150

In [19]:
def shrink_text(text: str, max_len: "int | None" = None) -> str:
    """Cut ``text`` down to whole words that fit within a character budget.

    Words are the pieces obtained by splitting on single spaces. They are
    accumulated until adding the next one would exceed the budget; the
    accumulated words are then returned with a trailing "." appended.

    Args:
        text: The text to shorten.
        max_len: Character budget. When omitted, the global ``MAX_TEXT_LEN``
            is used, which keeps one-argument calls working as before.

    Returns:
        The truncated text followed by ".", or, when the whole text already
        fits the budget, the text itself with surrounding whitespace stripped.
        The result is always shorter than the budget.
    """
    limit = MAX_TEXT_LEN if max_len is None else max_len
    retVal = ""
    for wrd in text.split(" "):
        # retVal carries a leading space per word; strip() removes it on return.
        if len(retVal + " " + wrd) > limit - 1:
            return retVal.strip() + "."
        retVal += " " + wrd
    # The loop ran out of words: the text fits. Previously this path fell
    # through and returned None, which np.vectorize stored as None / 'None'.
    return retVal.strip()
shrink = np.vectorize(shrink_text)

In [20]:
# Whitelist of characters that filter_chars keeps, grouped by kind.
# "<pad>" is a multi-character entry stored alongside the single characters.
filter_vocab = (
    set(" !\"#$%&'()*+,-./")
    | set("0123456789")
    | set(":;=?@")
    | set("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    | set("[]_")
    | set("abcdefghijklmnopqrstuvwxyz")
    | set("{}“”")
    | {"<pad>"}
)

In [21]:
# Apply shrink_text element-wise to every entry of text_first.
text_first_shrink = shrink(text_first)

In [22]:
# Strip non-whitelisted characters from every shortened text, then show the array shape.
texts_filtered = np.array(list(map(filter_chars, text_first_shrink)))
texts_filtered.shape

(40587,)

In [23]:
# Persist only the first 30000 filtered texts as a .npy file.
np.save("dataset/text_shrink.npy", texts_filtered[:30000])

# Text filtering (other dataset)

In [2]:
import pandas as pd
import numpy as np

In [4]:
# True.csv and Fake.csv are read with pandas' default separator.
df_true = pd.read_csv("dataset/True.csv")
df_fake = pd.read_csv("dataset/Fake.csv")

# Stack both frames (True first), drop every row that has a missing value in
# any column, then keep the "text" column as a NumPy array.
text_first = np.array(
    pd.concat([df_true, df_fake])
    .dropna()
    .loc[:, "text"]
    .tolist()
)

In [5]:
def filter_chars(text: str) -> str:
    """Return ``text`` with every character not found in ``filter_vocab`` removed.

    The surviving characters keep their original order. ``filter_vocab`` is
    read from the notebook's global namespace at call time, so it only has to
    exist before this function is first called.
    """
    kept = (ch for ch in text if ch in filter_vocab)
    return "".join(kept)

In [6]:
import statistics

# Average number of characters per text.
statistics.mean(len(body) for body in text_first)

2469.1096930820972

In [7]:
# Character budget that shrink_text uses when cutting a text down to whole words.
MAX_TEXT_LEN: int = 150

In [8]:
def shrink_text(text: str, max_len: "int | None" = None) -> str:
    """Cut ``text`` down to whole words that fit within a character budget.

    Words are the pieces obtained by splitting on single spaces. They are
    accumulated until adding the next one would exceed the budget; the
    accumulated words are then returned with a trailing "." appended.

    Args:
        text: The text to shorten.
        max_len: Character budget. When omitted, the global ``MAX_TEXT_LEN``
            is used, which keeps one-argument calls working as before.

    Returns:
        The truncated text followed by ".", or, when the whole text already
        fits the budget, the text itself with surrounding whitespace stripped.
        The result is always shorter than the budget.
    """
    limit = MAX_TEXT_LEN if max_len is None else max_len
    retVal = ""
    for wrd in text.split(" "):
        # retVal carries a leading space per word; strip() removes it on return.
        if len(retVal + " " + wrd) > limit - 1:
            return retVal.strip() + "."
        retVal += " " + wrd
    # The loop ran out of words: the text fits. Previously this path fell
    # through and returned None, which np.vectorize stored as None / 'None'.
    return retVal.strip()
shrink = np.vectorize(shrink_text)

In [9]:
# Whitelist of characters that filter_chars keeps, grouped by kind.
# "<pad>" is a multi-character entry stored alongside the single characters.
filter_vocab = (
    set(" !\"#$%&'()*+,-./")
    | set("0123456789")
    | set(":;=?@")
    | set("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    | set("[]_")
    | set("abcdefghijklmnopqrstuvwxyz")
    | set("{}“”")
    | {"<pad>"}
)

In [10]:
# Apply shrink_text element-wise to every entry of text_first.
text_first_shrink = shrink(text_first)

In [11]:
# Strip non-whitelisted characters from every shortened text, then show the array shape.
texts_filtered = np.array(list(map(filter_chars, text_first_shrink)))
texts_filtered.shape

(44898,)

In [12]:
# Persist only the first 30000 filtered texts as a .npy file.
np.save("dataset/text_shrink_other.npy", texts_filtered[:30000])