In [1]:
# Path of the CSV loaded by this notebook. Note that it is the "prepped" file,
# not ../liar_dataset/liar.csv (presumably the raw dataset it was derived from - TODO confirm).
DATASET_PATH = '../liar_dataset/liar prepped.csv'
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the prepped statements, using the CSV's 'id' column as the row index.
dataset = pd.read_csv(
    DATASET_PATH,
    index_col=['id'],
)
dataset

Unnamed: 0_level_0,label,prepped statement
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,build wall u mexico border take literally year
1,1,wisconsin pace double number layoff year
2,1,john mccain do nothing help vet
3,0,suzanne bonamici support plan cut choice medic...
4,1,ask reporter whether he center criminal scheme...
...,...,...
12804,0,first time decade import account less half oil...
12805,0,donald trump bankrupt company twice four time
12806,0,john mccain george bush absolutely plan univer...
12807,1,new poll show 62 percent support president pla...


In [4]:
# Number of rows per label value (class balance check).
labelColumn = dataset['label']
labelColumn.value_counts()

label
0    7155
1    5654
Name: count, dtype: int64

In [5]:
# Extract the preprocessed statements as an array; this is the vectorizer's input.
statementColumn = dataset['prepped statement']
corpus = statementColumn.values
corpus

array(['build wall u mexico border take literally year',
       'wisconsin pace double number layoff year',
       'john mccain do nothing help vet', ...,
       'john mccain george bush absolutely plan universal health care',
       'new poll show 62 percent support president plan reform health care mean let choose keep private insurance public health insurance plan',
       'one claim report vindicate new jersey gov chris christie bridge scandal conclusive'],
      dtype=object)

In [6]:
# Number of statements in the corpus.
corpus.shape[0]

12809

In [7]:
# Bag of Words: fit a CountVectorizer on the corpus and expose the counts as a
# DataFrame with one row per statement and one column per vocabulary token.
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(corpus)
# NOTE(review): the name `words` is rebound further down by
# `from nltk.corpus import words`; it is only used on the next line here.
words = vectorizer.get_feature_names_out()
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix).
# Reusing dataset.index keeps every row tied to its statement id, so the later
# column-wise concat with `dataset` aligns by id instead of relying on the ids
# happening to equal the row positions.
dfBow = pd.DataFrame(matrix.toarray(), columns=words, index=dataset.index)
dfBow

Unnamed: 0,00,000,000new,014,02,024,029,033,036,04,...,zimmerman,zinn,zip,zippo,zombie,zone,zoo,zuckerberg,zuckerbergs,ʺmore
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Keep only tokens that are purely alphabetic and at least three characters long
# (drops numbers, alphanumeric mixes and very short tokens).
# str.isalpha() is Unicode-aware, so a token such as 'ʺmore' still passes.
def _looksLikeWord(token):
    return len(token) >= 3 and token.isalpha()

wordColumns = list(filter(_looksLikeWord, dfBow.columns))
len(wordColumns)

9067

In [9]:
# Preview the bag-of-words table restricted to the word-like columns.
dfBow.loc[:, wordColumns]

Unnamed: 0,aaa,aaron,aarp,aba,abandon,abbas,abbott,abc,abdul,abedin,...,zimmerman,zinn,zip,zippo,zombie,zone,zoo,zuckerberg,zuckerbergs,ʺmore
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Attach the label to the filtered word counts and drop the raw text column.
# `dataset` is indexed by the CSV's 'id' column and pd.concat(axis=1) aligns on
# index labels, so the word counts (whose rows are in the same order as `corpus`)
# are explicitly relabelled with dataset's index before concatenating.
# NOTE(review): a vocabulary token named 'label' would collide with the target
# column here - verify that no such token exists.
bowWordCounts = dfBow[wordColumns].set_axis(dataset.index, axis=0)
result = pd.concat([dataset, bowWordCounts], axis=1).drop(columns=['prepped statement'])
result

Unnamed: 0,label,aaa,aaron,aarp,aba,abandon,abbas,abbott,abc,abdul,...,zimmerman,zinn,zip,zippo,zombie,zone,zoo,zuckerberg,zuckerbergs,ʺmore
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Persist the labelled bag-of-words table; the index is written under the header 'id'.
result.to_csv('../liar_dataset/liar BoW.csv', index_label='id')

In [49]:
# Reload the saved table. NOTE: this rebinds `dfBow` - from here on it is the
# labelled table written above (first column 'label'), not the raw vectorizer output.
dfBow = pd.read_csv('../liar_dataset/liar BoW.csv', index_col='id')
dfBow

Unnamed: 0_level_0,label,aaa,aaron,aarp,aba,abandon,abbas,abbott,abc,abdul,...,zimmerman,zinn,zip,zippo,zombie,zone,zoo,zuckerberg,zuckerbergs,ʺmore
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Every column after the first one ('label') is a word-count column.
allColumnNames = dfBow.columns.values
wordCols = allColumnNames[1:]
wordCols

array(['aaa', 'aaron', 'aarp', ..., 'zuckerberg', 'zuckerbergs', 'ʺmore'],
      dtype=object)

In [15]:
# Total occurrences of each word across all statements (column-wise sum).
countsPerStatement = dfBow[wordCols]
wordFrequency = countsPerStatement.sum(axis='index')
print(wordFrequency)

aaa             4
aaron           2
aarp            4
aba             1
abandon         4
               ..
zone           12
zoo             1
zuckerberg      1
zuckerbergs     1
ʺmore           1
Length: 9067, dtype: int64


In [16]:
# Average corpus-wide frequency per word column.
wordFrequency.mean()

14.395500165435095

In [40]:
# Fetch NLTK's 'words' corpus, which is used below as the English dictionary.
# The call logs a message when the package is already up to date.
import nltk
nltk.download('words')

[nltk_data] Downloading package words to C:\Users\Ștefan
[nltk_data]     Vlădescu\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [25]:
# Columns to be dropped:
#   * words that are not in NLTK's English word list
#   * rare words, i.e. words whose total frequency in the corpus is exactly 1
from nltk.corpus import words
# Build the dictionary once as a set. Calling words.words() inside the
# comprehension evaluates it again for every column, and membership in the
# returned list is a linear scan; a set gives the same result with O(1) lookups.
# NOTE(review): the lookup is case-sensitive - confirm that lower-cased tokens
# are meant to miss capitalised dictionary entries.
englishVocabulary = set(words.words())
notRealWords = [word for word in wordFrequency.index.tolist() if word not in englishVocabulary]
print(len(notRealWords))
rareWords = wordFrequency.index[wordFrequency == 1].tolist()
print(len(rareWords))

2757
3734


In [26]:
# Combine both lists; a word that is both rare and not in the dictionary appears twice here.
colsToDrop = [*notRealWords, *rareWords]
print(len(colsToDrop))

6491


In [28]:
# Deduplicate the column names so each one is listed once.
colsToDrop = {*colsToDrop}
print(len(colsToDrop))

4884


By removing rare words (frequency 1) and words that are not in the NLTK English word list, we are down to 4184 columns (the label plus 4183 word features), from over 12k.

In [50]:
# Remove the rare / non-dictionary columns. A new frame is assigned instead of
# mutating in place, and errors='ignore' lets this cell be re-run after the
# columns are already gone (the in-place drop raises KeyError the second time).
# Every name in colsToDrop comes from this frame's own columns, so nothing is
# silently skipped on the first run.
dfBow = dfBow.drop(columns=list(colsToDrop), errors='ignore')
dfBow

Unnamed: 0_level_0,label,abandon,abbas,abele,abide,ability,abject,able,aboard,abolish,...,yet,york,youd,young,youth,youve,zero,zip,zombie,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Overwrite the bag-of-words CSV (the same path it was loaded from) with the reduced table.
dfBow.to_csv('../liar_dataset/liar BoW.csv', index_label='id')

Split the dataframe into 2 for proper saving in the repo.

In [2]:
# Reload the reduced bag-of-words table and preview its first ten rows.
dfBow = pd.read_csv(
    '../liar_dataset/liar BoW.csv',
    index_col=['id'],
)
dfBow.head(n=10)

Unnamed: 0_level_0,label,abandon,abbas,abele,abide,ability,abject,able,aboard,abolish,...,yet,york,youd,young,youth,youve,zero,zip,zombie,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# (rows, columns) of the reduced table; the column count includes 'label'.
dfBow.shape

(12809, 4184)

In [4]:
# Cut the table at the middle row: df1 gets the first `mid` rows, df2 the rest
# (df2 is one row longer when the row count is odd).
mid = len(dfBow) // 2
df1, df2 = dfBow.iloc[:mid], dfBow.iloc[mid:]

In [5]:
# First half of the table (rows before position `mid`).
df1

Unnamed: 0_level_0,label,abandon,abbas,abele,abide,ability,abject,able,aboard,abolish,...,yet,york,youd,young,youth,youve,zero,zip,zombie,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6401,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6402,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Second half of the table (rows from position `mid` onwards).
df2

Unnamed: 0_level_0,label,abandon,abbas,abele,abide,ability,abject,able,aboard,abolish,...,yet,york,youd,young,youth,youve,zero,zip,zombie,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6404,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6405,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6406,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6408,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Write both halves and the full table to disk, each with the index labelled 'id'.
for frame, csvPath in (
    (df1, '../liar_dataset/liar BoW1.csv'),
    (df2, '../liar_dataset/liar BoW2.csv'),
    (dfBow, '../liar_dataset/liar BoW.csv'),
):
    frame.to_csv(csvPath, index_label='id')

In [11]:
# Stack the two halves back on top of each other (row-wise) to rebuild the full table.
halves = (df1, df2)
mergedDf = pd.concat(halves, axis=0)
mergedDf

Unnamed: 0_level_0,label,abandon,abbas,abele,abide,ability,abject,able,aboard,abolish,...,yet,york,youd,young,youth,youve,zero,zip,zombie,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12807,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
