In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import json

In [3]:
# Load the raw dataset from the project-relative data folder.
# Forward slashes are accepted on Windows as well as macOS/Linux, whereas the
# previous backslash-separated path only resolved on Windows.
raw = pd.read_csv("raw_data/News_dataset/Fake.csv")

In [4]:
# Concatenate every value of the "text" column into one space-separated string.
# Missing cells (NaN) are skipped and the remaining values are coerced to str,
# because str.join raises TypeError on any non-string item.
text = " ".join(raw["text"].dropna().astype(str))

In [20]:
# Tokenizer that keeps runs of word characters and drops everything else.
tokenizer = RegexpTokenizer(r"\w+")
# NLTK's English stop-word list, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Hand-picked tokens to exclude in addition to the stop words.
names = {"trump", 'hillary', 'clinton', 'realdonaldtrump', 'https', '000', 'donald', '2', 'obama', '1'}

# Lower-case each token, then keep it only if it is neither a stop word nor an excluded name.
lowered = (raw_token.lower() for raw_token in tokenizer.tokenize(text))
tokens = [
    candidate
    for candidate in lowered
    if candidate not in stop_words and candidate not in names
]

In [21]:
# Apply the Porter stemmer to every token.
# NOTE(review): the counting cells visible further down read `tokens`, not
# `stemmed` — confirm whether the stems are meant to be used.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, tokens))

In [22]:
# Distinct tokens that survived filtering.
unique_tokens = {*tokens}

In [23]:
# Start every distinct token at a count of zero.
counts = dict.fromkeys(unique_tokens, 0)

In [24]:
# Tally how often each token occurs.
# The tally is zeroed first so that re-running this cell recounts from scratch
# instead of adding a second pass on top of the previous totals.
counts = dict.fromkeys(unique_tokens, 0)
for word in tokens:
    counts[word] += 1

In [25]:
# Display the complete token -> count mapping (one entry per distinct token).
counts

{'labors': 6,
 'arrears': 3,
 'famously': 157,
 'rounded': 42,
 'ssa': 48,
 'ebrahim': 2,
 'cycling': 7,
 'fo': 7,
 'hajari': 1,
 'mskristinawong': 28,
 'rosneft': 7,
 'ffl': 8,
 'jozsef': 2,
 'anarachists': 1,
 'rouses': 2,
 'scania': 1,
 'accompaniment': 1,
 'independentread': 2,
 'snag': 14,
 'scan': 29,
 'klaas': 7,
 '4rig5cvuaj': 1,
 'callerkallstrom': 3,
 'flagrantly': 15,
 'pretending': 195,
 'violated': 337,
 'tja6yllman': 1,
 'explaining': 392,
 'shohwpydw2': 1,
 'insubstantial': 1,
 'smuggler': 6,
 'jayda': 12,
 '2016chris': 1,
 'winterstorm': 1,
 'unseemliness': 1,
 '0q8bv3cfoq': 1,
 'ninepins': 2,
 'interconnections': 1,
 'racks': 4,
 'psonick': 2,
 'mkues65': 3,
 'faden': 1,
 'scholarship': 68,
 'spacious': 4,
 'interchangeably': 5,
 'hahahahahahahhahahahahhahahaahhahahahahahahhahahahaha': 1,
 'condescended': 3,
 'zelevansky': 1,
 'infinitum': 1,
 'channelview': 1,
 'cretz': 1,
 'owner': 652,
 'h1b': 9,
 'leonn': 2,
 'stirred': 29,
 'largely': 551,
 'consummation': 3,
 'nc

In [26]:
# All occurrence counts (the values of `counts`), one per distinct token.
top_vals = list(counts.values())

In [27]:
# Keep the distinct values among the 200 largest counts.
# sorted() is used instead of an in-place .sort() because this cell rebinds
# `top_vals` from a list to a set; with .sort() a second run of the cell failed
# with AttributeError (sets have no .sort()). Re-running now yields the same set.
top_vals = set(sorted(top_vals, reverse=True)[:200])

In [28]:
# Keep every word whose count is one of the retained top values.
# Words that tie on a retained count are all kept, so this mapping can hold
# more than 200 entries.
top_200 = {
    word: freq
    for word, freq in counts.items()
    if freq in top_vals
}

In [29]:
# Reshape the mapping into a list of {"word", "count"} records.
top_200_list = [
    {"word": word, "count": freq}
    for word, freq in top_200.items()
]

In [30]:
# Wrap the records under a single "vocab" key and display the result.
final = dict(vocab=top_200_list)
final

{'vocab': [{'word': 'even', 'count': 14011},
  {'word': 'change', 'count': 3739},
  {'word': 'could', 'count': 10246},
  {'word': 'fact', 'count': 6279},
  {'word': 'president', 'count': 27715},
  {'word': 'national', 'count': 7209},
  {'word': 'presidential', 'count': 5744},
  {'word': 'campaign', 'count': 11107},
  {'word': 'every', 'count': 6101},
  {'word': 'including', 'count': 4503},
  {'word': 'like', 'count': 18097},
  {'word': 'show', 'count': 6143},
  {'word': 'money', 'count': 4846},
  {'word': 'asked', 'count': 5145},
  {'word': 'come', 'count': 5062},
  {'word': 'police', 'count': 9110},
  {'word': 'nothing', 'count': 4271},
  {'word': 'told', 'count': 9122},
  {'word': 'another', 'count': 6489},
  {'word': 'american', 'count': 11319},
  {'word': 'candidate', 'count': 4437},
  {'word': 'press', 'count': 4625},
  {'word': 'real', 'count': 3900},
  {'word': 'need', 'count': 5086},
  {'word': 'world', 'count': 7428},
  {'word': 'take', 'count': 6845},
  {'word': 'cnn', 'count

In [35]:
# Serialize the word -> count mapping (`top_200`) as JSON and write it to disk.
with open("fake_news_counts_nonames.json", "w") as out_file:
    serialized = json.dumps(top_200)
    out_file.write(serialized)

In [32]:
# Display the retained word -> count mapping.
top_200

{'even': 14011,
 'change': 3739,
 'could': 10246,
 'fact': 6279,
 'president': 27715,
 'national': 7209,
 'presidential': 5744,
 'campaign': 11107,
 'every': 6101,
 'including': 4503,
 'like': 18097,
 'show': 6143,
 'money': 4846,
 'asked': 5145,
 'come': 5062,
 'police': 9110,
 'nothing': 4271,
 'told': 9122,
 'another': 6489,
 'american': 11319,
 'candidate': 4437,
 'press': 4625,
 'real': 3900,
 'need': 5086,
 'world': 7428,
 'take': 6845,
 'cnn': 4731,
 'women': 7396,
 'year': 10221,
 'public': 6775,
 'read': 4249,
 'put': 4421,
 'image': 9895,
 'city': 4431,
 'getty': 4141,
 'officials': 3933,
 'watch': 6602,
 'make': 9254,
 'called': 6431,
 'family': 4765,
 'may': 7696,
 'interview': 4095,
 'statement': 4114,
 'well': 8447,
 'united': 8011,
 'policy': 3811,
 'speech': 4039,
 'million': 5144,
 'washington': 5737,
 'featured': 8260,
 'general': 3744,
 'media': 11704,
 'political': 7654,
 'life': 4022,
 'ever': 4373,
 'go': 6922,
 'vote': 5764,
 'see': 7468,
 '2017': 4461,
 'investi