In [16]:
%matplotlib inline
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from cleantext import clean
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def clean_text(text):
    """Calls the clean function from cleantext clean on a string"""
    t = clean(text,
    fix_unicode=False,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,         # fully strip line breaks as opposed to only normalizing them NOT WORKING?
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="URL",
    replace_with_email="EMAIL",
    replace_with_phone_number="PHONE",
    replace_with_number="NUMBER",
    replace_with_digit="0",
    replace_with_currency_symbol="CUR",
    lang="en"                       # set to 'de' for German special handling
    )
    return t


print("Getting csv...")
pd.set_option("display.max_colwidth", 10000000)
data = pd.read_csv('https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv')


print("Getting headers..")
column_names = list(data.columns.values)


print("Cleaning...")
for index, row in data.iterrows():
    cleaned = clean_text(row['content'])
    data.at[index,'content'] = cleaned
    

len_before_process = sum(map(len, data.content))
print(len_before_process)
    
    
print("Tokenizing and removing stopwords...")
stop_words = set(stopwords.words('english'))
for index, row in data.iterrows():    
    data.at[index,'content'] = nltk.word_tokenize(row['content'])
    
    filtered_sentence = []
    for w in data.at[index,'content']:
        if w not in stop_words:
            filtered_sentence.append(w)
    
    data.at[index,'content'] = filtered_sentence

    
len_after_process = sum(map(len, data.content))
print(len_after_process)
    


    
    
print("Done...")









[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonmork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Getting csv...
Getting headers..
Cleaning...
1037899
Tokenizing and removing stopwords...
122049
Done...


In [None]:
#tips: nearest neighbor on missing data