In [1]:
import pandas as pd

In [2]:
df_news = pd.read_pickle("data/data_raw.pickle")

### Cleaning up

In [3]:
before = df_news.shape[0]

to_drop = ["","[]"]
for col in df_news.columns:
    if col != "ID":
        for item in to_drop:
            df_news = df_news[df_news[col] != item]
        df_news = df_news[df_news[col].notnull()]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:  19076
Rows left: 185059


### Keeping only first ten sentences of "Content"

In [2]:
import re
from functools import reduce

In [5]:
def keep_10(row):
    """Only keep first 10 sentences of Content
    *args: pandas row object
    return: string
    """
    return reduce(lambda i,j: i + j, re.split(r'(\.\s)', row ,10)[:20])

df_news["Content"] = df_news["Content"].apply(keep_10)

### Lowercasing and Tokenizing

In [3]:
import nltk
import time
from IPython.display import clear_output
# Only when first using nltk
#nltk.download()

In [7]:
df_news["Headline"] = df_news["Headline"].str.lower()
df_news["Content"] = df_news["Content"].str.lower()

In [8]:
tokenize_start = time.time()
df_news["Headline"] = df_news["Headline"].apply(nltk.word_tokenize)
tokenize_end = time.time()
print("Headline done. Took {0:2.0f} minutes.".format((tokenize_end - tokenize_start)/60))

tokenize_start = time.time()
df_news["Content"] = df_news["Content"].apply(nltk.word_tokenize)
tokenize_end = time.time()
print("Done. Took {0:2.0f} minutes.".format((tokenize_end - tokenize_start)/60))

Headline done. Took  0 minutes.
Done. Took  7 minutes.


In [14]:
#df_news.to_pickle("data/data_preprocessed.pickle")

In [4]:
df_news = pd.read_pickle("data/data_preprocessed.pickle")

### Dropping Headlines > 25 tokens

In [9]:
def headline_gt_25(item):
    """
    rows with more than 25 tokens in headline
    
    *args: row
    return: row
    """
    if len(item) > 25: return True
    else: return False

df_news = df_news[df_news["Headline"].apply(headline_gt_25) == False]

### Shortening Content down to <= 50 tokens

# TODO
Check iloc[1]

In [11]:
def short_content(item):
    """
    Shortening content down to <= 50 tokens be aware of sentences.
    
    *args: content item list
    return: shortend content item list
    """
    if len(item) > 50:
        last_point = 0
        for i in range(len(item)):
            if item[i] == '.':
                last_point = i
            if i > 50:
                return item[:last_point]
    else:
        return item

df_news["Content"] = df_news["Content"].apply(short_content)

In [13]:
df_news.head()

Unnamed: 0,ID,Headline,Content
0,1,"[agent, cooper, in, twin, peaks, is, the, audi...","[and, never, more, so, than, in, showtime, ’, ..."
1,2,"[ai, ,, the, humanity, !]",[]
2,3,"[the, viral, machine]","[super, deluxe, built, a, weird, internet, emp..."
3,4,"[how, anker, is, beating, apple, and, samsung,...","[steven, yang, quit, his, job, at, google, in,..."
4,5,"[tour, black, panther, ’, s, reimagined, homel...","[ahead, of, black, panther, ’, s, 2018, theatr..."
