In [1]:
import pandas as pd

In [2]:
df_news = pd.read_pickle("data/data_raw.pickle")

### Cleaning up

In [4]:
before = df_news.shape[0]

to_drop = ["","[]"]
for col in df_news.columns:
    if col != "ID":
        for item in to_drop:
            df_news = df_news[df_news[col] != item]
        df_news = df_news[df_news[col].notnull()]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:  19076
Rows left: 185059


### Fixing interpunction misstakes

In [6]:
import re
from functools import reduce

In [7]:
def fix_interpunction(item):
    """
    Inserting space when missing after sentence ending.
    *args: item string
    return: fixed item string
    """
    match = re.search(r"[\w\d][\.\!\?][\w\d]",item)
    if match:
        item = item[:match.start()+2] + str(" ") + item[match.end()-1:]
        return fix_interpunction(item)
    else:
        return item

df_news["Content"] = df_news["Content"].apply(fix_interpunction)

### Keeping only first ten sentences of "Content"

In [9]:
def keep_10(row):
    """Only keep first 10 sentences of Content
    *args: pandas row object
    return: string
    """
    return reduce(lambda i,j: i + j, re.split(r'([\.\!\?]\s)', row ,10)[:20])

df_news["Content"] = df_news["Content"].apply(keep_10)

### Dropping rows where Headline occurs in Content

In [11]:
def headline_in_content(row):
    is_in = row["Content"].find(row["Headline"])
    if is_in == -1:
        return False
    else:
        return True
before = df_news.shape[0]

df_news = df_news[df_news.apply(headline_in_content, axis=1) == False]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    840
Rows left: 184219


### Lowercasing and Tokenizing

In [13]:
import nltk
import time
from IPython.display import clear_output
# Only when first using nltk
#nltk.download()

In [14]:
df_news["Headline"] = df_news["Headline"].str.lower()
df_news["Content"] = df_news["Content"].str.lower()

In [15]:
tokenize_start = time.time()
df_news["Headline"] = df_news["Headline"].apply(nltk.word_tokenize)
tokenize_end = time.time()
print("Headline done. Took {0:2.0f} minutes.".format((tokenize_end - tokenize_start)/60))

tokenize_start = time.time()
df_news["Content"] = df_news["Content"].apply(nltk.word_tokenize)
tokenize_end = time.time()
print("Content Done. Took {0:2.0f} minutes.".format((tokenize_end - tokenize_start)/60))

Headline done. Took  0 minutes.
Content Done. Took  5 minutes.


### Dropping Headlines > 25 tokens

In [17]:
def headline_gt_25(item):
    """
    rows with more than 25 tokens in headline
    
    *args: row
    return: row
    """
    if len(item) > 25: return True
    else: return False

before = df_news.shape[0]

df_news = df_news[df_news["Headline"].apply(headline_gt_25) == False]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    649
Rows left: 183570


In [18]:
#df_news.to_pickle("data/data_preprocessed.pickle")

#df_news = pd.read_pickle("data/data_preprocessed.pickle")

### Shortening Content down to last sentence having 50th token

## TODO
producing None values. ID = 104

In [19]:
def shorten_content(item):
    """
    Shortening content down to <= 50 tokens be aware of sentences.
    
    *args: content item list
    return: shortend content item list
    """
    if len(item) > 50:
        endings = []
        for i in range(len(item)):
            if item[i] == ('.' or '!' or '?'):
                endings.append(i)
            if endings:
                if endings[-1] > 49:
                    return item[:endings[-1]+1]
    else:
        return item

#df_news["Content"] = df_news["Content"].apply(shorten_content)

In [21]:
df_news[df_news["Content"].isna()].iloc[0]

ID                                                        104
Headline    [uber, is, recruiting, 50,000, veterans, as, d...
Content                                                  None
Name: 92, dtype: object