In [1]:
import pandas as pd

In [2]:
df_news = pd.read_pickle("data/data_raw.pickle")

### Cleaning up

In [3]:
# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)

# Placeholder values that mark a field as empty.
to_drop = ["","[]"]
for col in df_news.columns:
    if col == "ID":
        continue  # the ID column is never used to discard a row
    # A row survives only if this column is non-null and not a placeholder.
    keep = df_news[col].notnull()
    for item in to_drop:
        keep = keep & (df_news[col] != item)
    df_news = df_news[keep]

after = len(df_news)
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:  19076
Rows left: 185059


### Fixing punctuation mistakes

In [4]:
import re
from functools import reduce

In [5]:
def fix_interpunction(item):
    """
    Insert a missing space after sentence-ending punctuation.

    Every ".", "!" or "?" that sits directly between two word characters
    (letters, digits or underscore) gets one space inserted right after it,
    e.g. "end.Next" becomes "end. Next".

    *args: item string
    return: fixed item string
    """
    # A single substitution with zero-width lookarounds replaces the former
    # "search, patch, recurse" approach, which needed one recursive call per
    # fix and therefore hit Python's recursion limit on texts with many
    # missing spaces. The result is the same: a space is inserted at every
    # position preceded by <word char><punctuation> and followed by a word
    # char.
    # NOTE(review): decimals and dotted tokens ("3.5", "U.S.A") are split as
    # well, exactly as before -- confirm that this is intended.
    return re.sub(r"(?<=[\w\d][\.\!\?])(?=[\w\d])", " ", item)

df_news["Content"] = df_news["Content"].apply(fix_interpunction)

### Keeping only the first four sentences of "Content"

In [6]:
def keep_10(row, n_sentences=4):
    """Only keep the first sentences of Content (four by default).

    A sentence ends at ".", "!" or "?" followed by one whitespace character.
    Text with fewer sentences than requested is returned unchanged.

    *args: content string, n_sentences: number of sentences to keep
    return: string
    """
    # The capturing group makes re.split return the delimiters as well:
    # [sentence, delimiter, sentence, delimiter, ..., remainder].
    # Keeping 2 * n_sentences pieces therefore keeps n_sentences sentences
    # together with their closing punctuation and drops the remainder.
    pieces = re.split(r'([\.\!\?]\s)', row, maxsplit=n_sentences)
    return "".join(pieces[:2 * n_sentences])

df_news["Content"] = df_news["Content"].apply(keep_10)

### Replacing runs of whitespace with a single space

In [7]:
def replace_spaces(string):
    """
    Collapse every run of two or more whitespace characters into one space.
    A single whitespace character on its own is left as it is.
    *args: string
    return: string
    """
    whitespace_run = re.compile(r"\s{2,}")
    return whitespace_run.sub(" ", string)

df_news["Content"] = df_news["Content"].apply(replace_spaces)

### Deleting unexpected characters from content

In [8]:
def bad_format_1(string):
    """
    Replace a run of three or more backslashes, commas, apostrophes or
    whitespace characters with one space, but only where the run is directly
    followed by an uppercase letter A-Z or a double quote.
    *args: content string
    return: string
    """
    junk_run = re.compile(r"[\\\,\'\s]{3,}(?=[A-Z]|\")")
    return junk_run.sub(" ", string)

df_news["Content"] = df_news["Content"].apply(bad_format_1)

In [9]:
def bad_format_2(string):
    """
    Remove every backslash from the string.
    *args: content-string
    return: string
    """
    backslash_run = re.compile(r"\\{1,}")
    return backslash_run.sub("", string)

df_news["Content"] = df_news["Content"].apply(bad_format_2)

### Dropping rows whose content contains runs of non-letter characters

In [10]:
def non_char_bunch(string):
    """
    Tell whether the string contains one of these character runs:
    backslash(es) followed by apostrophe(s), two or more backslashes,
    or at least one opening curly brace.

    *args: string
    return: Bool
    """
    hit = re.search(r"(\\{1,}\'{1,})|(\\{2,})|(\{{1,})",string)
    return hit is not None

# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)

# Flag the rows whose content is matched by non_char_bunch, then keep the rest.
has_char_bunch = df_news["Content"].apply(non_char_bunch)
df_news = df_news[has_char_bunch == False]

after = len(df_news)
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:     74
Rows left: 184985


### Dropping rows whose headline occurs in the content

In [11]:
def headline_in_content(row):
    """
    Tell whether the row's headline appears verbatim inside its content.
    *args: row object giving access to "Content" and "Headline" strings
    return: Bool
    """
    content = row["Content"]
    headline = row["Headline"]
    # str.find returns -1 when the headline is not a substring of the content.
    return content.find(headline) != -1

# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)

# Row-wise check (axis=1): flag rows whose content contains the headline.
headline_repeated = df_news.apply(headline_in_content, axis=1)
df_news = df_news[headline_repeated == False]

after = len(df_news)
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    771
Rows left: 184214


### Dropping rows with duplicate headlines

In [12]:
# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)

# Rows count as duplicates when their "Headline" values are equal;
# drop_duplicates keeps the first occurrence by default.
df_news = df_news.drop_duplicates(subset="Headline")

after = len(df_news)
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:   7629
Rows left: 176585


### Dropping rows whose content is only a snippet

In [13]:
def is_snippet(content):
    """
    Tell whether the content contains the marker "... Continue".
    *args: content string
    return: Bool
    """
    marker_position = content.find("... Continue")
    return marker_position != -1

# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)

# Flag the rows whose content carries the snippet marker, then keep the rest.
snippet_rows = df_news["Content"].apply(is_snippet)
df_news = df_news[snippet_rows == False]

after = len(df_news)
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:     19
Rows left: 176566


### Lowercasing and Tokenizing

In [14]:
import nltk
import time
# Only when first using nltk
#nltk.download()

In [15]:
# Lowercase both text columns, headline first and then content.
for text_column in ("Headline", "Content"):
    df_news[text_column] = df_news[text_column].str.lower()

In [16]:
# Tokenize each text column with nltk.word_tokenize and report the wall-clock
# time per column in minutes. The messages are kept per column because their
# wording differs.
for text_column, done_message in (
    ("Headline", "Headline done. Took {0:2.1f} minutes."),
    ("Content", "Content Done. Took {0:2.1f} minutes."),
):
    tokenize_start = time.time()
    df_news[text_column] = df_news[text_column].apply(nltk.word_tokenize)
    tokenize_end = time.time()
    print(done_message.format((tokenize_end - tokenize_start)/60))

Headline done. Took 0.4 minutes.
Content Done. Took 2.8 minutes.


### Dropping rows whose headline has more than 25 tokens

In [17]:
def headline_gt_25(item):
    """
    Tell whether the item holds more than 25 elements.

    *args: item supporting len(), e.g. a list of headline tokens
    return: Bool
    """
    return len(item) > 25

# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)

# Flag the rows whose headline has more than 25 tokens, then keep the rest.
headline_too_long = df_news["Headline"].apply(headline_gt_25)
df_news = df_news[headline_too_long == False]

after = len(df_news)
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    588
Rows left: 175978


### Dropping rows whose content has 25 tokens or fewer

In [18]:
# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)
# Keep only the rows whose content holds more than 25 elements.
content_length = df_news["Content"].apply(len)
df_news = df_news[content_length > 25]
after = len(df_news)

print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:   1952
Rows left: 174026


### Dropping rows whose content ends with more than 4 non-word tokens

In [19]:
def bad_format(list, max_non_word=4):
    """
    Tell whether the token list ends with more than max_non_word consecutive
    tokens that start with a non-word character (regex \\W).

    Tokens are inspected from the end of the list backwards; the first token
    that starts with a word character (or is empty) stops the scan.

    *args: list of token strings (the name shadows the builtin but is kept
           so existing callers keep working),
           max_non_word: highest tolerated number of trailing non-word tokens
    return: Bool
    """
    trailing_non_word = 0
    for token in reversed(list):
        if not re.match(r"\W", token):
            # Reached a regular token before exceeding the threshold.
            return False
        trailing_non_word += 1
        if trailing_non_word > max_non_word:
            return True
    # The list ran out before the threshold was exceeded (empty or very short
    # input). Return an explicit False instead of falling through with None.
    return False

# Remember the row count so the cell can report how many rows were removed.
before = len(df_news)
# Keep only the rows for which bad_format returned exactly False.
bad_ending = df_news["Content"].apply(bad_format)
df_news = df_news[bad_ending == False]
after = len(df_news)

print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    998
Rows left: 173028


In [20]:
df_news.to_pickle("data/data_preprocessed.pickle")

#df_news = pd.read_pickle("data/data_vocab.pickle")