In [1]:
import pandas as pd

In [2]:
df_news = pd.read_pickle("data/data_raw.pickle")

### Cleaning up

In [3]:
before = df_news.shape[0]

to_drop = ["","[]"]
for col in df_news.columns:
    if col != "ID":
        for item in to_drop:
            df_news = df_news[df_news[col] != item]
        df_news = df_news[df_news[col].notnull()]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:  19076
Rows left: 185059


### Fixing interpunction misstakes

In [4]:
import re
from functools import reduce

In [5]:
def fix_interpunction(item):
    """
    Inserting space when missing after sentence ending.
    *args: item string
    return: fixed item string
    """
    match = re.search(r"[\w\d][\.\!\?][\w\d]",item)
    if match:
        item = item[:match.start()+2] + str(" ") + item[match.end()-1:]
        return fix_interpunction(item)
    else:
        return item

df_news["Content"] = df_news["Content"].apply(fix_interpunction)

### Keeping only first ten sentences of "Content"

In [6]:
def keep_10(row):
    """Only keep first 10 sentences of Content
    *args: pandas row object
    return: string
    """
    return reduce(lambda i,j: i + j, re.split(r'([\.\!\?]\s)', row ,10)[:20])

df_news["Content"] = df_news["Content"].apply(keep_10)

### Replace lot of spaces with only one

In [7]:
def replace_spaces(string):
    """
    checking if there are mor then 2 concatenated spaces and replace it with one.
    *args: string
    return: string
    """
    return re.sub(r"\s{2,}"," ",string)

df_news["Content"] = df_news["Content"].apply(replace_spaces)

### Deleting not expected chars from content

In [8]:
def bad_format_1(string):
    """
    replace not expacted chars with space
    *args: content string
    return: string
    """
    return re.sub(r"[\\\,\'\s]{3,}(?=[A-Z]|\")"," ",string)

df_news["Content"] = df_news["Content"].apply(bad_format_1)

In [9]:
def bad_format_2(string):
    """
    deleting all backslashes
    *args: content-string
    return: string
    """
    return re.sub(r"\\{1,}","",string)

df_news["Content"] = df_news["Content"].apply(bad_format_2)

### Dropping rows the content has bunch of non letter chars

In [10]:
def non_char_bunch(string):
    """
    checking if string has any bunch of non letter chars in concatenation
    
    *args: string
    return: Bool
    """
    if re.search(r"(\\{1,}\'{1,})|(\\{2,})|(\{{1,})",string): return True
    else: return False

before = df_news.shape[0]

df_news = df_news[df_news["Content"].apply(non_char_bunch)==False]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    294
Rows left: 184765


### Dropping rows the headline occurs in Content

In [11]:
def headline_in_content(row):
    is_in = row["Content"].find(row["Headline"])
    if is_in == -1:
        return False
    else:
        return True

before = df_news.shape[0]

df_news = df_news[df_news.apply(headline_in_content, axis=1) == False]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    839
Rows left: 183926


### Dropping duplicate rows

In [12]:
before = df_news.shape[0]

df_news = df_news.drop_duplicates("Headline")

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:   7626
Rows left: 176300


### Dropping rows the content is snippet

In [13]:
def is_snippet(content):
    if content.find("... Continue") != -1: return True
    else: return False

before = df_news.shape[0]

df_news = df_news[df_news["Content"].apply(is_snippet)==False]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    103
Rows left: 176197


### Lowercasing and Tokenizing

In [14]:
import nltk
import time
# Only when first using nltk
#nltk.download()

In [15]:
df_news["Headline"] = df_news["Headline"].str.lower()
df_news["Content"] = df_news["Content"].str.lower()

In [16]:
tokenize_start = time.time()
df_news["Headline"] = df_news["Headline"].apply(nltk.word_tokenize)
tokenize_end = time.time()
print("Headline done. Took {0:2.1f} minutes.".format((tokenize_end - tokenize_start)/60))

tokenize_start = time.time()
df_news["Content"] = df_news["Content"].apply(nltk.word_tokenize)
tokenize_end = time.time()
print("Content Done. Took {0:2.1f} minutes.".format((tokenize_end - tokenize_start)/60))

Headline done. Took 0.3 minutes.
Content Done. Took 4.6 minutes.


### Dropping rows the headline is > 25 tokens

In [17]:
def headline_gt_25(item):
    """
    rows with more than 25 tokens in headline
    
    *args: row
    return: row
    """
    if len(item) > 25: return True
    else: return False

before = df_news.shape[0]

df_news = df_news[df_news["Headline"].apply(headline_gt_25) == False]

after = df_news.shape[0]
print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    588
Rows left: 175609


### Dropping rows the content is shorter then 25 tokens

In [18]:
before = df_news.shape[0]
df_news = df_news[df_news["Content"].apply(lambda x: len(x)) > 25]
after = df_news.shape[0]

print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:    693
Rows left: 174916


### Dropping rows the > 4 last tokens are non character

In [19]:
def bad_format(list):
    """
    giving true if last 4 list-items having non letters
    *args: item-string
    return: Bool
    """
    chars = []
    for idx in range(len(list[-10:])):
        idx = idx*(-1)-1
        if re.match(r"\W",list[idx]):
            chars.append(idx)
            if len(chars) > 4:
                return True
        else:
            return False

before = df_news.shape[0]
df_news = df_news[df_news["Content"].apply(bad_format) == False]
after = df_news.shape[0]

print("Rows dropped: {0:6d}".format(before - after))
print("Rows left: {0:6d}".format(after))

Rows dropped:   2238
Rows left: 172678


In [20]:
df_news.to_pickle("data/data_preprocessed.pickle")

#df_news = pd.read_pickle("data/data_preprocessed.pickle")

### Shortening Content down to last sentence having 50th token

## TODO
Producing None. ID = 8246, 8583

In [42]:
def shorten_content(item):
    """
    Shortening content down to <= 50 tokens be aware of sentences.
    
    *args: content item list
    return: shortend content item list
    """
    if len(item) > 50:
        endings = []
        for i in range(len(item)):
            result = re.search(r"(\.)|(\!)|(\?)|(\.\.\.)",item[i])
            if result:
                endings.append(i)
            elif endings:
                if endings[-1] > 49:
                    return item[:endings[-1]+1]
    else:
        return item

df_short = df_news["Content"].apply(shorten_content)

In [44]:
df_short[df_short.isna() == True]

3880      None
4277      None
4351      None
4720      None
5427      None
6295      None
7192      None
7222      None
9275      None
10719     None
13212     None
14583     None
18427     None
18970     None
19163     None
20530     None
21320     None
21749     None
22034     None
22327     None
22472     None
23090     None
23391     None
23802     None
24213     None
24322     None
24473     None
27005     None
27195     None
27367     None
          ... 
176813    None
177919    None
178418    None
180106    None
180622    None
182536    None
182812    None
182957    None
183765    None
185268    None
188164    None
190044    None
190375    None
190919    None
191228    None
191365    None
192003    None
195777    None
196251    None
196690    None
196825    None
197399    None
197572    None
197596    None
200866    None
201855    None
202505    None
203467    None
203646    None
204024    None
Name: Content, Length: 281, dtype: object

In [None]:
#re.search(r"(\.)|(\!)|(\?)|(\.\.\.)",)

In [46]:
df_news["Content"].iloc[3880]

['``',
 '\\n\\ntens',
 'of',
 'thousands',
 'of',
 'migrants',
 'fleeing',
 'war',
 'and',
 'poverty',
 'are',
 'trying',
 'to',
 'reach',
 'safety',
 'in',
 'europe',
 '.',
 'this',
 'summer',
 "'s",
 'unprecedented',
 'exodus',
 'from',
 'syria',
 ',',
 'afghanistan',
 ',',
 'eritrea',
 'and',
 'other',
 'nations',
 'in',
 'turmoil',
 'has',
 'inundated',
 'southern',
 'europe',
 'and',
 'exhausted',
 'the',
 'generosity',
 'of',
 'countries',
 'suffering',
 'their',
 'own',
 'economic',
 'woes',
 '.',
 '``',
 ',',
 "'migrants",
 'have',
 'flooded',
 'into',
 'slovenia',
 'since',
 'hungary',
 'closed',
 'its',
 'border',
 'with',
 'serbia',
 'to',
 'people',
 'making',
 'arduous',
 'treks',
 'across',
 'europe',
 'in',
 'september',
 'and',
 'clamped',
 'down',
 'on',
 'its',
 'border',
 'with\\xa0',
 'croatia',
 'on',
 'saturday.\\xa0',
 "'",
 ',',
 "'since",
 'then',
 ',',
 '21,500',
 'people',
 'have',
 'entered',
 'slovenia',
 'from',
 'croatia',
 ',',
 'with',
 'many',
 'thousa

### Replace rare words with ```<unk>``` 

Keeping only 40k most frequent words

### Adding ```<eos>``` to the end of content list

### Seperating punctuation from words

In [None]:
def sup_punct(list):
    """
    seperating punctuation from words
    *args: list of strings
    return: list of strings
    """
    results = {
        "points": [],
        "quotes" []
    }
    
    for idx,item in enumerate(list):
        foud_dot = re.search(r"", item)
        found_quote = re.search(r"", item)
        if found_dot:
            
        if found_quote:
            
    
    if hochkomma:
        list = 
    elif punkt:
        
    else:
        return list