In [1]:
## Import libraries
import re
import nltk
import spacy
import string
import pickle
import numpy as np
import pandas as pd
# Load spaCy's small English pipeline once at import time; `en` is reused
# later in the notebook to lemmatize sentences.
en = spacy.load('en_core_web_sm')
# Stop-word collection taken from the loaded pipeline's language defaults;
# used for membership tests when filtering words.
sw_spacy = en.Defaults.stop_words
from tqdm.notebook import tqdm_notebook

In [2]:
## Load Data
# Only the training split is read; the test/validation loads below are disabled.
# NOTE(review): the path is a raw string containing doubled backslashes, so it
# looks Windows-specific -- confirm before running on another OS.
data_train = pd.read_csv(r"Data\\cnn_dailymail\\train.csv")
# data_test = pd.read_csv(r"dataset\\cnn_dailymail\\test.csv")
# data_val = pd.read_csv(r"dataset\\cnn_dailymail\\validation.csv")

In [3]:
# Work on a random 25,000-row subset of the training split.
# A fixed seed makes the draw reproducible, so re-running the notebook from a
# fresh kernel selects the same rows that end up in the saved pickle/CSV.
data_train = data_train.sample(n=25000, random_state=42)
data_train

Unnamed: 0,id,article,highlights
103297,11334631ba00e479574bffdb366dbad08d3a6c17,"By . Steve Nolan . PUBLISHED: . 13:58 EST, 20 ...",Kidderminster Harriers game with Stockport sus...
251700,d1c94f42180bceb8202293ee69b0050af1dce888,Islamabad (CNN) -- A campaign to eradicate pol...,Six polio vaccination workers were killed in a...
6326,11f1e3830b0fcdcf0b2820e7297fd7ec01522858,"By . Ruth Styles . PUBLISHED: . 10:12 EST, 8 D...",The handset is the work of Swiss designer Ales...
108340,17b1c6d5ddcabc8581bccc9b5cdfbacc0eb28824,Apple will unveil the latest version of its iP...,Report: New iPhone coming on September 10 .\nP...
8386,17a8f106f7bfd8bfe868dcc145f9fa64f06badc9,"By . Kieran Corcoran . PUBLISHED: . 11:45 EST,...","John Constantine, 33, strangled Karen Welsh at..."
...,...,...,...
108689,1826bcc8f3823a98a25b4e233c3eb7f7b1c8688e,By . Sarah Michael . James Packer and David Gy...,Casino mogul and Nine CEO given criminal infri...
144261,468c3d70aa6617db49254cc1344777a97ede3ae5,The year's first colchicums bring a blend of j...,Buy them now as dry bulbs and expect fresh blo...
130677,350097b577f79ee463810c59ab60590c6b53990d,A paraplegic veteran gave his wife the best su...,"Sgt. Joey Johnson of Fishers, Indiana, surpris..."
62795,b2604b455820a12d0f4d63cda6e4483238f472c5,By . Luke Garratt . A schoolboy was airlifted ...,Schoolboy was on an Easter holiday day out wit...


In [4]:
def clean_article(article):
    """Strip news boilerplate from an article and return two cleaned variants.

    Args:
        article: Raw article text as a ``str``.

    Returns:
        A tuple ``(article, cleaned_article)``:

        * ``article`` -- heavily normalized text: hyphens replaced by spaces,
          every punctuation mark except the period removed, and whitespace
          runs collapsed to a single space.
        * ``cleaned_article`` -- lightly cleaned text: dateline/byline
          fragments removed and periods tidied, but hyphens and other
          punctuation are kept.
    """
    # Drop everything up to and including the "(CNN) -- " dateline marker.
    # The slice must skip the whole marker; skipping only "(CNN)" would leave
    # a stray " -- " at the start of the text.
    marker = '(CNN) -- '
    index = article.find(marker)
    if index > -1:
        article = article[index + len(marker):]
    # Removing source information ("By . <author> .", "PUBLISHED: . <date> .",
    # "UPDATED: . <date> ."); each match is non-greedy up to the next " ."
    article = re.sub(r'By\s\..*?\s\.', '', article)
    article = re.sub(r'PUBLISHED:\s\..*?\s\.', '', article)
    article = re.sub(r'UPDATED:\s\..*?\s\.', '', article)
    # Removing space before period "."
    article = re.sub(r'\s(\.)', r'\1', article)
    # Removing unwanted periods: a period followed by whitespace and a
    # lowercase letter or digit is replaced by a space.
    cleaned_article = re.sub(r'\.\s([a-z0-9])', r' \1', article)
    # Replacing hyphens with spaces
    article = re.sub(r'-', r' ', cleaned_article)
    # Removing all punctuation except periods (hyphens are already gone)
    article = re.sub(r'[^\w\s\.-]', '', article)
    # Collapsing runs of whitespace into a single space
    article = re.sub(r'\s{1,}', r' ', article)
    return article, cleaned_article

In [5]:
# Separating sentences 
def generate_sen(article):
    """Split text into sentences with every period removed.

    The text is tokenized with NLTK's sentence tokenizer. Tokens of two
    characters or fewer are discarded; the length check is applied to the raw
    token, before periods are removed and surrounding whitespace is stripped.

    Args:
        article: Text to split.

    Returns:
        A list of sentence strings without periods.
    """
    kept = []
    for raw_sentence in nltk.tokenize.sent_tokenize(article):
        if len(raw_sentence) <= 2:
            continue
        without_periods = re.sub(r'\.', '', raw_sentence)
        kept.append(without_periods.strip())
    return kept

In [6]:
# Removing Stopwords and Lemmatization
def stopwords_n_lemma(original_sentences):
    """Lowercase, drop stop words from, and lemmatize each sentence.

    Args:
        original_sentences: Iterable of sentence strings.

    Returns:
        A list with one string per input sentence: the space-joined lemmas
        of the lowercased words that are not in ``sw_spacy``.
    """
    def _lemmatize(sentence):
        # Whitespace-split, lowercase, and keep only non-stop-words.
        lowered = (word.lower() for word in sentence.split())
        content_words = [word for word in lowered if word not in sw_spacy]
        # Run the filtered sentence through the spaCy pipeline for lemmas.
        doc = en(" ".join(content_words))
        return " ".join([token.lemma_ for token in doc])

    return [_lemmatize(sentence) for sentence in original_sentences]

In [7]:
def data_preprocess(articles):
    """Run the full cleaning pipeline over a collection of raw articles.

    Args:
        articles: Iterable of raw article strings.

    Returns:
        A tuple of three parallel lists, one entry per article:

        * sentences of the lightly cleaned text,
        * stop-word-free, lemmatized sentences of the heavily cleaned text,
        * those lemmatized sentences joined into a single string.
    """
    readable_sentence_lists = []
    lemma_sentence_lists = []
    lemma_texts = []
    for raw_article in tqdm_notebook(articles):
        normalized_text, readable_text = clean_article(raw_article)
        lemma_sentences = stopwords_n_lemma(generate_sen(normalized_text))
        # NOTE(review): the two variants are sentence-split independently, so
        # their sentence counts are not guaranteed to match -- confirm whether
        # downstream code relies on a one-to-one alignment.
        readable_sentence_lists.append(generate_sen(readable_text))
        lemma_sentence_lists.append(lemma_sentences)
        lemma_texts.append(" ".join(lemma_sentences))
    return readable_sentence_lists, lemma_sentence_lists, lemma_texts

In [8]:
# Pull the raw article text out of the sampled frame as an array, in row order.
articles = data_train['article'].values
# Preprocess every article; the three results are parallel lists indexed like
# `articles` (sentence lists, lemmatized sentence lists, merged lemmatized text).
original_articles, cleaned_articles, cleaned_articles_merged = data_preprocess(articles)

  0%|          | 0/25000 [00:00<?, ?it/s]

In [17]:
# Store cleaned data
# Bundle layout: [sentence lists, lemmatized sentence lists, merged lemmatized text].
# `original_articles` is the name produced by data_preprocess above; the
# previously referenced `new_original_articles` is never defined in this
# notebook and raises NameError on a fresh kernel.
cleaned_training_data = [original_articles, cleaned_articles, cleaned_articles_merged]
# Use a context manager so the file is flushed and closed after the dump.
with open(r"Data\\cleaned_training_data.pkl", "wb") as pickle_file:
    pickle.dump(cleaned_training_data, pickle_file)

In [33]:
# Save dataframe
# Writes the sampled rows in their current order. reset_index(drop=True)
# discards the shuffled original index and index=False keeps the index out of
# the file.
# NOTE(review): despite the file name, this frame holds the sampled source
# columns, not the cleaned text -- confirm that is intended.
data_train.reset_index(drop=True).to_csv(r"Data\\cleaned_training_data.csv", index=False)