In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
val_df = pd.read_csv('validation.csv')

# Separate inputs (articles) and targets (summaries)
X_train, y_train = train_df['article'], train_df['highlights']
X_val, y_val = val_df['article'], val_df['highlights']
X_test, y_test = test_df['article'], test_df['highlights']

In [2]:
from sklearn.utils import shuffle
train_df_shuffled = shuffle(train_df, random_state=42)  # random_state for reproducibility
subset_size = int(0.1 * len(train_df_shuffled)) # take 0.25 of data length 
train_df = train_df_shuffled.iloc[:subset_size]
train_df

Unnamed: 0,id,article,highlights
272581,ed0fed726929c1eeabe6c390e47128dbb7d7a055,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...
772,023cd84001b33aed4ff0f3f5ecb0fdd2151cf543,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...
171868,6a70a0d8d3ed365fe1df6d35f1587a8b9b298618,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...
63167,b37204c13ea38b511265e41ac69fb12acfb63f85,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...
68522,c24e5805afd5145bc48410e876db91d44a06be5e,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...
...,...,...,...
225109,af82d43ecb0fb4ce0bb23e4fda724596181c6709,An ax wielding teen was left crying for his mo...,Store clerk Ali Ait Mahdi said he defended him...
73267,cfca4c7a8eff7f6c1230f0b2866504f1513c68c0,A food supplier falsely marketed beef to Musli...,"Midamar Corp, of Cedar Rapids, Iowa, sold food..."
9840,1be8e21ffcabec519f305f35643821670b485181,New York (CNN) -- Diplomacy hasn't worked. San...,Hafez Nazeri and his father Shahrem play hybri...
210557,9cb1380d06a63402cb38089b4a125184e831bbde,(CNN) -- Is Gail Kelly the most powerful woman...,"Meet Gail Kelly, CEO of Australia's second lar..."


In [3]:
train_df.drop_duplicates(['highlights'], inplace=True)
test_df.drop_duplicates(['highlights'], inplace=True)
train_df = train_df.drop(['id'], axis= 1)
train_df.reset_index(inplace=True, drop=True)
test_df = test_df.drop(['id'], axis= 1)
test_df.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop_duplicates(['highlights'], inplace=True)


In [4]:
train_df

Unnamed: 0,article,highlights
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...
...,...,...
28647,An ax wielding teen was left crying for his mo...,Store clerk Ali Ait Mahdi said he defended him...
28648,A food supplier falsely marketed beef to Musli...,"Midamar Corp, of Cedar Rapids, Iowa, sold food..."
28649,New York (CNN) -- Diplomacy hasn't worked. San...,Hafez Nazeri and his father Shahrem play hybri...
28650,(CNN) -- Is Gail Kelly the most powerful woman...,"Meet Gail Kelly, CEO of Australia's second lar..."


In [5]:
test_df

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
...,...,...
11444,Our young Earth may have collided with a body ...,Oxford scientists say a Mercury-like body stru...
11445,A man facing trial for helping his former love...,Man accused of helping former lover kill woman...
11446,A dozen or more metal implements are arranged ...,Marianne Power tried the tuning fork facial at...
11447,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...


In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    #convert text to doc
    doc = nlp(text)

    # filter out stopwords and punctuation
    cleaned_words = [token.text.lower()
                     for token in doc
                     if not token.is_stop and not token.is_punct]

    return " ".join(cleaned_words)


In [7]:
new_train_df = train_df[:2000]
new_train_df

Unnamed: 0,article,highlights
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...
...,...,...
1995,By . Nicola Harley . A legal firm that made mo...,Insult came in a training manual Raleys Solici...
1996,The average interest rate on an easy-access IS...,It is the lowest average rate since MoneyFacts...
1997,(CNN) -- There are plenty of reasons to fall i...,Staircases are important elements in home deco...
1998,"By . Steve Robson . PUBLISHED: . 01:29 EST, 25...",Victor Ponta says he is 'rather perplexed' by ...


In [9]:
new_train_df["clean_article"] = new_train_df['article'].apply(clean_text)
new_train_df["clean_highlights"] = new_train_df['highlights'].apply(clean_text)
new_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_df["clean_article"] = new_train_df['article'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_df["clean_highlights"] = new_train_df['highlights'].apply(clean_text)


Unnamed: 0,article,highlights,clean_article,clean_highlights
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,mia de graaf britons flocked beaches southern ...,people enjoyed temperatures 17c brighton beach...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,couple weighed combined 32st shamed slimming f...,couple started piling pounds birth children \n...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,video footage shows heart stopping moment 17 y...,17 year old boy suffering lacerations left han...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...,istanbul turkey cnn 250 people raced syrian bo...,syrians citizens hightail turkey \n ethnic tur...
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,daily mail reporter published 12:53 est 3 janu...,xue long provided helicopter helped rescue \n ...
...,...,...,...,...
1995,By . Nicola Harley . A legal firm that made mo...,Insult came in a training manual Raleys Solici...,nicola harley legal firm £ 77 million miners c...,insult came training manual raleys solicitors ...
1996,The average interest rate on an easy-access IS...,It is the lowest average rate since MoneyFacts...,average interest rate easy access isa account ...,lowest average rate moneyfacts began keeping r...
1997,(CNN) -- There are plenty of reasons to fall i...,Staircases are important elements in home deco...,cnn plenty reasons fall love staircase home co...,staircases important elements home decor famil...
1998,"By . Steve Robson . PUBLISHED: . 01:29 EST, 25...",Victor Ponta says he is 'rather perplexed' by ...,steve robson published 01:29 est 25 february 2...,victor ponta says perplexed reports romanian i...


In [10]:
new_test_df = test_df[:2000]
new_test_df

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
...,...,...
1995,Francois Hollande has vowed to 'show no mercy'...,14 soldiers have been accused of abusing child...
1996,"A holidaymaker still has sickness issues, nine...",James Houlder and girlfriend Vicki Hood stayed...
1997,Jordon Ibe is poised to sign a new five-year c...,Jordon Ibe will sign a new five-year contract ...
1998,Paraded in a glass coffin and flanked by jostl...,"Iraqi officials say Izzat Ibrahim al-Douri, 72..."


In [11]:
new_test_df["clean_article"] = new_test_df['article'].apply(clean_text)
new_test_df["clean_highlights"] = new_test_df['highlights'].apply(clean_text)
new_test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test_df["clean_article"] = new_test_df['article'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test_df["clean_highlights"] = new_test_df['highlights'].apply(clean_text)


Unnamed: 0,article,highlights,clean_article,clean_highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,noticed plane seats appear getting smaller sma...,experts question packed planes putting passe...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,drunk teenage boy rescued security jumping lio...,drunk teenage boy climbed lion enclosure zoo w...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,dougie freedman verge agreeing new year deal r...,nottingham forest close extending dougie freed...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,liverpool target neto wanted psg clubs spain b...,fiorentina goalkeeper neto linked liverpool ar...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",bruce jenner break silence hour interview dian...,tell interview reality tv star 69 air friday a...
...,...,...,...,...
1995,Francois Hollande has vowed to 'show no mercy'...,14 soldiers have been accused of abusing child...,francois hollande vowed mercy french peacekeep...,14 soldiers accused abusing children young \n ...
1996,"A holidaymaker still has sickness issues, nine...",James Houlder and girlfriend Vicki Hood stayed...,holidaymaker sickness issues months contracted...,james houlder girlfriend vicki hood stayed son...
1997,Jordon Ibe is poised to sign a new five-year c...,Jordon Ibe will sign a new five-year contract ...,jordon ibe poised sign new year contract liver...,jordon ibe sign new year contract run 2020 \n ...
1998,Paraded in a glass coffin and flanked by jostl...,"Iraqi officials say Izzat Ibrahim al-Douri, 72...",paraded glass coffin flanked jostling civilian...,iraqi officials izzat ibrahim al douri 72 died...


In [13]:
new_train_df['clean_highlights'] = new_train_df['clean_highlights'].apply(lambda x: '<Start>' + ' ' + x + ' ' + '<END>')
new_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_df['clean_highlights'] = new_train_df['clean_highlights'].apply(lambda x: '<Start>' + ' ' + x + ' ' + '<END>')


Unnamed: 0,article,highlights,clean_article,clean_highlights
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,mia de graaf britons flocked beaches southern ...,<Start> <Start> people enjoyed temperatures 17...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,couple weighed combined 32st shamed slimming f...,<Start> <Start> couple started piling pounds b...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,video footage shows heart stopping moment 17 y...,<Start> <Start> 17 year old boy suffering lace...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...,istanbul turkey cnn 250 people raced syrian bo...,<Start> <Start> syrians citizens hightail turk...
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,daily mail reporter published 12:53 est 3 janu...,<Start> <Start> xue long provided helicopter h...
...,...,...,...,...
1995,By . Nicola Harley . A legal firm that made mo...,Insult came in a training manual Raleys Solici...,nicola harley legal firm £ 77 million miners c...,<Start> <Start> insult came training manual ra...
1996,The average interest rate on an easy-access IS...,It is the lowest average rate since MoneyFacts...,average interest rate easy access isa account ...,<Start> <Start> lowest average rate moneyfacts...
1997,(CNN) -- There are plenty of reasons to fall i...,Staircases are important elements in home deco...,cnn plenty reasons fall love staircase home co...,<Start> <Start> staircases important elements ...
1998,"By . Steve Robson . PUBLISHED: . 01:29 EST, 25...",Victor Ponta says he is 'rather perplexed' by ...,steve robson published 01:29 est 25 february 2...,<Start> <Start> victor ponta says perplexed re...


In [14]:
new_test_df['clean_highlights'] = new_test_df['clean_highlights'].apply(lambda x: '<Start>' + ' ' + x + ' ' + '<END>')
new_test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test_df['clean_highlights'] = new_test_df['clean_highlights'].apply(lambda x: '<Start>' + ' ' + x + ' ' + '<END>')


Unnamed: 0,article,highlights,clean_article,clean_highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,noticed plane seats appear getting smaller sma...,<Start> experts question packed planes putti...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,drunk teenage boy rescued security jumping lio...,<Start> drunk teenage boy climbed lion enclosu...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,dougie freedman verge agreeing new year deal r...,<Start> nottingham forest close extending doug...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,liverpool target neto wanted psg clubs spain b...,<Start> fiorentina goalkeeper neto linked live...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",bruce jenner break silence hour interview dian...,<Start> tell interview reality tv star 69 air ...
...,...,...,...,...
1995,Francois Hollande has vowed to 'show no mercy'...,14 soldiers have been accused of abusing child...,francois hollande vowed mercy french peacekeep...,<Start> 14 soldiers accused abusing children y...
1996,"A holidaymaker still has sickness issues, nine...",James Houlder and girlfriend Vicki Hood stayed...,holidaymaker sickness issues months contracted...,<Start> james houlder girlfriend vicki hood st...
1997,Jordon Ibe is poised to sign a new five-year c...,Jordon Ibe will sign a new five-year contract ...,jordon ibe poised sign new year contract liver...,<Start> jordon ibe sign new year contract run ...
1998,Paraded in a glass coffin and flanked by jostl...,"Iraqi officials say Izzat Ibrahim al-Douri, 72...",paraded glass coffin flanked jostling civilian...,<Start> iraqi officials izzat ibrahim al douri...


In [15]:
X = new_train_df['clean_article'].values
y = new_train_df['clean_highlights'].values

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [17]:
article_tokenizer = Tokenizer(num_words=2000, oov_token="<unk>")
article_tokenizer.fit_on_texts(X)
input_sequences = article_tokenizer.texts_to_sequences(X)

# View the tokenized articles
print("Word Index: ", article_tokenizer.word_index)  # The word-index mapping
print("Sequences: ", input_sequences[:3])  # The tokenized form (sequence of word indices)

Sequences:  [[1, 685, 1, 1, 1, 1, 1386, 795, 300, 949, 116, 87, 1, 1, 1, 54, 1470, 1, 1, 1, 1, 4, 1259, 46, 612, 1, 1, 677, 962, 1, 1, 1, 1, 1679, 1, 1, 1, 1347, 1, 1859, 322, 1, 437, 1225, 1, 1246, 1, 1, 1, 1, 1, 229, 46, 612, 1470, 212, 1, 127, 795, 1, 87, 212, 127, 1906, 1470, 1, 1, 1, 1, 1, 1, 1907, 1, 1, 1789, 1, 796, 1386, 155, 1386, 900, 127, 1, 1, 1, 1225, 78, 6, 392, 1494, 1, 1507, 265, 1246, 55, 796, 1837, 1050, 1460, 1, 98, 138, 1, 1430, 1, 1, 1460, 789, 155, 661, 87, 46, 612, 159, 98, 421, 1837, 1, 1, 1, 54, 1225, 1, 1, 133, 335, 1, 612, 374, 168, 277, 1, 1881, 7, 1247, 143, 42, 344, 1, 1, 61, 1, 479, 1443, 612, 1, 1246, 1460, 1, 1, 42, 98, 1, 1470, 1, 1, 212, 1386, 155, 1906, 1334, 1, 479, 1789, 1, 17, 1907, 38, 374, 168, 54, 1470, 1, 1627, 6, 3, 1, 1, 1, 1, 106, 1, 127, 127, 421, 98, 160, 1, 1, 1460, 1907, 595, 477, 1470, 622, 6, 3, 1, 1907, 1134, 1627, 252, 1, 1, 1470, 25, 710, 157, 1838, 310, 1, 1906, 1216, 127, 320, 1, 892, 1906, 1, 1, 322, 1, 789, 1014, 760, 42, 1, 1,

In [18]:
highlights_tokenizer = Tokenizer(num_words=2000, oov_token="<unk>")
highlights_tokenizer.fit_on_texts(y)
output_sequences = highlights_tokenizer.texts_to_sequences(y)

# View the tokenized highlights
print("Word Index: ", highlights_tokenizer.word_index)  # The word-index mapping
print("Sequences: ", input_sequences[:3])  # The tokenized form (sequence of word indices)

Sequences:  [[1, 685, 1, 1, 1, 1, 1386, 795, 300, 949, 116, 87, 1, 1, 1, 54, 1470, 1, 1, 1, 1, 4, 1259, 46, 612, 1, 1, 677, 962, 1, 1, 1, 1, 1679, 1, 1, 1, 1347, 1, 1859, 322, 1, 437, 1225, 1, 1246, 1, 1, 1, 1, 1, 229, 46, 612, 1470, 212, 1, 127, 795, 1, 87, 212, 127, 1906, 1470, 1, 1, 1, 1, 1, 1, 1907, 1, 1, 1789, 1, 796, 1386, 155, 1386, 900, 127, 1, 1, 1, 1225, 78, 6, 392, 1494, 1, 1507, 265, 1246, 55, 796, 1837, 1050, 1460, 1, 98, 138, 1, 1430, 1, 1, 1460, 789, 155, 661, 87, 46, 612, 159, 98, 421, 1837, 1, 1, 1, 54, 1225, 1, 1, 133, 335, 1, 612, 374, 168, 277, 1, 1881, 7, 1247, 143, 42, 344, 1, 1, 61, 1, 479, 1443, 612, 1, 1246, 1460, 1, 1, 42, 98, 1, 1470, 1, 1, 212, 1386, 155, 1906, 1334, 1, 479, 1789, 1, 17, 1907, 38, 374, 168, 54, 1470, 1, 1627, 6, 3, 1, 1, 1, 1, 106, 1, 127, 127, 421, 98, 160, 1, 1, 1460, 1907, 595, 477, 1470, 622, 6, 3, 1, 1907, 1134, 1627, 252, 1, 1, 1470, 25, 710, 157, 1838, 310, 1, 1906, 1216, 127, 320, 1, 892, 1906, 1, 1, 322, 1, 789, 1014, 760, 42, 1, 1,

In [20]:
max_article_len = max(len(i) for i in X)
max_summary_len = max(len(i) for i in y)
encoder_input_size = len(article_tokenizer.word_index) + 1
decoder_input_size = len(highlights_tokenizer.word_index) + 1

In [21]:
# Pad the sequences
max_vocab_size_en = 2000
X_padded = pad_sequences(input_sequences, maxlen=2000)
print(X_padded[0].shape)

(2000,)


In [23]:
# Pad the sequences
max_vocab_size = 200
y_padded = pad_sequences(output_sequences, maxlen=max_summary_len)
print(y_padded[0].shape)

(1168,)
