## Dissertation Project - News Summary Dissertation [100 marks]

### Motivation 

> 1. Provide tools for anyone needing to speed up their research process
> 2. Provide ways for users to quickly determine whether a piece of research is useful for their specific search terms




In [1]:
# from IPython.display import HTML

# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# <form action="javascript:code_toggle()"><input type="submit" value="Toggle code"></form>''')

### Imports

In [3]:
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import random
from tqdm.notebook import tqdm
import ipywidgets as widgets
import fitz
import fasttext

import numpy as np 
import pandas as pd
import math
import time

import json
import pickle
import _pickle as pickle

import transformers
from simplet5 import SimpleT5
from transformers import AutoTokenizer,BertTokenizer,T5Tokenizer,T5Config

DATASET = "./Dataset/"

Global seed set to 42


In [4]:
# Write the packages reported by `pip freeze` to requirements2.txt.
pip freeze > requirements2.txt

Note: you may need to restart the kernel to use updated packages.




Detect which device (CPU/GPU) to use.

In [2]:
# Seed every random number generator the notebook touches so runs are repeatable.
seed = 0
random.seed(seed)
# NumPy's global generator is what pandas / scikit-learn fall back to when no
# explicit random_state is passed, so it is seeded alongside `random` and torch.
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


### Overview

Summarisation model
> 1. Dataset preprocessing
> 2. Dataloader
> 3. RNN model definition
> 4. Model training
> 5. Model prediction evaluation
> 6. Dataset Exploration
> 7. Dataset modification/Data Augmentation
> 8. Model improvement
> 9. Model finalisation and evaluation

Paper querying
> 1. Attention on query (return usefulness percentage)
> 2. Evaluate performance


# Data preprocessing

In [4]:
# # Comment this block if dataset is reorganised
# DATA_DIR = "SSN/papers.SSN.jsonl"
# dataset_path = DATASET+DATA_DIR
# with open(dataset_path) as f:
#     lines = f.read().splitlines()
# df_inter = pd.DataFrame(lines)
# df_inter.columns = ['json_element']
# df_final = pd.json_normalize(df_inter['json_element'].apply(json.loads))
# df_final.to_json("./Dataset/SSN/SSN_Dataset.json")
# df_final

In [5]:
# for i,summary in df_final.iterrows():
#     temp = summary["abstract"]
#     print(type(summary["abstract"]))
#     break

In [6]:
# #Comment this block if dataset is shortened
# DATA_DIR = "SSN/SSN_Dataset.json"
# dataset_path = DATASET+DATA_DIR
# df = pd.read_json(dataset_path)
# df

In [7]:
# df.iloc[140794]["section_names"]

Function to return index for conclusion section

In [8]:
# def trim_index(row):
#     return [row.index(x)+1 for x in row if x.startswith('conclusion') or x.startswith("summar")]

In [9]:
# def trim_text(text, index):
#     return text[0:index]

In [10]:
# # Comment this block if dataset is shortened
# # Trim text after conclusion
# indexes = []
# for i, row in df.iterrows():
#     section = row["section_names"]
#     #print(section)
#     index = trim_index(section)
#     #print(index)
#     if not index:
#         indexes.append(i)
#     # if section can be filtered
#     else:
#         index = index[0]
#         abstract = row["abstract"]
#         text = row["text"]
#         section = row["section_names"]
#         df.at[i, "section_names"] = trim_text(section, index)
#         df.at[i, "abstract"] = trim_text(abstract, index)
#         df.at[i, "text"] = trim_text(text, index)
# # dropping rows in dataframe that can't easily filter out reference section
# print(len(indexes))
# df.drop(indexes, inplace=True)
# df.to_json("./Dataset/SSN/SSN_Dataset_Short.json")

35% of the papers are removed from the dataset because none of their section titles start with "conclusion" or "summar", which makes it difficult to filter out the reference and appendix text.

In [11]:
# DATA_DIR = "SSN/SSN_Dataset_Short.json"
# dataset_path = DATASET+DATA_DIR
# df = pd.read_json(dataset_path)
# df.reset_index(drop=True, inplace=True)
# df

See diversity in papers, select only computer science

In [12]:
# plot_points = df['domain'].value_counts()
# plot_points

In [13]:
# df = df.loc[df['domain'].isin([['Computer science']])]
# df.to_json("./Dataset/SSN/SSN_Dataset_CompSci_Short.json")
# df

In [14]:
# DATA_DIR = "cnn_dailymail/train.csv"
# dataset_path = DATASET+DATA_DIR
# df = pd.read_csv(dataset_path)
# df = df.rename(columns={'article': 'text', 'highlights': 'abstract'})
# df

In [15]:
# DATA_DIR = "SSN/SSN_Dataset_CompSci_Short.json"
# dataset_path = DATASET+DATA_DIR
# df = pd.read_json(dataset_path)
# df.reset_index(drop=True, inplace=True)
# df

In [16]:
# summary_df = df[["abstract", "text"]]
# summary_df

Check if any columns contain empty values

In [17]:
# summary_df.isnull().any()

\'hi\' \ is used for space <br>
random space, which is used for reference [15] quote box <br>
sec ref is hyperlink to a section <br>
fig ref is hyperlink to a figure <br>
inlineform and displayform are both symbol placeholders, and both contain numbers in the string <br>
remove all forms and all symbols, but keep numbers <br>

In [18]:
def contain_let(string):
    """Return True when at least one character of `string` is alphabetic."""
    for ch in string:
        if ch.isalpha():
            return True
    return False

In [19]:
def contain_num(string):
    """Return True when at least one character of `string` is a digit."""
    for ch in string:
        if ch.isdigit():
            return True
    return False

In [20]:
def contain_special(string, allowed):
    '''
    Report whether `string` holds a character that is neither a letter,
    a digit, nor one of the symbols listed in `allowed`.

    allowed is a list containing allowed symbols to pass detection
    '''
    for ch in string:
        # Letters and digits are never considered special.
        if ch.isalpha() or ch.isdigit():
            continue
        # Whitelisted symbols are tolerated as well.
        if ch in allowed:
            continue
        return True
    return False

In [21]:
def cleanLine(line, text=True, aug=False):
    """Normalise one string of paper text or abstract into space-separated tokens.

    The line is lower-cased, straight apostrophes and commas are turned into
    spaces, and every whitespace-separated token is dropped when it either
    mixes letters with digits or contains a character that is not a letter,
    a digit or one of the allowed symbols.

    Parameters
    ----------
    line : str
        Raw text to clean.
    text : bool, default True
        When True, English stop words (sklearn's ENGLISH_STOP_WORDS) are
        removed as well. Pass False to keep them.
    aug : bool, default False
        When True (and `text` is True), 20% of the remaining tokens
        (rounded down) are removed at random positions as a simple data
        augmentation. Uses the global `random` state.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Symbols tolerated inside a token; the straight apostrophe never reaches
    # the check below because it is replaced by a space first.
    symbols = ["'", "’"]

    # Lower-case, then treat straight apostrophes and commas as separators.
    clean_line = line.lower().replace("'", " ").replace(",", " ")

    # Discard placeholder-like tokens: letter+digit mixes and tokens holding
    # any character outside letters, digits and the allowed symbols.
    words = [
        word for word in clean_line.split()
        if not ((contain_let(word) and contain_num(word))
                or contain_special(word, symbols))
    ]

    if text:
        # Test membership on the imported collection directly instead of
        # copying it into a list on every call.
        words = [word for word in words if word not in ENGLISH_STOP_WORDS]

    if text and aug:
        # Drop 20% of the tokens by position. The previous code sized the
        # sample from the character count of `line` and then called
        # list.remove() with a list argument, which always raised.
        n_drop = math.floor(len(words) * 0.2)
        dropped = set(random.sample(range(len(words)), n_drop))
        words = [word for pos, word in enumerate(words) if pos not in dropped]

    return ' '.join(words)

In [22]:
# def concatParagraph(paragraph, text=True):
#     clean_paragraph = ""
#     for line in paragraph:
#         lines = cleanLine(line)
#         clean_paragraph += cleanLine(lines, text) + " "
#         #print(clean_paragraph)
        
#     return(clean_paragraph.strip())

In [23]:
# def concatPaper(paper, text=True):
#     clean_paper = ""
#     for paragraph in paper:
#         clean_paper += concatParagraph(paragraph, text) + " "
#     return(clean_paper.strip())

In [24]:
# import time
# interval = 0.0001
# aug_chance = 0.5
# for i, row in tqdm(summary_df.iterrows(), total=summary_df.shape[0]):
#     abstract = row["abstract"]
#     paper = row["text"]
    
# #     summary_df.at[i, "abstract"] = concatParagraph(abstract, text=False)
# #     summary_df.at[i, "text"] = concatPaper(paper)
#     if random.random() > aug_chance:
#         row_val = cleanLine(paper, text=True, aug=True)
#         new_row = pd.Series([abstract, row_val], index=summary_df.columns)
#         df_summary = df_summary.append(row1,ignore_index=True) 
#         df_summary
#     summary_df.at[i, "abstract"] = cleanLine(abstract, text=True)
#     summary_df.at[i, "text"] = cleanLine(paper)
#     time.sleep(interval)

In [25]:
# summary_df.to_csv("./Dataset/cnn_dailymail/cleaned_cnn_train.csv")
# summary_df

In [3]:
# DATA_DIR = "cnn_dailymail/cleaned_cnn_train_150_short3.csv"
# dataset_path = DATASET+DATA_DIR
# df = pd.read_csv(dataset_path)
# df = df[["source_text", "target_text"]]
# df

In [24]:
# Load the cleaned SSN dataset.
# A forward slash is used so the path resolves on every OS (Windows accepts it
# too) and the string no longer contains the invalid "\S" escape sequence.
DATA_DIR = "SSN/SSN_Dataset_Short_Clean.json"
dataset_path = DATASET+DATA_DIR
df = pd.read_json(dataset_path)
#df = df[["source_text", "target_text"]]
# df = df.rename(columns={'text': 'source_text', 'abstract': 'target_text'})
#df['source_text'] = "summarize: " + df['source_text']
df

Unnamed: 0,abstract,text
0,tree boosting highly effective widely used mac...,machine learning data driven approaches import...
1,face alignment task finding locations set faci...,face alignment refers finding pixel locations ...
2,study pattern forming nonlinear dynamics start...,modulation instability mi fundamental process ...
3,investigate infrared dynamics nonsupersymmetri...,understanding strong dynamics constitutes cont...
4,propose new framework constructing polar codes...,polar codes family codes proven capacity achie...
...,...,...
91381,propose dirichlet process mixtures generalized...,paper examine general regression problem gener...
91382,classify possible new u 1 x su 2 x su 3 multip...,higgs mass hierarchy puzzle suggests new physi...
91383,second order linear ordinary diffrential equat...,motivation writing paper observation small app...
91384,compare predictions spin independent contribut...,minimal supersymmetric standard model mssm bes...


In [20]:
# Find the longest tokenised source text and target abstract in the dataset.
t_max, a_max = 0, 0
for i, row in tqdm(df.iterrows(), total=len(df)):
    t_len = len(tokeniser(row['source_text'])['input_ids'])
    a_len = len(tokeniser(row['target_text'])['input_ids'])
    # Keep the running maxima.
    t_max = max(t_max, t_len)
    a_max = max(a_max, a_len)
print("t_max: ",t_max)
print("a_max: ", a_max)

  0%|          | 0/110315 [00:00<?, ?it/s]

t_max:  1000
a_max:  100


In [6]:
# NOTE(review): in the visible part of this notebook `indexes` is only assigned
# inside a commented-out cell, so this cell relies on kernel state from an
# earlier run — confirm which cell produces it.
len(indexes)

2163

In [7]:
# Remove the rows listed in `indexes`, renumber the frame and save the result.
# NOTE(review): `indexes` is not assigned by any active cell visible here —
# confirm its origin before re-running.
df.drop(indexes, inplace=True)
df.reset_index(drop=True, inplace=True)
# Forward slash keeps the target path valid on every OS and avoids the
# invalid "\S" escape sequence of the previous backslash-separated literal.
df.to_json(DATASET+"SSN/SSN_Dataset_CompSci_Short_Clean_HalfStop_170_Filter5.json")
df

Unnamed: 0,target_text,source_text
0,tracking developments highly dynamic data tech...,summarize: ubiquity online resources massive g...
1,paper propose new method enhance mapping paral...,summarize: large scale graph based application...
2,investigate models mitogenactivated protein ki...,summarize: mathematical modelling intra cellul...
3,machine learning used number security related ...,summarize: nowadays machine learning used numb...
4,propose novel pose robust spatial aware gan ps...,summarize: work solve makeup transfer task tra...
...,...,...
7982,image restoration extensively researched topic...,summarize: image denoising problem researched ...
7983,present neural encoder decoder model convert i...,summarize: optical character recognition ocr c...
7984,ability detect pedestrians moving objects cruc...,summarize: autonomous cars currently primed ma...
7985,goal paper use multi task learning efficiently...,summarize: slot filling models useful method s...


In [4]:
# Token limits passed to model.train as target_max_token_len / source_max_token_len.
# Previous setting: ABSTRACT_MAX = 170, TEXT_MAX = 3050
ABSTRACT_MAX, TEXT_MAX = 200, 2800

In [19]:
# Load the tokenizer for the 'google/t5-efficient-small' checkpoint.
# NOTE(review): the token-length cell placed earlier in the notebook already
# calls `tokeniser`, so on a fresh kernel this cell has to run before it.
tokeniser = AutoTokenizer.from_pretrained('google/t5-efficient-small') 
# takes apostrophes into account

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

In [54]:
# Show how the first abstract is split into tokens: the count, each id with
# its decoded piece, and finally the whole sequence decoded back to text.
text = df.at[0, "target_text"]
token_ids = tokeniser(text)["input_ids"]
print(len(token_ids))
for i in token_ids:
    print(i,tokeniser.decode(i))
tokeniser.decode(token_ids)

100
6418 tracking
11336 developments
1385 highly
4896 dynamic
331 data
748 technology
3283 landscape
3362 vital
2627 keeping
3714 novel
2896 technologies
1339 tools
796 various
844 areas
7353 artificial
6123 intelligence
3 
9 a
23 i
1256 difficult
1463 track
2193 relevant
748 technology
12545 keywords
1040 paper
4230 propose
3714 novel
7181 addresses
682 problem
1464 tool
261 used
3269 automatically
8432 detect
6831 existence
126 new
2896 technologies
1339 tools
1499 text
5819 extract
1353 terms
261 used
126 new
2896 technologies
21527 extracted
126 new
1353 terms
3 
13880 logged
126 new
3 
9 a
23 i
2896 technologies
3971 fly
765 web
3 
14064 subsequently
12910 classified
2193 relevant
27632 semantic
11241 labels
3 
9 a
23 i
3303 domain
7 s
4382 proposed
1464 tool
3 
390 based
1726 stage
1990 cas
658 ca
26 d
53 ing
825 model
1726 stage
853 class
15821 ifies
7142 sentence
2579 contains
748 technology
1657 term
511 second
1726 stage
3 
8826 identifie
7 s
748 technology
15693 keyword
7142

'tracking developments highly dynamic data technology landscape vital keeping novel technologies tools various areas artificial intelligence ai difficult track relevant technology keywords paper propose novel addresses problem tool used automatically detect existence new technologies tools text extract terms used new technologies extracted new terms logged new ai technologies fly web subsequently classified relevant semantic labels ai domains proposed tool based stage cascading model stage classifies sentence contains technology term second stage identifies technology keyword sentence obtain competitive accuracy tasks sentence classification text identification</s>'

In [32]:
# rows = []
# t_max= 2524
# a_max=  118
# t_min=  668
# a_min=  39
# for i, row in df.iterrows():
#     t_word = len(row["text"].split())
#     a_word = len(row["abstract"].split())
#     if ((t_word > t_max or t_word < t_min) or (a_word > a_max or a_word < a_min)):
#         rows.append(i)

# df.drop(rows, inplace=True)
# df.reset_index(drop=True, inplace=True)
# df.to_json(DATASET+"SSN/SSN_Dataset_CompSci_Short_Clean_HalfStop_180.json")
# df

Convert DF to text

In [25]:
# text_txt = df['text']
# #text_txt = text_txt.str.removeprefix("summarize: ")
# text_txt.to_csv("text_full_cnn.txt", header=False,index=False)
# # write to file. 

In [26]:
# vocab_model = fasttext.train_unsupervised('text_full_cnn.txt', minn=2, epoch=10)
# vocab_model.save_model("fastText_full_NoStop.bin")

In [61]:
# NOTE(review): in the visible part of this notebook `vocab_model` is only
# assigned inside a commented-out fastText training cell, so this cell relies
# on kernel state from an earlier run.
vocab_model.get_word_vector('angle')

array([ 0.05246937, -0.06849538, -0.42895123,  0.25144276, -0.02151984,
        0.05849433,  0.22607356, -0.2634456 , -0.04034835,  0.09165015,
        0.23875414,  0.23043832,  0.335529  , -0.125991  , -0.23940863,
       -0.31306407,  0.2726784 , -0.19945174,  0.09254331, -0.26865605,
        0.1550692 ,  0.06794629, -0.12334667, -0.27257252, -0.63276577,
       -0.00525512,  0.22396798,  0.46203488,  0.38846275,  0.4881754 ,
       -0.33606574, -0.09112427,  0.25175312, -0.6451501 , -0.12379882,
       -0.27533957, -0.48192084,  0.24728948,  0.35500985,  0.16617337,
       -0.14461681, -0.455554  , -0.2539027 , -0.03580671,  0.27836385,
       -0.20429875,  0.16872005, -0.24842629,  0.55424565,  0.37504348,
       -0.12348666, -0.17431735, -0.02775216,  0.21767178, -0.32350695,
        0.27022526,  0.169276  , -0.02366772,  0.0065518 , -0.39988786,
        0.6083963 ,  0.13710539,  0.3647993 , -0.22767365,  0.21434541,
       -0.06064311,  0.19077592, -0.6789286 ,  0.12426411, -0.13

In [60]:
# NOTE(review): `vocab_model` is only assigned inside a commented-out cell in
# the visible part of this notebook — it must exist in the kernel beforehand.
vocab_model.get_nearest_neighbors('angle')

[(0.9149702191352844, 'angles'),
 (0.7963276505470276, 'azimuthly'),
 (0.790732204914093, 'radian'),
 (0.7903612852096558, "angle's"),
 (0.7893282771110535, 'radians'),
 (0.7763140797615051, 'azimuth'),
 (0.7654001116752625, 'azimuthes'),
 (0.7620179653167725, 'azimuthial'),
 (0.738439679145813, 'plane'),
 (0.7361207604408264, 'altazimuth')]

# Dataloader

Split dataset

In [5]:
# Hold out 30% of the rows for testing; the other 70% continue below.
train_data, test_data = train_test_split(
    df,
    train_size=0.7,
    random_state=seed,
)
# Split the remaining rows 80/20 into training and validation sets.
train_data, valid_data = train_test_split(
    train_data,
    train_size=0.8,
    random_state=seed,
)

In [64]:
# Load the T5 weights stored under outputs/best.
# The GPU is requested only when torch can see one, so the cell also runs on
# a CPU-only machine instead of failing on a hard-coded use_gpu=True.
model = SimpleT5()
model.load_model(model_type="t5", model_dir="outputs/best", use_gpu=torch.cuda.is_available())

In [None]:
# Fine-tune on the training split and evaluate on the validation split.
# Inputs are capped at TEXT_MAX tokens and targets at ABSTRACT_MAX tokens.
# The GPU is used only when torch reports one, rather than a hard-coded
# use_gpu=True that breaks on CPU-only machines.
model.train(train_df=train_data,
        eval_df=valid_data,
        source_max_token_len=TEXT_MAX,
        target_max_token_len=ABSTRACT_MAX,
        batch_size=2,
        max_epochs=8,
        use_gpu=torch.cuda.is_available(),)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 31.2 M
-----------------------------------------------------
31.2 M    Trainable params
0         Non-trainable params
31.2 M    Total params
124.882   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [9]:
# Print the current model configuration, one "key value" pair per line.
model_config = model.model.config.to_dict()
for key, value in model_config.items():
    print(key, value)
print("----------------")
# Options tried previously and left disabled:
# model_config = model.model.config.to_dict()
# model_config["num_heads"] = 8
#model_config["dropout_rate"] = 0.2
# model_config["d_model"] = 512
# Enable sampling; temperature 1 or >1 for more random output.
model_config.update({"temperature": 1, "do_sample": True})
# Rebuild a config object from the edited dict and attach it to the model.
config = T5Config(**model_config)
# model = AutoModelForSeq2SeqLM.from_config(config)
model.model.config = config
model.model.config

vocab_size 32128
d_model 512
d_kv 64
d_ff 2048
num_layers 6
num_decoder_layers 6
num_heads 8
relative_attention_num_buckets 32
dropout_rate 0.1
layer_norm_epsilon 1e-06
initializer_factor 1.0
feed_forward_proj relu
use_cache True
return_dict True
output_hidden_states False
output_attentions False
torchscript False
torch_dtype float32
use_bfloat16 False
pruned_heads {}
tie_word_embeddings True
is_encoder_decoder True
is_decoder False
cross_attention_hidden_size None
add_cross_attention False
tie_encoder_decoder False
max_length 20
min_length 0
do_sample False
early_stopping False
num_beams 1
num_beam_groups 1
diversity_penalty 0.0
temperature 1.0
top_k 50
top_p 1.0
repetition_penalty 1.0
length_penalty 1.0
no_repeat_ngram_size 0
encoder_no_repeat_ngram_size 0
bad_words_ids None
num_return_sequences 1
chunk_size_feed_forward 0
output_scores False
return_dict_in_generate False
forced_bos_token_id None
forced_eos_token_id None
remove_invalid_values False
architectures ['T5ForConditionalGen

T5Config {
  "_name_or_path": "outputs/simplet5-epoch-0-train-loss-2.7644-val-loss-2.4369",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "do_sample": true,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English

In [49]:
# Print the reference abstract and the generated summary for the first
# validation example only.
for i, row in valid_data.head(1).iterrows():
    inputs = row["source_text"]
    output = model.predict(
        inputs,
        length_penalty=10,
        num_beams=25,
        repetition_penalty=2.5,
    )
    print(row["target_text"], "-----------------", output, "", sep="\n")
    
# inputs = valid_data.at[0,"source_text"]
# print(inputs)
# print("")
# print(valid_data.at[0,"target_text"])
# model.device = "cpu"
# model.predict(inputs,max_length=100)

online social networks represent main source communication information exchange today's life facilitate exquisitely news sharing knowledge elicitation forming groups interests researchers decades studied growth dynamics online social networks extensively questing clear understanding behavior humans online social networks helps directions like engineering better recommendation systems attracting new members social networks achieved desired growth example online social networks like myspace orkut friendster service today work present probabilistic theoretical model captures dynamics social decay inactivity members social network model proved interesting mathematical properties imply achieving model optimization reasonable performance means maximization problem approximated factor 1 1 minimization problem achieved polynomial time
-----------------
['paper present empirical analysis social decay dynamics closed stack exchange websites presented model capturing decay dynamics social network