diff --git a/projects/web page summation/.gitignore b/projects/web page summation/.gitignore
new file mode 100644
index 00000000..00c7b95b
--- /dev/null
+++ b/projects/web page summation/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+venv/
+env.bak/
+venv.bak/
+env/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# Editor settings
+.vscode
+settings.json
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Optional npm cache directory
+.npm
+
+# Project artifacts
+.DS_Store
+datasets/
+new_datasets/
+yarn.lock
+app
+mlclassification-darwin-x64
+release-builds
+Classifi
+Summarize.spec
+applog.log
+csv/
+beneficiary.csv
diff --git a/projects/web page summation/README.md b/projects/web page summation/README.md
new file mode 100644
index 00000000..dab9f8ee
--- /dev/null
+++ b/projects/web page summation/README.md
@@ -0,0 +1,84 @@
+# Website Summarization API
+
+This project builds a machine learning tool for summarising a website from its URL.
+
+## Getting Started
+
+These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
+
+
+### Prerequisites
+
+Python distribution
+
+```
+Anaconda
+```
+
+### Installing
+
+Install the Anaconda Python distribution on your system.
+
+Create a virtual environment called app.
+
+```
+python -m venv app
+```
+
+Activate the virtual environment
+
+```
+LINUX/Mac: source app/bin/activate
+
+Windows: app\Scripts\activate
+```
+
+Upgrade to the latest pip
+
+```
+pip install --upgrade pip
+```
+
+Install dependencies using the requirements file
+
+```
+pip install -r requirements.txt
+```
+**Note: Your virtual environment must always be activated before running any command**
+
+## Deployment
+
+Start the app (make sure to pass the URL of an existing website).
+
+Examples of valid commands:
+
+```
+python app.py simple --url https://facebook.com --sentence 1 --language english
+python app.py simple --url https://facebook.com
+python app.py simple --url https://korapay.com
+python app.py bulk --path ./csv/valid_websites.csv
+```
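+### Bulk CSV format
+
+A minimal sketch of the CSV layout that `bulk` mode appears to expect: the loader looks up a `website` column in the header row and appends a `summary` column to each processed row. The file name and URLs below are only illustrative.
+
+```
+name,website
+Facebook,https://facebook.com
+Korapay,https://korapay.com
+```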
+### APIs
+
+These are the command options in full:
+
+```
+A command line utility for website summarization.
+-----------------------------------------------
+These are common commands for this app.
+
+positional arguments:
+  action                This has to be 'simple' (summarize a single URL) or
+                        'bulk' (summarize every website in a CSV file)
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --url URL             a link to the website to be summarised
+  --sentence SENTENCE   number of sentences in the summary (default: 2)
+  --language LANGUAGE   language of the summary (default: english)
+  --path PATH           path to a CSV file for bulk mode
+```
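+### Using the summarizer from Python
+
+The summarizer can also be called directly from Python through `utils/summarize.py`; a minimal sketch (the URL is only an example):
+
+```
+from utils.summarize import summarize
+
+summary = summarize(url='https://korapay.com', LANGUAGE='english', SENTENCES_COUNT=2)
+print(summary)
+```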
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE.md) file for details
diff --git a/projects/web page summation/app.py b/projects/web page summation/app.py
new file mode 100644
index 00000000..94023696
--- /dev/null
+++ b/projects/web page summation/app.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python
+from utils.summarize import summarize
+import csv
+import shutil
+import os
+import textwrap
+import logging
+import argparse
+import sys
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent('''\
+            A command line utility for website summarization.
+            -----------------------------------------------
+            These are common commands for this app.'''))
+    parser.add_argument(
+        'action',
+        help="This has to be 'simple' or 'bulk'")
+    parser.add_argument(
+        '--url',
+        help='A link to the website to be summarised')
+    parser.add_argument(
+        '--sentence',
+        help='Number of sentences in the summary',
+        type=int,
+        default=2)
+    parser.add_argument(
+        '--language',
+        help='Language of the summary',
+        default='english')
+    parser.add_argument(
+        '--path',
+        help='Path to a csv file for bulk summarization')
+
+    return parser.parse_args(argv[1:])
+
+
+def readCsv(path):
+    print('\n\n Processing Csv file \n\n')
+    sys.stdout.flush()
+    data = []
+    try:
+        with open(path, 'r') as userFile:
+            userFileReader = csv.reader(userFile)
+            for row in userFileReader:
+                data.append(row)
+    except UnicodeDecodeError:
+        # Fall back to the Windows ANSI code page for files exported from Excel.
+        with open(path, 'r', encoding="mbcs") as userFile:
+            userFileReader = csv.reader(userFile)
+            for row in userFileReader:
+                data.append(row)
+    return data
+
+
+def writeCsv(data, LANGUAGE, SENTENCES_COUNT):
+    print('\n\n Updating Csv file \n\n')
+    sys.stdout.flush()
+    with open('beneficiary.csv', 'w', newline='') as newFile:
+        newFileWriter = csv.writer(newFile)
+        length = len(data)
+        position = data[0].index('website')
+        for i in range(1, length):
+            if i == 1:
+                _data = data[0]
+                _data.append("summary")
+                newFileWriter.writerow(_data)
+            try:
+                __data = data[i]
+                summary = summarize(
+                    (data[i][position]), LANGUAGE, SENTENCES_COUNT)
+                __data.append(summary)
+                newFileWriter.writerow(__data)
+            except Exception:
+                print('\n\n Error, skipping line \n\n')
+                sys.stdout.flush()
+
+
+def processCsv(path, LANGUAGE, SENTENCES_COUNT):
+    try:
+        print('\n\n Processing started \n\n')
+        sys.stdout.flush()
+        data = readCsv(path)
+        writeCsv(data, LANGUAGE, SENTENCES_COUNT)
+    except Exception:
+        print('\n\n Invalid file in file path \n\n')
+        sys.stdout.flush()
+
+
+def main(argv=sys.argv):
+    # Configure logging
+    logging.basicConfig(filename='applog.log',
+                        filemode='w',
+                        level=logging.INFO,
+                        format='%(levelname)s:%(message)s')
+    args = parse_args(argv)
+    action = args.action
+    url = args.url
+    path = args.path
+    LANGUAGE = "english" if args.language is None else args.language
+    SENTENCES_COUNT = 2 if args.sentence is None else args.sentence
+    if action == 'bulk':
+        if path is None:
+            print(
+                '\n\n Invalid entry! Please ensure you enter a valid file path \n\n')
+            sys.stdout.flush()
+            return
+        # guard against errors
+        try:
+            processCsv(path, LANGUAGE, SENTENCES_COUNT)
+        except Exception:
+            print(
+                '\n\n Invalid entry! Please ensure you enter a valid file path \n\n')
+            sys.stdout.flush()
+            return
+        print('Completed')
+        sys.stdout.flush()
+        if os.path.isfile('beneficiary.csv'):
+            return shutil.move('beneficiary.csv', path)
+        return
+    if action == 'simple':
+        # guard against errors
+        try:
+            summary = summarize(url, LANGUAGE, SENTENCES_COUNT)
+        except Exception:
+            print(
+                '\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
+            sys.stdout.flush()
+            return
+        print('Completed')
+        sys.stdout.flush()
+    else:
+        print(
+            '\nAction command is not supported.\nFor help, run: python3 app.py -h')
+        sys.stdout.flush()
+        return
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/projects/web page summation/requirements.txt b/projects/web page summation/requirements.txt
new file mode 100644
index 00000000..dffb29c5
--- /dev/null
+++ b/projects/web page summation/requirements.txt
@@ -0,0 +1,5 @@
+sumy
+
+nltk
+numpy
+argparse
diff --git a/projects/web page summation/utils/__init__.py b/projects/web page summation/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/web page summation/utils/comparison.py b/projects/web page summation/utils/comparison.py
new file mode 100644
index 00000000..322f9739
--- /dev/null
+++ b/projects/web page summation/utils/comparison.py
@@ -0,0 +1,50 @@
+
+# https://github.com/chakki-works/sumeval
+# https://github.com/Tian312/awesome-text-summarization
+
+from sumeval.metrics.rouge import RougeCalculator
+from sumeval.metrics.bleu import BLEUCalculator
+
+
+def eval_rouges(reference_summary, model_summary):
+    # reference_summary = "tokyo shares close up #.## percent"
+    # model_summary = "tokyo stocks close up # percent to fresh record high"
+
+    rouge = RougeCalculator(stopwords=True, lang="en")
+
+    rouge_1 = rouge.rouge_n(
+        summary=model_summary,
+        references=[reference_summary],
+        n=1)
+
+    rouge_2 = rouge.rouge_n(
+        summary=model_summary,
+        references=[reference_summary],
+        n=2)
+
+    rouge_l = rouge.rouge_l(
+        summary=model_summary,
+        references=[reference_summary])
+
+    # You need spaCy to calculate ROUGE-BE
+    rouge_be = rouge.rouge_be(
+        summary=model_summary,
+        references=[reference_summary])
+
+    bleu = BLEUCalculator()
+    bleu_score = bleu.bleu(summary=model_summary,
+                           references=[reference_summary])
+
+    return rouge_1, rouge_2, rouge_l, rouge_be, bleu_score
+
+
+# Example:
+# rouge_1, rouge_2, rouge_l, rouge_be, bleu_score = eval_rouges(
+#     "tokyo shares close up #.## percent",
+#     "tokyo stocks close up # percent to fresh record high")
+#
+# print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
+#     rouge_1, rouge_2, rouge_l, rouge_be
+# ).replace(", ", "\n"))
diff --git a/projects/web page summation/utils/model.py b/projects/web page summation/utils/model.py
new file mode 100644
index 00000000..b3b7d508
--- /dev/null
+++ b/projects/web page summation/utils/model.py
@@ -0,0 +1,131 @@
+import tensorflow as tf
+from tensorflow.contrib import rnn
+# get_init_embedding is needed when args.glove is set; this assumes the script
+# is run from inside the utils/ folder (as the other scripts here do).
+from utils import get_init_embedding
+
+
+class Model(object):
+    def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
+        self.vocabulary_size = len(reversed_dict)
+        self.embedding_size = args.embedding_size
+        self.num_hidden = args.num_hidden
+        self.num_layers = args.num_layers
+        self.learning_rate = args.learning_rate
+        self.beam_width = args.beam_width
+        if not forward_only:
+            self.keep_prob = args.keep_prob
+        else:
+            self.keep_prob = 1.0
+        self.cell = tf.nn.rnn_cell.BasicLSTMCell
+        with tf.variable_scope("decoder/projection"):
+            self.projection_layer = tf.layers.Dense(
+                self.vocabulary_size, use_bias=False)
+
+        self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
+        self.X = tf.placeholder(tf.int32, [None, article_max_len])
+        self.X_len = tf.placeholder(tf.int32, [None])
+        self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
+        self.decoder_len = tf.placeholder(tf.int32, [None])
+        self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
+        self.global_step = tf.Variable(0, trainable=False)
+
+        with tf.name_scope("embedding"):
+            if not forward_only and args.glove:
+                init_embeddings = tf.constant(get_init_embedding(
+                    reversed_dict, self.embedding_size), dtype=tf.float32)
+            else:
+                init_embeddings = tf.random_uniform(
+                    [self.vocabulary_size, self.embedding_size], -1.0, 1.0)
+            self.embeddings = tf.get_variable(
+                "embeddings", initializer=init_embeddings)
+            self.encoder_emb_inp = tf.transpose(
+                tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
+            self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(
+                self.embeddings, self.decoder_input), perm=[1, 0, 2])
+
+        with tf.name_scope("encoder"):
+            fw_cells = [self.cell(self.num_hidden)
+                        for _ in range(self.num_layers)]
+            bw_cells = [self.cell(self.num_hidden)
+                        for _ in range(self.num_layers)]
+            fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
+            bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]
+
+            encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
+                fw_cells, bw_cells, self.encoder_emb_inp,
+                sequence_length=self.X_len, time_major=True, dtype=tf.float32)
+            self.encoder_output = tf.concat(encoder_outputs, 2)
+            encoder_state_c = tf.concat(
+                (encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
+            encoder_state_h = tf.concat(
+                (encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
+            self.encoder_state = rnn.LSTMStateTuple(
+                c=encoder_state_c, h=encoder_state_h)
+
+        with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
+            decoder_cell = self.cell(self.num_hidden * 2)
+
+            if not forward_only:
+                attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
+                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
+                    self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
+                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
+                                                                   attention_layer_size=self.num_hidden * 2)
+                initial_state = decoder_cell.zero_state(
+                    dtype=tf.float32, batch_size=self.batch_size)
+                initial_state = initial_state.clone(
+                    cell_state=self.encoder_state)
+                helper = tf.contrib.seq2seq.TrainingHelper(
+                    self.decoder_emb_inp, self.decoder_len, time_major=True)
+                decoder = tf.contrib.seq2seq.BasicDecoder(
+                    decoder_cell, helper, initial_state)
+                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
+                    decoder, output_time_major=True, scope=decoder_scope)
+                self.decoder_output = outputs.rnn_output
+                self.logits = tf.transpose(
+                    self.projection_layer(self.decoder_output), perm=[1, 0, 2])
+                self.logits_reshape = tf.concat(
+                    [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
+            else:
+                tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
+                    tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
+                tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+                    self.encoder_state, multiplier=self.beam_width)
+                tiled_seq_len = tf.contrib.seq2seq.tile_batch(
+                    self.X_len, multiplier=self.beam_width)
+                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
+                    self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
+                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
+                                                                   attention_layer_size=self.num_hidden * 2)
+                initial_state = decoder_cell.zero_state(
+                    dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
+                initial_state = initial_state.clone(
+                    cell_state=tiled_encoder_final_state)
+                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
+                    cell=decoder_cell,
+                    embedding=self.embeddings,
+                    start_tokens=tf.fill([self.batch_size], tf.constant(2)),
+                    end_token=tf.constant(3),
+                    initial_state=initial_state,
+                    beam_width=self.beam_width,
+                    output_layer=self.projection_layer
+                )
+                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
+                    decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
+                self.prediction = tf.transpose(
+                    outputs.predicted_ids, perm=[1, 2, 0])
+
+        with tf.name_scope("loss"):
+            if not forward_only:
+                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
+                    logits=self.logits_reshape, labels=self.decoder_target)
+                weights = tf.sequence_mask(
+                    self.decoder_len, summary_max_len, dtype=tf.float32)
+                self.loss = tf.reduce_sum(
+                    crossent * weights / tf.to_float(self.batch_size))
+
+                params = tf.trainable_variables()
+                gradients = tf.gradients(self.loss, params)
+                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
+                optimizer = tf.train.AdamOptimizer(self.learning_rate)
+                self.update = optimizer.apply_gradients(
+                    zip(clipped_gradients, params), global_step=self.global_step)
diff --git a/projects/web page summation/utils/prepare.py b/projects/web page summation/utils/prepare.py
new file mode 100644
index 00000000..12854503
--- /dev/null
+++ b/projects/web page summation/utils/prepare.py
@@ -0,0 +1,39 @@
+
+import wget
+import os
+import tarfile
+import gzip
+import zipfile
+import argparse
+
+
+#parser = argparse.ArgumentParser()
+#parser.add_argument("--glove", action="store_true")
+#args = parser.parse_args()
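+# Assumed usage (a sketch, not part of the original script): place the
+# compressed dataset files train.article.txt.gz and train.title.txt.gz under
+# sumdata/train/ next to this script, then run it from the utils/ folder:
+#
+#     python prepare.py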
+# Assumed project-relative data directory; adjust if the sumdata files live elsewhere.
+default_path = "./"
+
+# Extract data file
+#with tarfile.open(default_path + "sumdata/train/summary.tar.gz", "r:gz") as tar:
+#    tar.extractall()
+
+with gzip.open(default_path + "sumdata/train/train.article.txt.gz", "rb") as gz:
+    with open(default_path + "sumdata/train/train.article.txt", "wb") as out:
+        out.write(gz.read())
+
+with gzip.open(default_path + "sumdata/train/train.title.txt.gz", "rb") as gz:
+    with open(default_path + "sumdata/train/train.title.txt", "wb") as out:
+        out.write(gz.read())
+
+
+#if args.glove:
+#    glove_dir = "glove"
+#    glove_url = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"
+#
+#    if not os.path.exists(glove_dir):
+#        os.mkdir(glove_dir)
+#
+#    # Download glove vector
+#    wget.download(glove_url, out=glove_dir)
+#
+#    # Extract glove file
+#    with zipfile.ZipFile(os.path.join("glove", "glove.42B.300d.zip"), "r") as z:
+#        z.extractall(glove_dir)
\ No newline at end of file
diff --git a/projects/web page summation/utils/summarize.py b/projects/web page summation/utils/summarize.py
new file mode 100644
index 00000000..74243944
--- /dev/null
+++ b/projects/web page summation/utils/summarize.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# load dependencies
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+from sumy.parsers.html import HtmlParser
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.utils import get_stop_words
+import sys
+
+
+def summarize(url=None, LANGUAGE='english', SENTENCES_COUNT=2):
+    try:
+        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
+        # or for plain text files
+        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
+        stemmer = Stemmer(LANGUAGE)
+
+        summarizer = Summarizer(stemmer)
+        summarizer.stop_words = get_stop_words(LANGUAGE)
+        result = ''
+        for sentence in summarizer(parser.document, SENTENCES_COUNT):
+            result = result + ' ' + str(sentence)
+    except Exception:
+        print(
+            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
+        sys.stdout.flush()
+        return (
+            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
+    print('\n\n' + str(url) + '\n\n' + str(result))
+    sys.stdout.flush()
+    return result
diff --git a/projects/web page summation/utils/test.py b/projects/web page summation/utils/test.py
new file mode 100644
index 00000000..bfcdca62
--- /dev/null
+++ b/projects/web page summation/utils/test.py
@@ -0,0 +1,79 @@
+import tensorflow as tf
+import pickle
+# These imports are needed at runtime; they assume the script is run from
+# inside the utils/ folder.
+from model import Model
+from utils import build_dict, build_dataset, batch_iter, get_text_list
+
+
+# with open("args.pickle", "rb") as f:
+#     args = pickle.load(f)
+
+tf.reset_default_graph()
+
+# Assumed data locations (adjust to where prepare.py extracted the dataset);
+# the validation file names below are illustrative placeholders.
+default_path = "./"
+valid_article_path = default_path + "sumdata/train/valid.article.filter.txt"
+valid_title_path = default_path + "sumdata/train/valid.title.filter.txt"
+
+
+class args:
+    pass
+
+
+# Hyperparameters mirroring the commented-out argparse defaults in train.py
+args.num_hidden = 150
+args.num_layers = 2
+args.beam_width = 10
+args.glove = "store_true"
+args.embedding_size = 300
+
+args.learning_rate = 1e-3
+args.batch_size = 64
+args.num_epochs = 10
+args.keep_prob = 0.8
+
+args.toy = True
+
+args.with_model = "store_true"
+
+
+print("Loading dictionary...")
+word_dict, reversed_dict, article_max_len, summary_max_len = build_dict(
+    "valid", args.toy)
+print("Loading validation dataset...")
+valid_x = build_dataset(
+    "valid", word_dict, article_max_len, summary_max_len, args.toy)
+valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]
+print("Loading article and reference...")
+article = get_text_list(valid_article_path, args.toy)
+reference = get_text_list(valid_title_path, args.toy)
+
+with tf.Session() as sess:
+    print("Loading saved model...")
+    model = Model(reversed_dict, article_max_len,
+                  summary_max_len, args, forward_only=True)
+    saver = tf.train.Saver(tf.global_variables())
+    ckpt = tf.train.get_checkpoint_state(default_path + "saved_model/")
+    saver.restore(sess, ckpt.model_checkpoint_path)
+
+    batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)
+
+    print("Writing summaries to 'result.txt'...")
+    for batch_x, _ in batches:
+        batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]
+
+        valid_feed_dict = {
+            model.batch_size: len(batch_x),
+            model.X: batch_x,
+            model.X_len: batch_x_len,
+        }
+
+        prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
+        prediction_output = [[reversed_dict[y]
+                              for y in x] for x in prediction[:, 0, :]]
+        summary_array = []
+        with open(default_path + "result.txt", "a") as f:
+            for line in prediction_output:
+                summary = list()
+                for word in line:
+                    if word == "</s>":
+                        break
+                    if word not in summary:
+                        summary.append(word)
+                summary_array.append(" ".join(summary))
+                print(" ".join(summary), file=f)
+
+    print('Summaries have been generated')
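+# Optional scoring sketch (assumed wiring, not part of the original script):
+# the summaries written to result.txt could be scored against the reference
+# titles with utils/comparison.py, which requires the `sumeval` package.
+#
+# from comparison import eval_rouges
+# with open(default_path + "result.txt") as f:
+#     hypotheses = [line.strip() for line in f]
+# for ref, hyp in zip(reference, hypotheses):
+#     rouge_1, rouge_2, rouge_l, rouge_be, bleu_score = eval_rouges(ref, hyp)
+#     print(rouge_1, rouge_2, rouge_l, rouge_be, bleu_score)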
diff --git a/projects/web page summation/utils/train.py b/projects/web page summation/utils/train.py
new file mode 100644
index 00000000..d1a8d32f
--- /dev/null
+++ b/projects/web page summation/utils/train.py
@@ -0,0 +1,126 @@
+import os
+import pickle
+import argparse
+import tensorflow as tf
+import time
+# These imports are needed at runtime; they assume the script is run from
+# inside the utils/ folder.
+from model import Model
+from utils import build_dict, build_dataset, batch_iter
+
+start = time.perf_counter()
+
+# Assumed project-relative path for the extracted data, the word_dict pickle
+# and the saved_model directory.
+default_path = "./"
+
+# Uncomment next 2 lines to suppress error and Tensorflow info verbosity. Or change logging levels
+# tf.logging.set_verbosity(tf.logging.FATAL)
+# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+# def add_arguments(parser):
+#     parser.add_argument("--num_hidden", type=int, default=150, help="Network size.")
+#     parser.add_argument("--num_layers", type=int, default=2, help="Network depth.")
+#     parser.add_argument("--beam_width", type=int, default=10, help="Beam width for beam search decoder.")
+#     parser.add_argument("--glove", action="store_true", help="Use glove as initial word embedding.")
+#     parser.add_argument("--embedding_size", type=int, default=300, help="Word embedding size.")
+#
+#     parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate.")
+#     parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
+#     parser.add_argument("--num_epochs", type=int, default=10, help="Number of epochs.")
+#     parser.add_argument("--keep_prob", type=float, default=0.8, help="Dropout keep prob.")
+#
+#     parser.add_argument("--toy", action="store_true", help="Use only 50K samples of data")
+#
+#     parser.add_argument("--with_model", action="store_true", help="Continue from previously saved model")
+
+
+class args:
+    pass
+
+
+# Hyperparameters mirroring the commented-out argparse defaults above.
+# Note: args.glove is a non-empty string, so GloVe initialisation is always attempted.
+args.num_hidden = 150
+args.num_layers = 2
+args.beam_width = 10
+args.glove = "store_true"
+args.embedding_size = 300
+
+args.learning_rate = 1e-3
+args.batch_size = 64
+args.num_epochs = 10
+args.keep_prob = 0.8
+
+args.toy = False  # "store_true"
+
+args.with_model = "store_true"
+
+
+#parser = argparse.ArgumentParser()
+# add_arguments(parser)
+#args = parser.parse_args()
+# with open("args.pickle", "wb") as f:
+#     pickle.dump(args, f)
+
+if not os.path.exists(default_path + "saved_model"):
+    os.mkdir(default_path + "saved_model")
+else:
+    # if args.with_model:
+    with open(default_path + 'saved_model/checkpoint', 'r') as checkpoint_file:
+        old_model_checkpoint_path = "".join(
+            [default_path + "saved_model/", checkpoint_file.read().splitlines()[0].split('"')[1]])
+
+
+print("Building dictionary...")
+word_dict, reversed_dict, article_max_len, summary_max_len = build_dict(
+    "train", args.toy)
+print("Loading training dataset...")
+train_x, train_y = build_dataset(
+    "train", word_dict, article_max_len, summary_max_len, args.toy)
+
+tf.reset_default_graph()
+
+with tf.Session() as sess:
+    model = Model(reversed_dict, article_max_len, summary_max_len, args)
+    sess.run(tf.global_variables_initializer())
+    saver = tf.train.Saver(tf.global_variables())
+    if 'old_model_checkpoint_path' in globals():
+        print("Continuing from previous trained model:",
+              old_model_checkpoint_path, "...")
+        saver.restore(sess, old_model_checkpoint_path)
+
+    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
+    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1
+
+    print("\nIteration starts.")
+    print("Number of batches per epoch :", num_batches_per_epoch)
+    for batch_x, batch_y in batches:
+        batch_x_len = list(
+            map(lambda x: len([y for y in x if y != 0]), batch_x))
+        batch_decoder_input = list(
+            map(lambda x: [word_dict["<s>"]] + list(x), batch_y))
+        batch_decoder_len = list(
+            map(lambda x: len([y for y in x if y != 0]), batch_decoder_input))
+        batch_decoder_output = list(
+            map(lambda x: list(x) + [word_dict["</s>"]], batch_y))
+
+        batch_decoder_input = list(
+            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_input))
+        batch_decoder_output = list(
+            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_output))
+
+        train_feed_dict = {
+            model.batch_size: len(batch_x),
+            model.X: batch_x,
+            model.X_len: batch_x_len,
+            model.decoder_input: batch_decoder_input,
+            model.decoder_len: batch_decoder_len,
+            model.decoder_target: batch_decoder_output
+        }
+
+        _, step, loss = sess.run(
+            [model.update, model.global_step, model.loss], feed_dict=train_feed_dict)
+
+        if step % 1000 == 0:
+            print("step {0}: loss = {1}".format(step, loss))
+
+        if step % num_batches_per_epoch == 0:
+            hours, rem = divmod(time.perf_counter() - start, 3600)
+            minutes, seconds = divmod(rem, 60)
+            saver.save(sess, default_path +
+                       "saved_model/model.ckpt", global_step=step)
+            print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
+                  "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds), "\n")
diff --git a/projects/web page summation/utils/utils.py b/projects/web page summation/utils/utils.py
new file mode 100644
index 00000000..86c492ab
--- /dev/null
+++ b/projects/web page summation/utils/utils.py
@@ -0,0 +1,116 @@
+import re
+import collections
+import pickle
+import numpy as np
+from newspaper import Article
+from nltk.tokenize import word_tokenize
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.test.utils import get_tmpfile
+from gensim.scripts.glove2word2vec import glove2word2vec
+
+
+def clean_str(sentence):
+    sentence = re.sub("[#.]+", "#", sentence)
+    return sentence
+
+
+def get_text_list(data_path, toy):
+    with open(data_path, "r", encoding="utf-8") as f:
+        if not toy:
+            return [clean_str(x.strip()) for x in f.readlines()][:200000]
+        else:
+            return [clean_str(x.strip()) for x in f.readlines()][:50]
+
+
+def build_dict(step, toy=False):
+    if step == "train":
+        train_article_list = get_text_list(train_article_path, toy)
+        train_title_list = get_text_list(train_title_path, toy)
+
+        words = list()
+        for sentence in train_article_list + train_title_list:
+            for word in word_tokenize(sentence):
+                words.append(word)
+
+        word_counter = collections.Counter(words).most_common()
+        word_dict = dict()
+        # Special tokens: ids 2 and 3 match the start/end token ids hard-coded in model.py
+        word_dict["<padding>"] = 0
+        word_dict["<unk>"] = 1
+        word_dict["<s>"] = 2
+        word_dict["</s>"] = 3
+        for word, _ in word_counter:
+            word_dict[word] = len(word_dict)
+
+        with open(default_path + "word_dict.pickle", "wb") as f:
+            pickle.dump(word_dict, f)
+
+    elif step == "valid":
+        with open(default_path + "word_dict.pickle", "rb") as f:
+            word_dict = pickle.load(f)
+
+    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))
+
+    article_max_len = 50
+    summary_max_len = 15
+
+    return word_dict, reversed_dict, article_max_len, summary_max_len
+
+
+def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
+    if step == "train":
+        article_list = get_text_list(train_article_path, toy)
+        title_list = get_text_list(train_title_path, toy)
+    elif step == "valid":
+        article_list = get_text_list(valid_article_path, toy)
+    else:
+        raise NotImplementedError
+
+    x = [word_tokenize(d) for d in article_list]
+    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
+    x = [d[:article_max_len] for d in x]
+    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
+
+    if step == "valid":
+        return x
+    else:
+        y = [word_tokenize(d) for d in title_list]
+        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
+        y = [d[:(summary_max_len - 1)] for d in y]
+        return x, y
+
+
+def batch_iter(inputs, outputs, batch_size, num_epochs):
+    inputs = np.array(inputs)
+    outputs = np.array(outputs)
+
+    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
+    for epoch in range(num_epochs):
+        for batch_num in range(num_batches_per_epoch):
+            start_index = batch_num * batch_size
+            end_index = min((batch_num + 1) * batch_size, len(inputs))
+            yield inputs[start_index:end_index], outputs[start_index:end_index]
+
+
+def get_init_embedding(reversed_dict, embedding_size):
+    #glove_file = default_path + "glove/glove.6B.300d.txt"
+    #word2vec_file = get_tmpfile(default_path + "word2vec_format.vec")
+    #glove2word2vec(glove_file, word2vec_file)
+    print("Loading Glove vectors...")
+    #word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)
+
+    with open(default_path + "glove/model_glove_300.pkl", 'rb') as handle:
+        word_vectors = pickle.load(handle)
+
+    word_vec_list = list()
+    for _, word in sorted(reversed_dict.items()):
+        try:
+            word_vec = word_vectors.word_vec(word)
+        except KeyError:
+            word_vec = np.zeros([embedding_size], dtype=np.float32)
+
+        word_vec_list.append(word_vec)
+
+    # Assign random vectors to the <s> and </s> tokens
+    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
+    word_vec_list[3] = np.random.normal(0, 1, embedding_size)
+
+    return np.array(word_vec_list)
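+
+
+# The path constants referenced above (default_path, train_article_path,
+# train_title_path, valid_article_path) are not defined anywhere in this patch.
+# The block below is a hedged sketch of the expected definitions: default_path
+# and the train file names follow prepare.py, while the validation file name is
+# an assumption to adjust to your own data. These would normally sit at the top
+# of the module, but Python resolves module globals at call time, so defining
+# them here still works.
+default_path = "./"
+train_article_path = default_path + "sumdata/train/train.article.txt"
+train_title_path = default_path + "sumdata/train/train.title.txt"
+valid_article_path = default_path + "sumdata/train/valid.article.filter.txt"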