diff --git a/projects/web page summation/.gitignore b/projects/web page summation/.gitignore
new file mode 100644
index 00000000..00c7b95b
--- /dev/null
+++ b/projects/web page summation/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+venv/
+env.bak/
+venv.bak/
+env/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+.vscode
+settings.json
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Optional npm cache directory
+.npm
+
+# macOS
+.DS_Store
+
+# Project artifacts
+datasets/
+new_datasets/
+yarn.lock
+app
+mlclassification-darwin-x64
+release-builds
+Classifi
+Summarize.spec
+applog.log
+csv/
+beneficiary.csv
diff --git a/projects/web page summation/README.md b/projects/web page summation/README.md
new file mode 100644
index 00000000..dab9f8ee
--- /dev/null
+++ b/projects/web page summation/README.md
@@ -0,0 +1,84 @@
+# Website Summarization API
+
+This project builds a machine learning tool for summarizing a web page from its URL.
+
+## Getting Started
+
+These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
+
+
+### Prerequisites
+
+Python distribution
+
+```
+Anaconda
+```
+
+### Installing
+
+Install the Anaconda Python distribution on your system.
+
+Create a virtual environment called app.
+
+```
+python -m venv app
+```
+
+Activate the virtual environment
+
+```
+LINUX/Mac: source app/bin/activate
+
+Windows: app\Scripts\activate
+```
+
+Upgrade to the latest pip
+
+```
+pip install --upgrade pip
+```
+
+Install dependencies using requirements file
+
+```
+pip install -r requirements.txt
+```
+**Note: Your virtual environment must always be activated before running any command**
+
+## Deployment
+
+Start the app (make sure to pass a valid URL to an existing website):
+
+
+Example of valid commands
+
+```
+python app.py simple --url https://facebook.com --sentence 1 --language english
+python app.py simple --url https://facebook.com
+python app.py simple --url https://korapay.com
+python app.py bulk --path ./csv/valid_websites.csv
+```
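+
+For bulk mode, the CSV file must have a header row that contains a `website` column; each website in that column is summarised and the results are written back into the file as an extra `summary` column. A minimal example file might look like this (illustrative values only):
+
+```
+name,website
+Facebook,https://facebook.com
+Korapay,https://korapay.com
+```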
+
+
+### APIs
+
+These are the command options in full:
+
+```
+A command line utility for website Summarization.
+-----------------------------------------------
+These are common commands for this app.
+
+positional arguments:
+  action               This action should be 'simple' or 'bulk'
+
+optional arguments:
+  -h, --help           show this help message and exit
+  --url URL            A link to the website url
+  --sentence SENTENCE  Argument to define the number of sentences for the summary
+  --language LANGUAGE  Argument to define the language of the summary
+  --path PATH          path to the csv file (bulk mode)
+```
+
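+The summarizer can also be called directly from Python. A minimal sketch (it assumes this repository's `utils` package and the dependencies from `requirements.txt` are installed):
+
+```python
+from utils.summarize import summarize
+
+# Summarize a web page into two English sentences.
+summary = summarize(url='https://korapay.com', LANGUAGE='english', SENTENCES_COUNT=2)
+print(summary)
+```
+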
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE.md) file for details
+
diff --git a/projects/web page summation/app.py b/projects/web page summation/app.py
new file mode 100644
index 00000000..94023696
--- /dev/null
+++ b/projects/web page summation/app.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python
+from utils.summarize import summarize
+import csv
+import shutil
+import os
+import textwrap
+import logging
+import argparse
+import sys
+
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=textwrap.dedent('''\
+ A command line utility for website summarization.
+ -----------------------------------------------
+ These are common commands for this app.'''))
+ parser.add_argument(
+ 'action',
+        help="This action should be 'simple' or 'bulk'")
+ parser.add_argument(
+ '--url',
+ help='A link to the website url'
+ )
+ parser.add_argument(
+ '--sentence',
+        help='Argument to define the number of sentences for the summary',
+ type=int,
+ default=2)
+ parser.add_argument(
+ '--language',
+ help='Argument to define language of the summary',
+        default='english')
+ parser.add_argument(
+ '--path',
+ help='path to csv file')
+
+ return parser.parse_args(argv[1:])
+
+
+def readCsv(path):
+ print('\n\n Processing Csv file \n\n')
+ sys.stdout.flush()
+ data = []
+    try:
+        with open(path, 'r') as userFile:
+            userFileReader = csv.reader(userFile)
+            for row in userFileReader:
+                data.append(row)
+    except UnicodeDecodeError:
+        # Fall back to the Windows ANSI code page ("mbcs"), e.g. for files exported from Excel.
+        with open(path, 'r', encoding="mbcs") as userFile:
+            userFileReader = csv.reader(userFile)
+            for row in userFileReader:
+                data.append(row)
+ return data
+
+
+def writeCsv(data, LANGUAGE, SENTENCES_COUNT):
+ print('\n\n Updating Csv file \n\n')
+ sys.stdout.flush()
+ with open('beneficiary.csv', 'w') as newFile:
+ newFileWriter = csv.writer(newFile)
+ length = len(data)
+        # The input file must have a header row containing a 'website' column.
+        position = data[0].index('website')
+        for i in range(1, length):
+            if i == 1:
+                # Write the header row once, with an extra 'summary' column.
+                _data = data[0]
+                _data.append("summary")
+                newFileWriter.writerow(_data)
+            try:
+                __data = data[i]
+                summary = summarize(
+                    data[i][position], LANGUAGE, SENTENCES_COUNT)
+                __data.append(summary)
+                newFileWriter.writerow(__data)
+            except Exception:
+                print('\n\n Error, skipping line \n\n')
+ sys.stdout.flush()
+
+
+def processCsv(path, LANGUAGE, SENTENCES_COUNT):
+ try:
+        print('\n\n Processing Started \n\n')
+        sys.stdout.flush()
+        data = readCsv(path)
+        writeCsv(data, LANGUAGE, SENTENCES_COUNT)
+    except Exception:
+        print('\n\n Invalid file in file path \n\n')
+ sys.stdout.flush()
+
+
+def main(argv=sys.argv):
+ # Configure logging
+ logging.basicConfig(filename='applog.log',
+ filemode='w',
+ level=logging.INFO,
+ format='%(levelname)s:%(message)s')
+ args = parse_args(argv)
+ action = args.action
+ url = args.url
+ path = args.path
+ LANGUAGE = "english" if args.language is None else args.language
+ SENTENCES_COUNT = 2 if args.sentence is None else args.sentence
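+    # Two actions are supported: 'simple' summarises a single --url,
+    # while 'bulk' summarises every website listed in the csv file at --path.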
+ if action == 'bulk':
+ if path is None:
+            print(
+                '\n\n Invalid entry! Please ensure you enter a valid file path \n\n')
+            sys.stdout.flush()
+            return
+        # guard against errors
+        try:
+            processCsv(path, LANGUAGE, SENTENCES_COUNT)
+        except Exception:
+            print(
+                '\n\n Invalid entry! Please ensure you enter a valid file path \n\n')
+            sys.stdout.flush()
+            return
+ print('Completed')
+ sys.stdout.flush()
+ if os.path.isfile('beneficiary.csv'):
+ return shutil.move('beneficiary.csv', path)
+ return
+ if action == 'simple':
+        # guard against errors
+        try:
+            summary = summarize(url, LANGUAGE, SENTENCES_COUNT)
+        except Exception:
+            print(
+                '\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
+            sys.stdout.flush()
+            return
+ print('Completed')
+ sys.stdout.flush()
+ else:
+ print(
+            '\nAction command is not supported.\nFor help, run: python3 app.py -h'
+ )
+ sys.stdout.flush()
+ return
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/projects/web page summation/requirements.txt b/projects/web page summation/requirements.txt
new file mode 100644
index 00000000..dffb29c5
--- /dev/null
+++ b/projects/web page summation/requirements.txt
@@ -0,0 +1,5 @@
+sumy
+nltk
+numpy
diff --git a/projects/web page summation/utils/__init__.py b/projects/web page summation/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/web page summation/utils/comparison.py b/projects/web page summation/utils/comparison.py
new file mode 100644
index 00000000..322f9739
--- /dev/null
+++ b/projects/web page summation/utils/comparison.py
@@ -0,0 +1,50 @@
+
+# https://github.com/chakki-works/sumeval
+# https://github.com/Tian312/awesome-text-summarization
+
+from sumeval.metrics.rouge import RougeCalculator
+from sumeval.metrics.bleu import BLEUCalculator
+
+
+def eval_rouges(reference_summary, model_summary):
+    # reference_summary = "tokyo shares close up #.## percent"
+    # model_summary = "tokyo stocks close up # percent to fresh record high"
+
+ rouge = RougeCalculator(stopwords=True, lang="en")
+
+    rouge_1 = rouge.rouge_n(
+        summary=model_summary,
+        references=[reference_summary],
+        n=1)
+
+ rouge_2 = rouge.rouge_n(
+ summary=model_summary,
+        references=[reference_summary],
+ n=2)
+
+ rouge_l = rouge.rouge_l(
+ summary=model_summary,
+        references=[reference_summary])
+
+ # You need spaCy to calculate ROUGE-BE
+
+ rouge_be = rouge.rouge_be(
+ summary=model_summary,
+        references=[reference_summary])
+
+ bleu = BLEUCalculator()
+ bleu_score = bleu.bleu(summary=model_summary,
+                           references=[reference_summary])
+
+ # print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
+ # rouge_1, rouge_2, rouge_l, rouge_be
+ # ).replace(", ", "\n"))
+
+ return rouge_1, rouge_2, rouge_l, rouge_be, bleu_score
+
+# rouge_1, rouge_2, rouge_l, rouge_be, bleu_score = eval_rouges(
+#     "tokyo shares close up #.## percent",
+#     "tokyo stocks close up # percent to fresh record high")
+#
+# print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
+# rouge_1, rouge_2, rouge_l, rouge_be
+# ).replace(", ", "\n"))
diff --git a/projects/web page summation/utils/model.py b/projects/web page summation/utils/model.py
new file mode 100644
index 00000000..b3b7d508
--- /dev/null
+++ b/projects/web page summation/utils/model.py
@@ -0,0 +1,131 @@
+# NOTE: this model uses tf.contrib and tf.placeholder, so it requires TensorFlow 1.x.
+import tensorflow as tf
+from tensorflow.contrib import rnn
+# get_init_embedding (defined in utils.py) is needed when args.glove is set; this
+# import assumes the module is run from inside the utils/ directory, like train.py.
+from utils import get_init_embedding
+
+
+class Model(object):
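+    # Sequence-to-sequence summarization model: a multi-layer bidirectional LSTM
+    # encoder, a Bahdanau-attention LSTM decoder for training, and a beam-search
+    # decoder for inference (forward_only=True).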
+ def __init__(self, reversed_dict, article_max_len, summary_max_len, args, forward_only=False):
+ self.vocabulary_size = len(reversed_dict)
+ self.embedding_size = args.embedding_size
+ self.num_hidden = args.num_hidden
+ self.num_layers = args.num_layers
+ self.learning_rate = args.learning_rate
+ self.beam_width = args.beam_width
+ if not forward_only:
+ self.keep_prob = args.keep_prob
+ else:
+ self.keep_prob = 1.0
+ self.cell = tf.nn.rnn_cell.BasicLSTMCell
+ with tf.variable_scope("decoder/projection"):
+ self.projection_layer = tf.layers.Dense(
+ self.vocabulary_size, use_bias=False)
+
+ self.batch_size = tf.placeholder(tf.int32, (), name="batch_size")
+ self.X = tf.placeholder(tf.int32, [None, article_max_len])
+ self.X_len = tf.placeholder(tf.int32, [None])
+ self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len])
+ self.decoder_len = tf.placeholder(tf.int32, [None])
+ self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len])
+ self.global_step = tf.Variable(0, trainable=False)
+
+ with tf.name_scope("embedding"):
+ if not forward_only and args.glove:
+ init_embeddings = tf.constant(get_init_embedding(
+ reversed_dict, self.embedding_size), dtype=tf.float32)
+ else:
+ init_embeddings = tf.random_uniform(
+ [self.vocabulary_size, self.embedding_size], -1.0, 1.0)
+ self.embeddings = tf.get_variable(
+ "embeddings", initializer=init_embeddings)
+ self.encoder_emb_inp = tf.transpose(
+ tf.nn.embedding_lookup(self.embeddings, self.X), perm=[1, 0, 2])
+ self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup(
+ self.embeddings, self.decoder_input), perm=[1, 0, 2])
+
+ with tf.name_scope("encoder"):
+ fw_cells = [self.cell(self.num_hidden)
+ for _ in range(self.num_layers)]
+ bw_cells = [self.cell(self.num_hidden)
+ for _ in range(self.num_layers)]
+ fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells]
+ bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells]
+
+ encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
+ fw_cells, bw_cells, self.encoder_emb_inp,
+ sequence_length=self.X_len, time_major=True, dtype=tf.float32)
+ self.encoder_output = tf.concat(encoder_outputs, 2)
+ encoder_state_c = tf.concat(
+ (encoder_state_fw[0].c, encoder_state_bw[0].c), 1)
+ encoder_state_h = tf.concat(
+ (encoder_state_fw[0].h, encoder_state_bw[0].h), 1)
+ self.encoder_state = rnn.LSTMStateTuple(
+ c=encoder_state_c, h=encoder_state_h)
+
+ with tf.name_scope("decoder"), tf.variable_scope("decoder") as decoder_scope:
+ decoder_cell = self.cell(self.num_hidden * 2)
+
+ if not forward_only:
+ attention_states = tf.transpose(self.encoder_output, [1, 0, 2])
+ attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
+ self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True)
+ decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
+ attention_layer_size=self.num_hidden * 2)
+ initial_state = decoder_cell.zero_state(
+ dtype=tf.float32, batch_size=self.batch_size)
+ initial_state = initial_state.clone(
+ cell_state=self.encoder_state)
+ helper = tf.contrib.seq2seq.TrainingHelper(
+ self.decoder_emb_inp, self.decoder_len, time_major=True)
+ decoder = tf.contrib.seq2seq.BasicDecoder(
+ decoder_cell, helper, initial_state)
+ outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
+ decoder, output_time_major=True, scope=decoder_scope)
+ self.decoder_output = outputs.rnn_output
+ self.logits = tf.transpose(
+ self.projection_layer(self.decoder_output), perm=[1, 0, 2])
+ self.logits_reshape = tf.concat(
+ [self.logits, tf.zeros([self.batch_size, summary_max_len - tf.shape(self.logits)[1], self.vocabulary_size])], axis=1)
+ else:
+ tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
+ tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width)
+ tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+ self.encoder_state, multiplier=self.beam_width)
+ tiled_seq_len = tf.contrib.seq2seq.tile_batch(
+ self.X_len, multiplier=self.beam_width)
+ attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
+ self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True)
+ decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
+ attention_layer_size=self.num_hidden * 2)
+ initial_state = decoder_cell.zero_state(
+ dtype=tf.float32, batch_size=self.batch_size * self.beam_width)
+ initial_state = initial_state.clone(
+ cell_state=tiled_encoder_final_state)
+ decoder = tf.contrib.seq2seq.BeamSearchDecoder(
+ cell=decoder_cell,
+ embedding=self.embeddings,
+ start_tokens=tf.fill([self.batch_size], tf.constant(2)),
+ end_token=tf.constant(3),
+ initial_state=initial_state,
+ beam_width=self.beam_width,
+ output_layer=self.projection_layer
+ )
+ outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
+ decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope)
+ self.prediction = tf.transpose(
+ outputs.predicted_ids, perm=[1, 2, 0])
+
+ with tf.name_scope("loss"):
+ if not forward_only:
+ crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
+ logits=self.logits_reshape, labels=self.decoder_target)
+ weights = tf.sequence_mask(
+ self.decoder_len, summary_max_len, dtype=tf.float32)
+ self.loss = tf.reduce_sum(
+ crossent * weights / tf.to_float(self.batch_size))
+
+ params = tf.trainable_variables()
+ gradients = tf.gradients(self.loss, params)
+ clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
+ optimizer = tf.train.AdamOptimizer(self.learning_rate)
+ self.update = optimizer.apply_gradients(
+ zip(clipped_gradients, params), global_step=self.global_step)
diff --git a/projects/web page summation/utils/prepare.py b/projects/web page summation/utils/prepare.py
new file mode 100644
index 00000000..12854503
--- /dev/null
+++ b/projects/web page summation/utils/prepare.py
@@ -0,0 +1,39 @@
+
+import wget
+import os
+import tarfile
+import gzip
+import zipfile
+import argparse
+
+
+#parser = argparse.ArgumentParser()
+#parser.add_argument("--glove", action="store_true")
+#args = parser.parse_args()
+
+# Base directory that contains the sumdata/ folder; "./" is an assumed default.
+default_path = "./"
+
+# Extract data file
+#with tarfile.open(default_path + "sumdata/train/summary.tar.gz", "r:gz") as tar:
+# tar.extractall()
+
+with gzip.open(default_path + "sumdata/train/train.article.txt.gz", "rb") as gz:
+ with open(default_path + "sumdata/train/train.article.txt", "wb") as out:
+ out.write(gz.read())
+
+with gzip.open(default_path + "sumdata/train/train.title.txt.gz", "rb") as gz:
+ with open(default_path + "sumdata/train/train.title.txt", "wb") as out:
+ out.write(gz.read())
+
+
+#if args.glove:
+# glove_dir = "glove"
+# glove_url = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"
+#
+# if not os.path.exists(glove_dir):
+# os.mkdir(glove_dir)
+#
+# # Download glove vector
+# wget.download(glove_url, out=glove_dir)
+#
+# # Extract glove file
+# with zipfile.ZipFile(os.path.join("glove", "glove.42B.300d.zip"), "r") as z:
+# z.extractall(glove_dir)
\ No newline at end of file
diff --git a/projects/web page summation/utils/summarize.py b/projects/web page summation/utils/summarize.py
new file mode 100644
index 00000000..74243944
--- /dev/null
+++ b/projects/web page summation/utils/summarize.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# Load dependencies
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+from sumy.parsers.html import HtmlParser
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.utils import get_stop_words
+import sys
+
+
+def summarize(url=None, LANGUAGE='english', SENTENCES_COUNT=2):
+    result = ''
+    try:
+        # Parse the page and rank its sentences with LexRank, keeping the top SENTENCES_COUNT.
+        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
+        # or for plain text files
+        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
+        stemmer = Stemmer(LANGUAGE)
+
+        summarizer = Summarizer(stemmer)
+        summarizer.stop_words = get_stop_words(LANGUAGE)
+        for sentence in summarizer(parser.document, SENTENCES_COUNT):
+            result = result + ' ' + str(sentence)
+    except Exception:
+        print(
+            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
+        sys.stdout.flush()
+        return (
+            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n')
+    print('\n\n' + str(url) + '\n\n' + str(result))
+    sys.stdout.flush()
+    return result
diff --git a/projects/web page summation/utils/test.py b/projects/web page summation/utils/test.py
new file mode 100644
index 00000000..bfcdca62
--- /dev/null
+++ b/projects/web page summation/utils/test.py
@@ -0,0 +1,79 @@
+import tensorflow as tf
+import pickle
+# These helpers live alongside this script; the imports assume it is run from inside the utils/ directory.
+from model import Model
+from utils import build_dict, build_dataset, batch_iter, get_text_list
+
+
+# with open("args.pickle", "rb") as f:
+# args = pickle.load(f)
+
+tf.reset_default_graph()
+
+
+class args:
+ pass
+
+
+args.num_hidden = 150
+args.num_layers = 2
+args.beam_width = 10
+args.glove = "store_true"
+args.embedding_size = 300
+
+args.learning_rate = 1e-3
+args.batch_size = 64
+args.num_epochs = 10
+args.keep_prob = 0.8
+
+args.toy = True
+
+args.with_model = "store_true"
+
+
+print("Loading dictionary...")
+word_dict, reversed_dict, article_max_len, summary_max_len = build_dict(
+ "valid", args.toy)
+print("Loading validation dataset...")
+valid_x = build_dataset(
+ "valid", word_dict, article_max_len, summary_max_len, args.toy)
+valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]
+print("Loading article and reference...")
+article = get_text_list(valid_article_path, args.toy)
+reference = get_text_list(valid_title_path, args.toy)
+
+with tf.Session() as sess:
+ print("Loading saved model...")
+ model = Model(reversed_dict, article_max_len,
+ summary_max_len, args, forward_only=True)
+ saver = tf.train.Saver(tf.global_variables())
+ ckpt = tf.train.get_checkpoint_state(default_path + "saved_model/")
+ saver.restore(sess, ckpt.model_checkpoint_path)
+
+ batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)
+
+ print("Writing summaries to 'result.txt'...")
+ for batch_x, _ in batches:
+ batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]
+
+ valid_feed_dict = {
+ model.batch_size: len(batch_x),
+ model.X: batch_x,
+ model.X_len: batch_x_len,
+ }
+
+ prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
+ prediction_output = [[reversed_dict[y]
+ for y in x] for x in prediction[:, 0, :]]
+ summary_array = []
+ with open(default_path + "result.txt", "a") as f:
+ for line in prediction_output:
+ summary = list()
+ for word in line:
+                    if word == "</s>":
+ break
+ if word not in summary:
+ summary.append(word)
+ summary_array.append(" ".join(summary))
+                print(" ".join(summary), file=f)
+
+ print('Summaries have been generated')
diff --git a/projects/web page summation/utils/train.py b/projects/web page summation/utils/train.py
new file mode 100644
index 00000000..d1a8d32f
--- /dev/null
+++ b/projects/web page summation/utils/train.py
@@ -0,0 +1,126 @@
+import os
+import pickle
+import argparse
+import tensorflow as tf
+import time
+start = time.perf_counter()
+# These helpers live alongside this script; the imports assume it is run from inside the utils/ directory.
+from model import Model
+from utils import build_dict, build_dataset, batch_iter
+
+# Uncomment next 2 lines to suppress error and Tensorflow info verbosity. Or change logging levels
+# tf.logging.set_verbosity(tf.logging.FATAL)
+# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+# def add_arguments(parser):
+# parser.add_argument("--num_hidden", type=int, default=150, help="Network size.")
+# parser.add_argument("--num_layers", type=int, default=2, help="Network depth.")
+# parser.add_argument("--beam_width", type=int, default=10, help="Beam width for beam search decoder.")
+# parser.add_argument("--glove", action="store_true", help="Use glove as initial word embedding.")
+# parser.add_argument("--embedding_size", type=int, default=300, help="Word embedding size.")
+#
+# parser.add_argument("--learning_rate", type=float, default=1e-3, help="Learning rate.")
+# parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
+# parser.add_argument("--num_epochs", type=int, default=10, help="Number of epochs.")
+# parser.add_argument("--keep_prob", type=float, default=0.8, help="Dropout keep prob.")
+#
+# parser.add_argument("--toy", action="store_true", help="Use only 50K samples of data")
+#
+# parser.add_argument("--with_model", action="store_true", help="Continue from previously saved model")
+
+
+class args:
+ pass
+
+
+args.num_hidden = 150
+args.num_layers = 2
+args.beam_width = 10
+args.glove = "store_true"
+args.embedding_size = 300
+
+args.learning_rate = 1e-3
+args.batch_size = 64
+args.num_epochs = 10
+args.keep_prob = 0.8
+
+args.toy = False # "store_true"
+
+args.with_model = "store_true"
+
+
+#parser = argparse.ArgumentParser()
+# add_arguments(parser)
+#args = parser.parse_args()
+# with open("args.pickle", "wb") as f:
+# pickle.dump(args, f)
+
+if not os.path.exists(default_path + "saved_model"):
+ os.mkdir(default_path + "saved_model")
+else:
+    # if args.with_model:
+    # Read the latest checkpoint name from TensorFlow's "checkpoint" index file.
+    with open(default_path + 'saved_model/checkpoint', 'r') as checkpoint_file:
+        old_model_checkpoint_path = "".join(
+            [default_path + "saved_model/",
+             checkpoint_file.read().splitlines()[0].split('"')[1]])
+
+
+print("Building dictionary...")
+word_dict, reversed_dict, article_max_len, summary_max_len = build_dict(
+ "train", args.toy)
+print("Loading training dataset...")
+train_x, train_y = build_dataset(
+ "train", word_dict, article_max_len, summary_max_len, args.toy)
+
+tf.reset_default_graph()
+
+with tf.Session() as sess:
+ model = Model(reversed_dict, article_max_len, summary_max_len, args)
+ sess.run(tf.global_variables_initializer())
+ saver = tf.train.Saver(tf.global_variables())
+ if 'old_model_checkpoint_path' in globals():
+ print("Continuing from previous trained model:",
+ old_model_checkpoint_path, "...")
+ saver.restore(sess, old_model_checkpoint_path)
+
+ batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
+ num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1
+
+ print("\nIteration starts.")
+ print("Number of batches per epoch :", num_batches_per_epoch)
+ for batch_x, batch_y in batches:
+ batch_x_len = list(
+ map(lambda x: len([y for y in x if y != 0]), batch_x))
+        # Decoder input starts with <s>; decoder target ends with </s>; both are padded.
+        batch_decoder_input = list(
+            map(lambda x: [word_dict["<s>"]] + list(x), batch_y))
+        batch_decoder_len = list(
+            map(lambda x: len([y for y in x if y != 0]), batch_decoder_input))
+        batch_decoder_output = list(
+            map(lambda x: list(x) + [word_dict["</s>"]], batch_y))
+
+        batch_decoder_input = list(
+            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_input))
+        batch_decoder_output = list(
+            map(lambda d: d + (summary_max_len - len(d)) * [word_dict["<padding>"]], batch_decoder_output))
+
+ train_feed_dict = {
+ model.batch_size: len(batch_x),
+ model.X: batch_x,
+ model.X_len: batch_x_len,
+ model.decoder_input: batch_decoder_input,
+ model.decoder_len: batch_decoder_len,
+ model.decoder_target: batch_decoder_output
+ }
+
+ _, step, loss = sess.run(
+ [model.update, model.global_step, model.loss], feed_dict=train_feed_dict)
+
+ if step % 1000 == 0:
+ print("step {0}: loss = {1}".format(step, loss))
+
+ if step % num_batches_per_epoch == 0:
+ hours, rem = divmod(time.perf_counter() - start, 3600)
+ minutes, seconds = divmod(rem, 60)
+ saver.save(sess, default_path +
+ "saved_model/model.ckpt", global_step=step)
+ print(" Epoch {0}: Model is saved.".format(step // num_batches_per_epoch),
+ "Elapsed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds), "\n")
diff --git a/projects/web page summation/utils/utils.py b/projects/web page summation/utils/utils.py
new file mode 100644
index 00000000..86c492ab
--- /dev/null
+++ b/projects/web page summation/utils/utils.py
@@ -0,0 +1,116 @@
+import re
+import collections
+import pickle
+import numpy as np
+from newspaper import Article
+from nltk.tokenize import word_tokenize
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.test.utils import get_tmpfile
+from gensim.scripts.glove2word2vec import glove2word2vec
+
+# Assumed dataset layout (the sumdata/ folder produced by prepare.py); adjust to your setup.
+default_path = "./"
+train_article_path = default_path + "sumdata/train/train.article.txt"
+train_title_path = default_path + "sumdata/train/train.title.txt"
+valid_article_path = default_path + "sumdata/train/valid.article.filter.txt"
+valid_title_path = default_path + "sumdata/train/valid.title.filter.txt"
+
+def clean_str(sentence):
+ sentence = re.sub("[#.]+", "#", sentence)
+ return sentence
+
+
+def get_text_list(data_path, toy):
+ with open(data_path, "r", encoding="utf-8") as f:
+ if not toy:
+ return [clean_str(x.strip()) for x in f.readlines()][:200000]
+ else:
+ return [clean_str(x.strip()) for x in f.readlines()][:50]
+
+
+def build_dict(step, toy=False):
+ if step == "train":
+ train_article_list = get_text_list(train_article_path, toy)
+ train_title_list = get_text_list(train_title_path, toy)
+
+ words = list()
+ for sentence in train_article_list + train_title_list:
+ for word in word_tokenize(sentence):
+ words.append(word)
+
+ word_counter = collections.Counter(words).most_common()
+ word_dict = dict()
+ word_dict[""] = 0
+ word_dict[""] = 1
+ word_dict[""] = 2
+ word_dict[""] = 3
+ for word, _ in word_counter:
+ word_dict[word] = len(word_dict)
+
+ with open(default_path + "word_dict.pickle", "wb") as f:
+ pickle.dump(word_dict, f)
+
+ elif step == "valid":
+ with open(default_path + "word_dict.pickle", "rb") as f:
+ word_dict = pickle.load(f)
+
+ reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))
+
+ article_max_len = 50
+ summary_max_len = 15
+
+ return word_dict, reversed_dict, article_max_len, summary_max_len
+
+
+def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
+ if step == "train":
+ article_list = get_text_list(train_article_path, toy)
+ title_list = get_text_list(train_title_path, toy)
+ elif step == "valid":
+ article_list = get_text_list(valid_article_path, toy)
+ else:
+ raise NotImplementedError
+
+ x = [word_tokenize(d) for d in article_list]
+    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
+    x = [d[:article_max_len] for d in x]
+    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
+
+ if step == "valid":
+ return x
+ else:
+ y = [word_tokenize(d) for d in title_list]
+        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
+ y = [d[:(summary_max_len - 1)] for d in y]
+ return x, y
+
+
+def batch_iter(inputs, outputs, batch_size, num_epochs):
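+    # Yield (inputs, outputs) mini-batches for num_epochs passes over the data;
+    # the final batch of each epoch may be smaller than batch_size.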
+ inputs = np.array(inputs)
+ outputs = np.array(outputs)
+
+ num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
+ for epoch in range(num_epochs):
+ for batch_num in range(num_batches_per_epoch):
+ start_index = batch_num * batch_size
+ end_index = min((batch_num + 1) * batch_size, len(inputs))
+ yield inputs[start_index:end_index], outputs[start_index:end_index]
+
+
+def get_init_embedding(reversed_dict, embedding_size):
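+    # Build the initial embedding matrix from pre-trained GloVe vectors stored in a
+    # pickle; words without a pre-trained vector fall back to a zero vector.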
+ #glove_file = default_path + "glove/glove.6B.300d.txt"
+ #word2vec_file = get_tmpfile(default_path + "word2vec_format.vec")
+ #glove2word2vec(glove_file, word2vec_file)
+ print("Loading Glove vectors...")
+ #word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)
+
+ with open(default_path + "glove/model_glove_300.pkl", 'rb') as handle:
+ word_vectors = pickle.load(handle)
+
+ word_vec_list = list()
+ for _, word in sorted(reversed_dict.items()):
+ try:
+ word_vec = word_vectors.word_vec(word)
+ except KeyError:
+ word_vec = np.zeros([embedding_size], dtype=np.float32)
+
+ word_vec_list.append(word_vec)
+
+    # Assign random vectors to the <s> and </s> tokens
+ word_vec_list[2] = np.random.normal(0, 1, embedding_size)
+ word_vec_list[3] = np.random.normal(0, 1, embedding_size)
+
+ return np.array(word_vec_list)