Skip to content
This repository was archived by the owner on May 25, 2022. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions projects/web page summation/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
venv/
env.bak/
venv.bak/
env/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

__pycache__
.vscode
settings.json

# Dependency directories
node_modules/
jspm_packages/

# Optional npm cache directory
.npm
.DS_Store
.DS_Store
datasets
datasets/
new_datasets/
node_modules
yarn.lock
app
__pycache__/
dist
build
mlclassification-darwin-x64
release-builds
Classifi
app
dist
build
Summarize.spec
__pycache__
applog.log
csv/
beneficiary.csv
.DS_Store
applog.log
84 changes: 84 additions & 0 deletions projects/web page summation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Website Summarization API

This project builds a machine learning model for summarizing a web page from its URL.

## Getting Started

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.


### Prerequisites

Python distribution

```
Anaconda
```

### Installing

Install Anaconda python distribution on your system

Create a virtual environment called app.

```
python -m venv app
```

Activate the virtual environment

```
LINUX/Mac: source app/bin/activate

Windows: app\Scripts\activate
```

Upgrade to the latest pip

```
pip install --upgrade pip
```

Install dependencies using requirements file

```
pip install -r requirements.txt
```
**Note: Your virtual environment must always be activated before running any command**

## Deployment

Start the app (make sure to enter the URL of a valid, existing website)


Example of valid commands

```
python app.py simple --url https://facebook.com --sentence 1 --language english
python app.py simple --url https://facebook.com
python app.py simple --url https://korapay.com
python app.py bulk --path ./csv/valid_websites.csv
```


### APIs

These are the command options in full:

```
A command line utility for website Summarization.
-----------------------------------------------
These are common commands for this app.

positional arguments:
  action               This has to be 'simple' or 'bulk'

optional arguments:
  -h, --help           show this help message and exit
  --url URL            url of the website to be summarised
  --sentence SENTENCE  number of sentences in the summary
  --language LANGUAGE  language of the summary
  --path PATH          path to a csv file of websites (bulk mode)
```

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE.md) file for details

146 changes: 146 additions & 0 deletions projects/web page summation/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#!/usr/bin/python
from tempfile import NamedTemporaryFile
from utils.summarize import summarize
import csv
import json
import shutil
import os
import textwrap
import logging
import signal
import argparse
import sys
import getopt


def parse_args(argv):
    """Parse the command line arguments.

    Args:
        argv: full argument vector; argv[0] (the program name) is skipped.

    Returns:
        argparse.Namespace with attributes: action, url, sentence,
        language and path.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''\
            A command line utility for website summarization.
            -----------------------------------------------
            These are common commands for this app.'''))
    parser.add_argument(
        'action',
        # main() only dispatches on 'simple' and 'bulk'; the old help text
        # ("summarize") described an action that does not exist.
        help="This action should be 'simple' or 'bulk'")
    parser.add_argument(
        '--url',
        help='A link to the website url'
    )
    parser.add_argument(
        '--sentence',
        help='Argument to define number of sentence for the summary',
        type=int,
        default=2)
    parser.add_argument(
        '--language',
        help='Argument to define language of the summary',
        default='English')
    parser.add_argument(
        '--path',
        help='path to csv file')

    return parser.parse_args(argv[1:])


def readCsv(path):
    """Read a CSV file into a list of rows (each row a list of strings).

    Tries the platform default encoding first and falls back to the
    Windows ANSI code page ('mbcs') on a decode failure.

    Args:
        path: filesystem path of the csv file to read.

    Returns:
        list of rows; empty list for an empty file.

    Raises:
        OSError: if the file cannot be opened at all.
    """
    print('\n\n Processing Csv file \n\n')
    sys.stdout.flush()
    try:
        with open(path, 'r') as userFile:
            # Materialize inside the 'with' so the file is closed promptly.
            data = list(csv.reader(userFile))
    except UnicodeDecodeError:
        # Retry with the Windows ANSI code page. Building a fresh list here
        # (instead of appending to a shared one) avoids duplicating rows that
        # were already read before the first attempt failed mid-file.
        # NOTE(review): 'mbcs' only exists on Windows — the fallback will
        # raise LookupError elsewhere; confirm the intended platforms.
        with open(path, 'r', encoding="mbcs") as userFile:
            data = list(csv.reader(userFile))
    return data


def writeCsv(data, LANGUAGE, SENTENCES_COUNT):
    """Summarize the 'website' column of each data row and write the result
    to 'beneficiary.csv' with an extra 'summary' column appended.

    Args:
        data: list of csv rows; data[0] is the header and must contain
            a 'website' column.
        LANGUAGE: language passed through to summarize().
        SENTENCES_COUNT: sentence count passed through to summarize().
    """
    print('\n\n Updating Csv file \n\n')
    sys.stdout.flush()
    # newline='' is required by the csv module to avoid blank rows on Windows;
    # 'with' guarantees the file is flushed and closed even on error.
    with open('beneficiary.csv', 'w', newline='') as newFile:
        newFileWriter = csv.writer(newFile)
        position = data[0].index('website')
        # Write the header once, unconditionally (the old code compared the
        # loop index with 'is 1' — an identity check on an int — and never
        # wrote the header when the file had no data rows).
        header = data[0]
        header.append("summary")
        newFileWriter.writerow(header)
        for row in data[1:]:
            try:
                summary = summarize(row[position], LANGUAGE, SENTENCES_COUNT)
                row.append(summary)
                newFileWriter.writerow(row)
            except Exception:
                # Best-effort: skip rows whose website cannot be summarized
                # instead of aborting the whole batch.
                print('\n\n Error Skipping line \n\n')
                sys.stdout.flush()


def processCsv(path, LANGUAGE, SENTENCES_COUNT):
    """Read the csv at *path* and write summaries for every row.

    Delegates to readCsv() / writeCsv(); any failure is reported to the
    user rather than propagated.

    Args:
        path: filesystem path of the input csv file.
        LANGUAGE: language passed through to summarize().
        SENTENCES_COUNT: sentence count passed through to summarize().
    """
    try:
        print('\n\n Processing Started \n\n')
        sys.stdout.flush()
        data = readCsv(path)
        writeCsv(data, LANGUAGE, SENTENCES_COUNT)
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still propagate.
        print('\n\n Invalid file in file path \n\n')
        sys.stdout.flush()


def main(argv=sys.argv):
    """CLI entry point: dispatch to bulk (csv) or simple (single url) mode.

    Args:
        argv: argument vector, defaulting to sys.argv.
    """
    # Log to a file so the console stays free for user-facing messages.
    logging.basicConfig(filename='applog.log',
                        filemode='w',
                        level=logging.INFO,
                        format='%(levelname)s:%(message)s')
    args = parse_args(argv)
    action = args.action
    url = args.url
    path = args.path
    # Defensive fallbacks only — argparse already supplies defaults for both.
    LANGUAGE = "english" if args.language is None else args.language
    SENTENCES_COUNT = 2 if args.sentence is None else args.sentence
    if action == 'bulk':
        if path is None:
            print(
                '\n\n Invalid Entry!, please Ensure you enter a valid file path \n\n')
            sys.stdout.flush()
            return
        # guide against errors
        try:
            processCsv(path, LANGUAGE, SENTENCES_COUNT)
        except Exception:
            print(
                '\n\n Invalid Entry!, please Ensure you enter a valid file path \n\n')
            sys.stdout.flush()
            # Bug fix: previously fell through, printed 'Completed' and
            # moved the (possibly partial) output over the input file.
            return
        print('Completed')
        sys.stdout.flush()
        # processCsv writes to 'beneficiary.csv'; replace the input with it.
        if os.path.isfile('beneficiary.csv'):
            return shutil.move('beneficiary.csv', path)
        return
    elif action == 'simple':
        # guide against errors
        try:
            summarize(url, LANGUAGE, SENTENCES_COUNT)
        except Exception:
            print(
                '\n\n Invalid Entry!, please Ensure you enter a valid web link \n\n')
            sys.stdout.flush()
            # Bug fix: previously still printed 'Completed' after a failure.
            return
        print('Completed')
        sys.stdout.flush()
    else:
        print(
            '\nAction command is not supported\n for help: run python3 app.py -h'
        )
        sys.stdout.flush()
        return


# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
5 changes: 5 additions & 0 deletions projects/web page summation/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sumy

nltk
numpy
argparse
Empty file.
Loading