Permalink
Browse files

major refactoring

  • Loading branch information...
RichStone committed Aug 15, 2018
1 parent c8da2b5 commit 9d8c3d5a6dc3a177d5a4c959a39e2a0ef3e399cd
Showing with 86 additions and 42 deletions.
  1. +47 −23 tests/test_url_parser.py
  2. +39 −19 url_downloader/url_argument_parser.py
@@ -7,42 +7,66 @@

class TestUrlParser(unittest.TestCase):
def setUp(self):
pass
self.parser = url_argument_parser.Parser()

def test_argument_received(self):
arg = 'http://example.com/+++1***2300+++'
self.parser = url_argument_parser.Parser(arg)
self.assertIsNotNone(self.parser.user_input, '')
user_input = ''
with self.assertRaises(ValueError):
self.parser.build_clean_url(user_input)

user_input = None
with self.assertRaises(ValueError):
self.parser.build_clean_url(user_input)

user_input = 13245
with self.assertRaises(ValueError):
self.parser.build_clean_url(user_input)

def test_extract_base_url(self):
arg = 'http://example.com/+++1***2300+++'
self.parser = url_argument_parser.Parser(arg)
self.assertEqual(self.parser.BASE_URL, 'http://example.com')
user_input = 'http://example.com/+++1***2300+++'
base_url = self.parser.extract_base_url(user_input)
self.assertEqual(base_url, 'http://example.com')

arg = 'https://www.example.com/+++1***2300+++'
self.parser = url_argument_parser.Parser(arg)
self.assertEqual(self.parser.BASE_URL, 'https://www.example.com')
user_input = 'https://www.example.com/+++1***2300+++'
base_url = self.parser.extract_base_url(user_input)
self.assertEqual(base_url, 'https://www.example.com')

def test_base_url_valid_connection(self):
arg = 'https://google.com'
self.parser = url_argument_parser.Parser(arg)
http_response_code = self.parser.validate_connection()
self.assertGreaterEqual(http_response_code, 200)
self.assertLessEqual(http_response_code, 299)
url = 'https://google.com'
http_response_code = self.parser.validate_base_url_connection(url)
self.assertEqual(http_response_code, 200)

def test_connection_exception_on_(self):
with self.assertRaises(URLError):
arg = 'https://non-existing-url-dsad.com/'
dl = url_argument_parser.Parser(arg)
url = 'https://non-existing-url-dsad.com/'
self.parser.validate_base_url_connection(url)

def test_extract_custom_part(self):
arg = 'http://example.com/+++1***2300+++'
self.parser = url_argument_parser.Parser(arg)
self.assertEqual('/+++1***2300+++', self.parser.extract_custom_part())
user_input = 'http://example.com/++1**2300++'
self.assertEqual('/++1**2300++', self.parser.extract_custom_url_part(user_input))

user_input = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed+++0001***0928+++.xml.gz'
self.assertEqual('/pubmed/baseline/pubmed+++0001***0928+++.xml.gz', self.parser.extract_custom_url_part(user_input))

def test_extract_ranges(self):
custom_url_part = '/pubmed++0001**0928++.xml.gz'
ranges = [{
'start_from': '0001',
'end_at': '0928'
}]
self.assertEqual(ranges, self.parser.extract_ranges(custom_url_part))

arg = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed+++0001***0928+++.xml.gz'
self.parser = url_argument_parser.Parser(arg)
self.assertEqual('/pubmed/baseline/pubmed+++0001***0928+++.xml.gz', self.parser.extract_custom_part())
custom_url_part = 'http://datagoodie.com/month/++1**12++/day/++1**30++'
ranges = [{
'start_from': '1',
'end_at': '12'
},
{
'start_from': '1',
'end_at': '30'
}
]
self.assertEqual(ranges, self.parser.extract_ranges(custom_url_part))

@unittest.skip('activate after finishing parser')
def test_except_on_invalid_concat_input(self):
@@ -2,6 +2,7 @@
from urllib.parse import urlsplit, urlunsplit
import urllib.request
from urllib.error import URLError
import re

import validators

@@ -11,39 +12,58 @@


class Parser:
def __init__(self, arg):
self.user_input = arg
self.CONCAT_DELIMITER = '##'
self.RANGE_DELIMITER = '**'
self.BASE_URL = self.extract_base_url()
def __init__(self):
self.concat_delimiter = '++'
self.range_delimiter = '**'
# regex needs delimiters to be escaped
self.concat_delimiter_escaped = '\+\+'
self.range_delimiter_escaped = '\*\*'

self.validate_connection()

def validate_user_input(self):
def validate_base_url():
if validators.url(self.BASE_URL) is validators.utils.ValidationFailure:
raise ValueError('Base URL is malformed, please keep to the following format: '
'"http://www.example.com/" ')
def build_clean_url(self, user_input):
    """Validate *user_input* and run it through the individual parse steps.

    The input must be a non-empty string, otherwise ValueError is raised.
    For valid input the base URL is extracted and probed for connectivity,
    then the path part and the ranges it contains are extracted.

    NOTE(review): the extracted pieces are not combined or returned yet,
    so the method currently returns None.
    """
    is_usable = isinstance(user_input, str) and user_input != ''
    if not is_usable:
        raise ValueError('Error: you have to pass a valid String to the parser.')

    root_url = self.extract_base_url(user_input)
    self.validate_base_url_connection(root_url)
    path_part = self.extract_custom_url_part(user_input)
    # Result is not used yet; the call is kept to preserve current behaviour.
    self.extract_ranges(path_part)

# Placeholder for validating the raw user input. As far as this view shows,
# it only declares the helper below and performs no checks itself.
def validate_user_input(self, user_input):
# No-op stub; presumably intended to validate the '++' concat sequences
# in the input -- TODO confirm once implemented.
def validate_concat_sequences():
pass

def extract_base_url(self):
split_url = urlsplit(self.user_input)
def extract_base_url(self, user_input):
    """Return the ``scheme://netloc`` part of *user_input*.

    Path, query and fragment are dropped, e.g.
    ``'http://example.com/++1**3++'`` becomes ``'http://example.com'``.

    Raises:
        ValueError: if the resulting base URL does not pass
            ``validators.url``.
    """
    parts = urlsplit(user_input)
    base_url = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
    # validators.url() returns True on success and a *falsy failure
    # instance* otherwise. The previous identity comparison against the
    # failure class itself could never match, so malformed URLs slipped
    # through; testing truthiness is the check the library documents.
    if not validators.url(base_url):
        raise ValueError('Base URL is malformed, please keep to the following format: '
                         '"http://www.example.com/" ')
    return base_url

def extract_custom_part(self):
split_url = urlsplit(self.user_input)
def extract_custom_url_part(self, user_input):
    """Return only the path component of *user_input*.

    Scheme, host, query string and fragment are discarded; the path is
    where the range placeholders are expected to live.
    """
    path = urlsplit(user_input).path
    return urlunsplit(('', '', path, '', ''))

def validate_connection(self):
def extract_ranges(self, custom_url_part):
ranges = []
extracted_ranges = re.findall(r'(?<=' + self.concat_delimiter_escaped + ').+?(?=' + self.concat_delimiter_escaped + ')',
custom_url_part)
if extracted_ranges is not []:
for extracted_range in extracted_ranges:
if self.range_delimiter in extracted_range:
range_obj = dict()
split_range = extracted_range.split(self.range_delimiter)
range_obj['start_from'] = split_range[0]
range_obj['end_at'] = split_range[1]
ranges.append(range_obj)
return ranges

def validate_base_url_connection(self, base_url):
    """Open *base_url* and return the status code of the response.

    Args:
        base_url: URL string to probe.

    Returns:
        Whatever ``getcode()`` reports for the opened response.

    Raises:
        URLError: if the URL cannot be opened. The message names the
            offending base URL and the original error is chained as the
            cause so the underlying reason is not lost.
    """
    try:
        # The context manager closes the response (and its socket) instead
        # of leaving it open until garbage collection.
        with urllib.request.urlopen(base_url) as response:
            return response.getcode()
    except URLError as err:
        raise URLError('Terminate program because connection could not be established with the given base URL '
                       + base_url) from err


if __name__ == 'main':

0 comments on commit 9d8c3d5

Please sign in to comment.