In [1]:
import requests
from collections import Counter
from itertools import chain

In [2]:
from boltons.urlutils import URL
from boltons.urlutils import find_all_links

In [3]:
from boltons.strutils import camel2under
from boltons.strutils import under2camel
from boltons.strutils import slugify
from boltons.strutils import split_punct_ws
from boltons.strutils import ordinalize
from boltons.strutils import cardinalize
from boltons.strutils import pluralize
from boltons.strutils import singularize
from boltons.strutils import is_uuid
from boltons.strutils import html2text
from boltons.strutils import bytes2human
from boltons.strutils import find_hashtags
from boltons.strutils import gzip_bytes
from boltons.strutils import gunzip_bytes

In [4]:
from boltons.statsutils import Stats
from boltons.statsutils import describe

## jsonutils: JSON

## urlutils: URL

In [5]:
# https://boltons.readthedocs.io/en/latest/urlutils.html

In [6]:
# Parse a search URL and show its pieces; `qp` gives access to the
# query-string parameters by name.
url = URL('https://www.amazon.in/s?k=smartphones&ref=nb_sb_noss_2')
print(url.host, url.qp['k'], url.scheme, url.path, sep='\n')

www.amazon.in
smartphones
https
/s


In [7]:
# A product URL without a query string; `path_parts` is the path split
# into its segments (the leading '' comes from the initial '/').
url = URL('https://www.croma.com/apple-iphone-11-pro-max-midnight-green-256-gb-4-gb-ram-/p/221136')
print(url.host, url.scheme, url.path, url.path_parts, sep='\n')

www.croma.com
https
/apple-iphone-11-pro-max-midnight-green-256-gb-4-gb-ram-/p/221136
('', 'apple-iphone-11-pro-max-midnight-green-256-gb-4-gb-ram-', 'p', '221136')


In [8]:
# A job-listing URL carrying several query parameters.
url = URL('https://www.naukri.com/job-listings-Big-Data-Engineer-Newstar-corporation-Bengaluru-4-to-7-years-280619500530?src=seo_srp&sid=15725119141420&xp=1&px=1')

# basic components
print(url.host, url.scheme, url.path, sep='\n')
# individual query parameters, looked up by name
print(*(url.qp[key] for key in ('sid', 'src', 'xp', 'px')), sep='\n')
# the URL rendered back to text, its path segments, and all query values
print(url.to_text(), url.path_parts, url.qp.values(), sep='\n')

www.naukri.com
https
/job-listings-Big-Data-Engineer-Newstar-corporation-Bengaluru-4-to-7-years-280619500530
15725119141420
seo_srp
1
1
https://www.naukri.com/job-listings-Big-Data-Engineer-Newstar-corporation-Bengaluru-4-to-7-years-280619500530?src=seo_srp&sid=15725119141420&xp=1&px=1
('', 'job-listings-Big-Data-Engineer-Newstar-corporation-Bengaluru-4-to-7-years-280619500530')
['seo_srp', '15725119141420', '1', '1']


In [9]:
# A three-entry excerpt of a sitemap, kept as a plain string so that
# find_all_links can scan it for URLs.
sitemap = '''<url>
<loc>
https://www.naukri.com/office-of-the-district-education-officer-muzaffarpur-jobs-careers-4196292
</loc>
<priority>0.9</priority>
<changefreq>weekly</changefreq>
<lastmod>2019-10-15T04:00:03+05:30</lastmod>
</url>
<url>
<loc>
https://www.naukri.com/ambedkar-university-delhi-jobs-careers-763676
</loc>
<priority>0.9</priority>
<changefreq>weekly</changefreq>
<lastmod>2019-10-15T04:00:03+05:30</lastmod>
</url>
<url>
<loc>
https://www.naukri.com/indian-institute-of-technology-hyderabad-jobs-careers-768880
</loc>
<priority>0.9</priority>
<changefreq>weekly</changefreq>
<lastmod>2019-10-15T04:00:03+05:30</lastmod>
</url>'''

# `links` holds the URL objects found in the text (printed in the next cell).
links = find_all_links(sitemap)
# NOTE(review): `links_with_text` is not used by any later cell visible here;
# presumably with_text=True interleaves the surrounding text with the URLs -
# confirm against the boltons docs.
links_with_text = find_all_links(sitemap, with_text=True)

# keep only the path component of every extracted link
urls_path = [link.path for link in links]

In [10]:
# show the URL objects that find_all_links extracted from the sitemap text
print('Links: \n\n', links)

Links: 

 [URL('https://www.naukri.com/office-of-the-district-education-officer-muzaffarpur-jobs-careers-4196292'), URL('https://www.naukri.com/ambedkar-university-delhi-jobs-careers-763676'), URL('https://www.naukri.com/indian-institute-of-technology-hyderabad-jobs-careers-768880')]


In [11]:
# only the path component of each extracted link
print(urls_path)

['/office-of-the-district-education-officer-muzaffarpur-jobs-careers-4196292', '/ambedkar-university-delhi-jobs-careers-763676', '/indian-institute-of-technology-hyderabad-jobs-careers-768880']


## strutils: Text

In [12]:
# CamelCase -> snake_case
camel_to_under = camel2under('CamelCase')
print(camel_to_under)

camel_case


In [13]:
# snake_case -> CamelCase
under_to_camel = under2camel('not_camel_case')
print(under_to_camel)

NotCamelCase


In [14]:
# with no `delim` argument the lower-cased words are joined with underscores
slug = slugify('This is a blog post')
print(slug)

this_is_a_blog_post


In [15]:
# `delim` chooses the separator placed between the words
slug = slugify("this is my first post", delim='-')
print(slug)

this-is-my-first-post


In [16]:
to_split = split_punct_ws(
    'job-listings-Big-Data-Engineer-Newstar-corporation-Bengaluru-4-to-7-years-280619500530'
)
print(to_split)

# split_punct_ws is handy in web scraping for pulling data straight out of a URL:
# sometimes the URL itself already carries the desired info, so the page never has to be fetched
# note: it splits on both punctuation and whitespace

['job', 'listings', 'Big', 'Data', 'Engineer', 'Newstar', 'corporation', 'Bengaluru', '4', 'to', '7', 'years', '280619500530']


In [17]:
# turn an integer into its ordinal string
ordinalize(1)

'1st'

In [18]:
# ordinalize every number of a list in one go
num_list = [1, 2, 22, 100, 3, 44, 72, 1000]
ordinalized = list(map(ordinalize, num_list))
print(ordinalized)

['1st', '2nd', '22nd', '100th', '3rd', '44th', '72nd', '1000th']


In [19]:
# a count of 5 yields the plural form of the noun
v = 'aeiou'
cardinalize('vowel', len(v))

'vowels'

In [20]:
# a count of exactly 1 keeps the noun singular
v = 'a'
cardinalize('vowel', len(v))

'vowel'

In [21]:
# Irregular plural: three fungi are listed, so the noun should be pluralised.
f = ('Amanita argentea', 'Calyptella campanula', 'Entoloma chalybaeum')
# cardinalize takes the noun and an integer count, so pass the size of `f`
# itself (not the leftover `v` from the previous cell, and not a Counter,
# which is a mapping rather than a number).
cardinalize('fungus', len(f))

'fungi'

In [22]:
# pluralize returns the plural of a word and preserves its case
print(*map(pluralize, ('mouse', 'student', 'Number')), sep='\n')

mice
students
Numbers


In [23]:
# singularize returns the singular of a word and preserves its case
print(*map(singularize, ('mice', 'fungi', 'Floors')), sep='\n')

mouse
fungus
Floor


In [24]:
to_check_1 = 'e682ccca-5a4c-4ef2-9711-73f9ad1e15ea'
to_check_2 = '0221f0d9-d4b9-11e5-a478-10ddb1c2feb9'

# The second id is rejected when no version is given but accepted with
# version=1, i.e. is_uuid validates against a specific UUID version.
print(
    is_uuid(to_check_1),
    is_uuid(to_check_2),
    is_uuid(to_check_2, version=1),
    sep='\n',
)

True
False
True


In [25]:
# `sitemap` is the XML snippet already defined in the urlutils section, so it
# is reused here instead of pasting the same 24-line literal a second time.
# html2text strips the tags and keeps only the text content; the input is XML
# rather than HTML, but it works on XML just as well.
print(html2text(sitemap))



https://www.naukri.com/office-of-the-district-education-officer-muzaffarpur-jobs-careers-4196292

0.9
weekly
2019-10-15T04:00:03+05:30



https://www.naukri.com/ambedkar-university-delhi-jobs-careers-763676

0.9
weekly
2019-10-15T04:00:03+05:30



https://www.naukri.com/indian-institute-of-technology-hyderabad-jobs-careers-768880

0.9
weekly
2019-10-15T04:00:03+05:30



In [26]:
bytes_1 = 10000001221
bytes_2 = 101221

# human-readable sizes; the second argument (2) shows up as two decimal places
print(*(bytes2human(size, 2) for size in (bytes_1, bytes_2)), sep='\n')

9.31G
98.85K


In [27]:
# find_hashtags returns the tags it finds, without the leading '#'
sample_text = 'A hashtag looks something like this:  #python or #github.'

print(find_hashtags(sample_text))

['python', 'github']


In [28]:
# Fetch a real page so there is a reasonably large payload to compress.
# `timeout` keeps the cell from hanging forever on a stalled connection, and
# `raise_for_status` stops us from silently compressing an HTTP error page.
page = requests.get(
    'https://en.wikipedia.org/wiki/The_Wreck_of_the_Titan:_Or,_Futility',
    timeout=10,
)
page.raise_for_status()
response = page.content  # already `bytes`, so no extra bytes(...) copy is needed

r_gzipped = gzip_bytes(response, level=9) # level 9 is highest, 1 is lowest

# sanity check: decompressing must give back exactly what was downloaded
assert gunzip_bytes(r_gzipped) == response

print(len(response))
print(len(r_gzipped))

75787
19275


## statsutils - Statistics Basics

In [29]:
# A small, heavily skewed sample (two large outliers) to summarise.
num_list = [22, 1, 983, 1, 0, 24, 6, 7, 9, 4, 7, 777]
stats = Stats(num_list)

# individual summary statistics, followed by the describe() summary dict
print(
    stats.mean,
    stats.min,
    stats.max,
    stats.variance,
    stats.median,
    stats.count,
    stats.std_dev,
    stats.describe(),
    sep='\n',
)

153.41666666666666
0
983
107405.90972222223
7.0
12
327.7284084760158
{'count': 12, 'mean': 153.41666666666666, 'std_dev': 327.7284084760158, 'mad': 6.0, 'min': 0, '0.25': 3.25, '0.5': 7.0, '0.75': 22.5, 'max': 983}


In [30]:
# text histogram: one line per bucket with its start value, count, and a bar
print(stats.format_histogram())

  0.0: 8 ######################################################################
 16.8: 2 ##################
 33.6: 0 |
 50.4: 0 |
 67.2: 0 |
 84.0: 0 |
100.8: 0 |
117.7: 0 |
134.5: 0 |
151.3: 0 |
168.1: 0 |
184.9: 0 |
201.7: 0 |
218.6: 0 |
235.4: 0 |
252.2: 0 |
269.0: 0 |
285.8: 0 |
302.6: 0 |
319.5: 0 |
336.3: 0 |
353.1: 0 |
369.9: 0 |
386.7: 0 |
403.5: 0 |
420.4: 0 |
437.2: 0 |
454.0: 0 |
470.8: 0 |
487.6: 0 |
504.4: 0 |
521.3: 0 |
538.1: 0 |
554.9: 0 |
571.7: 0 |
588.5: 0 |
605.3: 0 |
622.2: 0 |
639.0: 0 |
655.8: 0 |
672.6: 0 |
689.4: 0 |
706.2: 0 |
723.1: 0 |
739.9: 0 |
756.7: 0 |
773.5: 1 #########
790.3: 0 |
807.1: 0 |
824.0: 0 |
840.8: 0 |
857.6: 0 |
874.4: 0 |
891.2: 0 |
908.0: 0 |
924.9: 0 |
941.7: 0 |
958.5: 0 |
975.3: 1 #########


In [31]:
# median_abs_dev is the median absolute deviation; `mad` (printed last)
# gives the same value in this output.
print(
    stats.median_abs_dev,
    stats.trimean,
    stats.kurtosis,
    stats.skewness,
    stats.mad,
    sep='\n',
)

6.0
9.9375
4.95977332986754
2.0207968699585535
6.0


In [32]:
print(
    stats.iqr,                # interquartile range
    stats.get_quantile(0.8),
    stats.get_zscore(55),
    stats.rel_std_dev,        # relative standard deviation
    stats.pearson_type,
    sep='\n',
)

19.25
23.6
-0.30029946785607725
2.1361982084259585
1


In [33]:
# the same buckets as the text histogram, as (bucket start, count) pairs
stats.get_histogram_counts()

[(0.0, 8),
 (16.8, 2),
 (33.6, 0),
 (50.4, 0),
 (67.2, 0),
 (84.0, 0),
 (100.8, 0),
 (117.7, 0),
 (134.5, 0),
 (151.3, 0),
 (168.1, 0),
 (184.9, 0),
 (201.7, 0),
 (218.6, 0),
 (235.4, 0),
 (252.2, 0),
 (269.0, 0),
 (285.8, 0),
 (302.6, 0),
 (319.5, 0),
 (336.3, 0),
 (353.1, 0),
 (369.9, 0),
 (386.7, 0),
 (403.5, 0),
 (420.4, 0),
 (437.2, 0),
 (454.0, 0),
 (470.8, 0),
 (487.6, 0),
 (504.4, 0),
 (521.3, 0),
 (538.1, 0),
 (554.9, 0),
 (571.7, 0),
 (588.5, 0),
 (605.3, 0),
 (622.2, 0),
 (639.0, 0),
 (655.8, 0),
 (672.6, 0),
 (689.4, 0),
 (706.2, 0),
 (723.1, 0),
 (739.9, 0),
 (756.7, 0),
 (773.5, 1),
 (790.3, 0),
 (807.1, 0),
 (824.0, 0),
 (840.8, 0),
 (857.6, 0),
 (874.4, 0),
 (891.2, 0),
 (908.0, 0),
 (924.9, 0),
 (941.7, 0),
 (958.5, 0),
 (975.3, 1)]

In [34]:
# module-level shortcut: gives the same summary dict as stats.describe()
describe(num_list)

{'count': 12,
 'mean': 153.41666666666666,
 'std_dev': 327.7284084760158,
 'mad': 6.0,
 'min': 0,
 '0.25': 3.25,
 '0.5': 7.0,
 '0.75': 22.5,
 'max': 983}

## tableutils

In [35]:
# https://boltons.readthedocs.io/en/latest/tableutils.html

## fileutils

In [36]:
# https://boltons.readthedocs.io/en/latest/fileutils.html

## ecoutils

In [37]:
# https://boltons.readthedocs.io/en/latest/ecoutils.html

## ioutils

In [38]:
# https://boltons.readthedocs.io/en/latest/_modules/boltons/ioutils.html#MultiFileReader

## setutils

In [39]:
# https://boltons.readthedocs.io/en/latest/setutils.html

## iterutils

In [40]:
# https://boltons.readthedocs.io/en/latest/iterutils.html