In [11]:
import pandas as pd
import re

hn = pd.read_csv("hacker_news.csv")
titles = hn['title']

In [12]:
'''no of sql with all cases
'''
sql_pattern=r"SQL"
sql_counts=titles.str.contains(sql_pattern,flags=re.I).sum()
sql_counts

108

In [13]:
'''extend this analysis by looking at titles that have letters
immediately before the "SQL," which is a convention often used to denote different variations or flavors of SQL:'''
# pattern = r"(\w+SQL)" #parenthesis to capture
# sql_flavors = titles.str.extract(pattern, flags=re.I)
# sql_flavors_freq = sql_flavors.value_counts()
# print(sql_flavors_freq)


hn_sql = hn[hn['title'].str.contains(r"\w+SQL", flags=re.I)].copy()
hn_sql['flavor']=hn_sql['title'].str.extract(r'(\w+SQL)', flags=re.I)
hn_sql['flavor']=hn_sql['flavor'].str.lower()
sql_pivot=hn_sql.pivot_table(index='flavor',values="num_comments",aggfunc='mean') #create a pivot table, sql_pivot.
sql_pivot.head()

Unnamed: 0_level_0,num_comments
flavor,Unnamed: 1_level_1
cloudsql,5.0
memsql,14.0
mysql,12.230769
nosql,14.529412
postgresql,25.962963


In [14]:
'''regular expression pattern which will match Python or python, followed by a space, followed by one or more
digit characters or periods.
The regular expression should contain a capture group for the digit and period characters (the Python versions)'''

# pattern = r"[Pp]ython ([\d\.]+)"

# py_versions = titles.str.extract(pattern)
# py_versions_freq = dict(py_versions.value_counts())
# py_versions


#first 10 'C' capture

def first_10_matches(pattern):
    """
    Return the first 10 story titles that match
    the provided regular expression
    """
    all_matches = titles[titles.str.contains(pattern)]
    first_10 = all_matches.head(10)
    return first_10

# pattern = r"\b[Cc]\b"
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern)
first_ten

365                      The new C standards are worth it
444           Moz raises $10m Series C from Foundry Group
521          Fuchsia: Micro kernel written in C by Google
1307            Show HN: Yupp, yet another C preprocessor
1326                     The C standard formalized in Coq
1365                          GNU C Library 2.23 released
1429    Cysignals: signal handling (SIGINT, SIGSEGV, )...
1620                        SDCC  Small Device C Compiler
1949    Rewriting a Ruby C Extension in Rust: How a Na...
2195    MyHTML  HTML Parser on Pure C with POSIX Threa...
Name: title, dtype: object

In [15]:
test_urls = pd.Series([
 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
 'http://www.interactivedynamicvideo.com/',
 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
 'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
 'HTTPS://github.com/keppel/pinn',
 'Http://phys.org/news/2015-09-scale-solar-youve.html',
 'https://iot.seeed.cc',
 'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
 'http://beta.crowdfireapp.com/?beta=agnipath',
 'https://www.valid.ly?param',
 'http://css-cursor.techstream.org'
])
pattern = r"https?://([\w\-\.]+)" 

test_urls_clean = test_urls.str.extract(pattern, flags=re.I)
domains = hn['url'].str.extract(pattern, flags=re.I)
top_domains = domains.head(5)
top_domains

Unnamed: 0,0
0,www.interactivedynamicvideo.com
1,www.thewire.com
2,www.amazon.com
3,www.nytimes.com
4,arstechnica.com


In [16]:
'''
extracting each of the three component parts of the URLs:

Protocol
Domain
Page path
'''
pattern = r"(https?)://([\w\.\-]+)/?(.*)" #

test_url_parts = test_urls.str.extract(pattern, flags=re.I)
url_parts = hn['url'].str.extract(pattern, flags=re.I)
url_parts


Unnamed: 0,0,1,2
0,http,www.interactivedynamicvideo.com,
1,http,www.thewire.com,entertainment/2013/04/florida-djs-april-fools-...
2,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp...
3,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
4,http,arstechnica.com,business/2015/10/comcast-and-other-isps-boost-...
...,...,...,...
20094,https,puri.sm,philosophy/how-purism-avoids-intels-active-man...
20095,https,medium.com,@zreitano/the-yc-application-broken-down-and-t...
20096,http,blog.darknedgy.net,technology/2016/01/01/0/
20097,https,medium.com,@benjiwheeler/how-product-hunt-really-works-d8...


In [17]:
#another methode for same

pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"
url_parts = hn['url'].str.extract(pattern, flags=re.I)
url_parts

Unnamed: 0,protocol,domain,path
0,http,www.interactivedynamicvideo.com,
1,http,www.thewire.com,entertainment/2013/04/florida-djs-april-fools-...
2,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp...
3,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
4,http,arstechnica.com,business/2015/10/comcast-and-other-isps-boost-...
...,...,...,...
20094,https,puri.sm,philosophy/how-purism-avoids-intels-active-man...
20095,https,medium.com,@zreitano/the-yc-application-broken-down-and-t...
20096,http,blog.darknedgy.net,technology/2016/01/01/0/
20097,https,medium.com,@benjiwheeler/how-product-hunt-really-works-d8...
