# URL normalization (references)

In this Notebook, the normalization of external URLs included in references is carried out.

In [1]:
import re
import urllib
from url_normalize import url_normalize
from scrapy.utils.url import canonicalize_url
import pandas as pd

# 0. Import the dataset

Before applying the whole transformation process, a small test of the packages is carried out with the Wikipedia data. The dataset is composed by 29,276,667 rows.

In [2]:
df = pd.read_csv('references/ref_data.csv')
df

Unnamed: 0,id,type_of_citation,TitleType,Title,ID_list,URL,updated_identifier
0,39495284,citation,,Manhattan in 1628 as Described in the Recently...,,http://www.columbia.edu/cu/lweb/digital/collec...,
1,3528721,citation,,Kikwete Wins Election By the Biggest Margin Ever,,http://allafrica.com/stories/200512190886.html,
2,17005434,citation,,The Romance of Arthur,,,
3,8552398,citation,,The Thin Red Line,,https://www.nytimes.com/1997/07/27/magazine/th...,
4,23782313,citation,,Skin Deep but Filled With Significance,,https://www.nytimes.com/2005/07/19/arts/televi...,
...,...,...,...,...,...,...,...
29276662,54814519,soccerbase season,,,,,
29276663,59566627,soccerbase season,,,,,
29276664,55285198,soccerbase season,,,,,
29276665,55285198,soccerbase season,,,,,


These are the external links included in the Wikipedia page of The Strokes.

In [3]:
df[df['id'] == 148546]

Unnamed: 0,id,type_of_citation,TitleType,Title,ID_list,URL,updated_identifier
268813,148546,cite web,,"The Strokes\' ""Disagreement"" Over New Album \u...",,http://www.clashmusic.com/news/the-strokes-dis...,
289085,148546,cite web,,The Strokes Go New Wave on Latest Single \u201...,,https://www.rollingstone.com/music/music-news/...,
353357,148546,cite news,,The Strokes join Blur and Taylor Swift to head...,,https://www.independent.co.uk/arts-entertainme...,
423840,148546,cite web,,The Strokes Return / Music / ShortList Magazine,,http://www.shortlist.com/entertainment/music/a...,
1019943,148546,cite web,,The Strokes hit the studio to work on 'new ideas',,https://www.nme.com/news/the-strokes/56306,
...,...,...,...,...,...,...,...
28927798,148546,cite web,,Interview with Joe Chiccarelli,,http://www.hitquarters.com/index.php3?page=int...,
28970881,148546,cite web,,Archived copy,,http://www.thestrokes.com/us/event/2015/10/02/...,
29089418,148546,cite web,,Details: New Strokes Album On The Way,,http://www.1077theend.com/Details--New-Strokes...,
29139876,148546,cite news,,"Watch The Strokes play the ""biggest show"" of t...",,https://www.nme.com/news/music/the-strokes-big...,


In [3]:
df[df['id'] == 1911188]

Unnamed: 0,id,type_of_citation,TitleType,Title,ID_list,URL,updated_identifier
9755985,1911188,cite web,,"May 19, 2017 letter to Appalachian School of L...",,https://www.americanbar.org/content/dam/aba/ad...,
10035117,1911188,cite web,,Appalachian School of Law,,https://www.usnews.com/best-graduate-schools/t...,
13518084,1911188,cite news,,2 Florida Law Schools Among 10 Sanctioned by A...,,https://www.law.com/dailybusinessreview/sites/...,
23429079,1911188,cite web,,McClanahan to serve as new law school dean,,https://www.swvatoday.com/article_99cfe646-669...,
27244267,1911188,cite web,,The 2017 Bottom 10 Law Schools in the Country,,http://www.thefacultylounge.org/2017/12/the-20...,


In [6]:
df_freq = df.groupby('id').size().reset_index(name='references')
df_freq.rename(columns={'id':'page_id'}, inplace=True)
df_freq

Unnamed: 0,page_id,references
0,12,92
1,25,226
2,39,37
3,290,30
4,303,207
...,...,...
3775226,63833859,4
3775227,63833892,11
3775228,63833916,3
3775229,63833956,1


In [10]:
df_freq.to_csv('references/references_freq_total.tsv', sep='\t', index=False)

There are some missing URLs. In some cases a Wikipedia page does not include the URL tag correctly and this URL is not captured/parsed.

In [4]:
df[df['id'] == 17005434]

Unnamed: 0,id,type_of_citation,TitleType,Title,ID_list,URL,updated_identifier
2,17005434,citation,,The Romance of Arthur,,,
1169979,17005434,citation,,The New Arthurian Encyclopedia,,,
10246950,17005434,citation,,The Romance of Arthur,,,
15072275,17005434,citation,,The Romance of Arthur,,,
25027126,17005434,citation,,The Romance of Arthur,,,
28690763,17005434,citation,,The Romance of Arthur,,,


In [8]:
df[df['id'] == 39]

Unnamed: 0,id,type_of_citation,TitleType,Title,ID_list,URL,updated_identifier
1067681,39,cite journal,,The effect of spectral albedo on amorphous sil...,"{BIBCODE=2013SoEn...91..233A, DOI=10.1016/j.so...",https://www.academia.edu/3081684,
1883374,39,cite web,,Baffled Scientists Say Less Sunlight Reaching ...,,http://www.livescience.com/environment/060124_...,
2019988,39,cite web,,Albedo \u2013 from Eric Weisstein's World of P...,,http://scienceworld.wolfram.com/physics/Albedo...,
3116732,39,cite journal,,"Boreal forests, aerosols and the impacts on cl...","{PMID=18826917, BIBCODE=2008RSPTA.366.4613S, D...",http://homepages.see.leeds.ac.uk/~eardvs/paper...,
3516002,39,cite journal,,Earthshine Observations of the Earth's Reflect...,"{BIBCODE=2001GeoRL..28.1671G, DOI=10.1029/2000...",http://www.agu.org/journals/ABS/2001/2000GL012...,
9122272,39,cite journal,,The Study of the Ocean and the Land Surface fr...,"{BIBCODE=1983RSPTA.309..285H, JSTOR=37357, DOI...",,
9411229,39,cite journal,,"Forests and Climate Change: Forcings, Feedback...","{PMID=18556546, BIBCODE=2008Sci...320.1444B, D...",https://zenodo.org/record/1230896,
10961684,39,cite web,,Health and Safety: Be Cool! (August 1997),,http://www.ranknfile-ue.org/h&s0897.html,
11243596,39,cite web,,Spectral Approach To Calculate Specular reflec...,,http://vih.freeshell.org/pp/01-ONW-St.Petersbu...,
11318505,39,cite journal,,The spherical bolometric albedo for planet Mer...,{ARXIV=1703.02670},,


# 1. Preprocessing

The dataset it is filtered to only URL mentions. Also this URLs are reviewed to remove erroneous strings assigned as URL.

In [5]:
df = df[df[['id', 'URL']].notnull().all(axis=1)]
df = df[['id', 'URL']]
df = df[df.URL.str.contains('^http|^www[0-9]{0,2}')]
df.head()

Unnamed: 0,id,URL
0,39495284,http://www.columbia.edu/cu/lweb/digital/collec...
1,3528721,http://allafrica.com/stories/200512190886.html
3,8552398,https://www.nytimes.com/1997/07/27/magazine/th...
4,23782313,https://www.nytimes.com/2005/07/19/arts/televi...
6,17288329,http://www.ancientlibrary.com/smith-bio/0075.html


Before obtaining the domains or modify the URLs the top [web archives](https://en.wikipedia.org/wiki/Wikipedia:List_of_web_archives_on_Wikipedia) URLs are transformed in order to take into account the real URLs. Then, all the URLs are filtered to remove erroenous URLs.

## 1.1 Web archive

In [6]:
df['URL'] = [re.sub('http[s]{0,1}://(web\.archive|waybackmachine)\.org/.*http', 'http', x) for x in df['URL']]
df['URL'] = [re.sub('http[s]{0,1}://(web\.archive|waybackmachine)\.org/.*www', 'www', x) for x in df['URL']]
df['URL'] = [re.sub('http[s]{0,1}://(web\.archive|waybackmachine)\.org/web/([0-9a-z]*|[0-9a-z]*\*|\*)/|http[s]{0,1}://(web\.archive|waybackmachine)\.org/\*/', '', x) for x in df['URL']]

## 1.2 archive.today

It should be noticed that not all URLs are encoded in the same way. Whereas some URLs include the archived URL others do not. There are also multiple domains.

In [7]:
df['URL'] = [re.sub('http[s]{0,1}://archive\.(today|is|fo|li|vn|md)/.*http', 'http', x) for x in df['URL']]
df['URL'] = [re.sub('http[s]{0,1}://archive\.(today|is|fo|li|vn|md)/.*www', 'www', x) for x in df['URL']]

## 1.3 Webcitation

In this case, most of them does not include the archived URL.

In [8]:
df['URL'] = [re.sub('http[s]{0,1}://((www\.)?)webcitation\.org/.*http', 'http', x) for x in df['URL']]
df['URL'] = [re.sub('http[s]{0,1}://((www\.)?)webcitation\.org/.*www', 'www', x) for x in df['URL']]

Finally, they are cleaned to get only URLs strings.

In [9]:
df = df[df.URL.str.contains('^http|^www[0-9]{0,2}')]

Before the preprocessing, there are 19,073,929 unique URLs.

In [10]:
len(df.groupby('URL').count().index)

19073929

A new column is created to normalize the URLs.

In [11]:
df['URL_n'] = [re.sub('^http://www[0-9]{0,2}\\.|^http://|^https://www[0-9]{0,2}\\.|^https://|^//www[0-9]{0,2}\\.', '', x) for x in df['URL']]
df['URL_n'] = [re.sub('^http[s]{0,1}%3A%2F%2F((www\.)?)', '', x) for x in df['URL']]
#df.loc[:,'URL_n']  = df.loc[:,'URL_n'].str.lower()
df.head()

Unnamed: 0,id,URL,URL_n
0,39495284,http://www.columbia.edu/cu/lweb/digital/collec...,http://www.columbia.edu/cu/lweb/digital/collec...
1,3528721,http://allafrica.com/stories/200512190886.html,http://allafrica.com/stories/200512190886.html
3,8552398,https://www.nytimes.com/1997/07/27/magazine/th...,https://www.nytimes.com/1997/07/27/magazine/th...
4,23782313,https://www.nytimes.com/2005/07/19/arts/televi...,https://www.nytimes.com/2005/07/19/arts/televi...
6,17288329,http://www.ancientlibrary.com/smith-bio/0075.html,http://www.ancientlibrary.com/smith-bio/0075.html


There are some characters that usually appear at the end of the URL, some of them escape characters, so they are removed.

In [12]:
#df['URL_n'] = [re.sub('#.*$', '', x) for x in df['URL_n']]
#df['URL_n'] = [re.sub('#ixzz.*$', '', x) for x in df['URL_n']]
df['URL_n'] = [re.sub('((\\\\[ntr])+)$|\\\\+$', '', x) for x in df['URL_n']]
df['URL_n'] = [re.sub('/+$', '', x) for x in df['URL_n']]
#df['URL_n'] = [re.sub('\\.[s]{0,1}(htm|html|xml)$|\\.[s]{0,1}(htm|html|xml)$|\\.[s]{0,1}(htm|html|xml)/+$|\\.[s]{0,1}(htm|html|xml)/+$', '', x) for x in df['URL_n']]
df.head()

Unnamed: 0,id,URL,URL_n
0,39495284,http://www.columbia.edu/cu/lweb/digital/collec...,http://www.columbia.edu/cu/lweb/digital/collec...
1,3528721,http://allafrica.com/stories/200512190886.html,http://allafrica.com/stories/200512190886.html
3,8552398,https://www.nytimes.com/1997/07/27/magazine/th...,https://www.nytimes.com/1997/07/27/magazine/th...
4,23782313,https://www.nytimes.com/2005/07/19/arts/televi...,https://www.nytimes.com/2005/07/19/arts/televi...
6,17288329,http://www.ancientlibrary.com/smith-bio/0075.html,http://www.ancientlibrary.com/smith-bio/0075.html


Some URLs appear encoded whereas others do not, so all are decoded.

In [13]:
df['URL_n'] = [urllib.parse.unquote(x) for x in df['URL_n']]

Then all URLs are normalized using the url_normalize Python package. If an error is returned no transformation is carried out. However, there are a few cases, as it is showed (127). Sometimes these erroneous URLs work but their structure is a bit strange, for this reason they are not removed.

In [14]:
urls = df['URL_n'].tolist()
err_urls = []

for x in range(len(urls)):
    try:
        urls[x] = url_normalize(urls[x])
    except:
        err_urls.append(urls[x])

print(len(err_urls))

127


Then, the normalized URLs are included in the data.frame.

In [15]:
df['URL_n'] = urls

After the normalization, the http is added, so it is removed again.

In [16]:
df['URL_n'] = [re.sub('^http://www[0-9]{0,2}\\.|^http://|^https://www[0-9]{0,2}\\.|^https://|^//www[0-9]{0,2}\\.', '', x) for x in df['URL_n']]

The problem with escape characters is solved.

In [17]:
df[df['id'] == 37852327]

Unnamed: 0,id,URL,URL_n
1769,37852327,http://www.oxforddnb.com/view/article/52801\n,oxforddnb.com/view/article/52801
6588248,37852327,http://www.oxforddnb.com/view/article/16103\n,oxforddnb.com/view/article/16103
20929578,37852327,http://www.oxforddnb.com/view/article/24964\n,oxforddnb.com/view/article/24964
21222439,37852327,http://www.oxforddnb.com/view/article/50229\n,oxforddnb.com/view/article/50229


## 2. Domains

Domain are obtained and the entire data.frame is filtered by normalized URL. Erroneous domains are removed.

In [18]:
df['domain'] = [re.sub('/.*', '', x) for x in df['URL_n']]
df = df[df['domain'] != '']
df = df[df['domain'].str.contains('.', regex=False)]
df.head()

Unnamed: 0,id,URL,URL_n,domain
0,39495284,http://www.columbia.edu/cu/lweb/digital/collec...,columbia.edu/cu/lweb/digital/collections/cul/t...,columbia.edu
1,3528721,http://allafrica.com/stories/200512190886.html,allafrica.com/stories/200512190886.html,allafrica.com
3,8552398,https://www.nytimes.com/1997/07/27/magazine/th...,nytimes.com/1997/07/27/magazine/the-thin-red-l...,nytimes.com
4,23782313,https://www.nytimes.com/2005/07/19/arts/televi...,nytimes.com/2005/07/19/arts/television/19heff....,nytimes.com
6,17288329,http://www.ancientlibrary.com/smith-bio/0075.html,ancientlibrary.com/smith-bio/0075.html,ancientlibrary.com


Network of Wikipedia pages that reference domains.

In [None]:
df_dom = df[['id', 'domain']]
df_dom = df_dom.drop_duplicates()
#df_dom.to_csv('references/domains.csv', index=False)

In [None]:
#sorted(set(df_dom['domain']), key=len)

In [None]:
df_dom

## 3. Top domains modifications

Taking into account the most mentioned domain, the following specific modifications have been carried out. Some of then do not require further modifications.

In [19]:
domains_freq = pd.value_counts(df.domain).to_frame().reset_index()
domains_freq[0:50]

Unnamed: 0,index,domain
0,books.google.com,923122
1,archive.org,423102
2,nytimes.com,286826
3,bbc.co.uk,223213
4,theguardian.com,192147
5,census.gov,183571
6,news.bbc.co.uk,166409
7,billboard.com,166383
8,youtube.com,150286
9,news.google.com,127003


The top 50 most mentioned domains represent the 20% of mentions.

In [20]:
100*sum(pd.value_counts(df.domain).to_frame().reset_index()['domain'][0:50])/sum(pd.value_counts(df.domain).to_frame().reset_index()['domain'])

20.0160866128398

There are a total of 1,590,478 different domains.

In [21]:
len(pd.value_counts(df.domain).to_frame().reset_index())

1590478

## Temporal files

In [None]:
df.to_csv('references/ref_url.csv', index=False)

In [None]:
df = pd.read_csv('references/ref_url.csv')

In [None]:
df

### Methodological notes

This is not an exhaustive process. Not all URLs are examined to detec all failures and normalize in a exhaustive way. In fact an alogithm was considered to clean all URLs, but i tas not possible. For instance, some website have change their URL structure over time (e.g. YouTube), others are case sensitive, some use parameters that in some cases are key (e.g. chart\?f in billboard), some websites have been removed or are region-restricted (e.g. sports-reference.com)...

When an URL include a parameter that points to a subsection inside the page, for instance a subheading or an album song, it is removed.

Clustering could be dangerous: worldcat.org/search?fq=yr%3A1906&q=butmi worldcat.org/search?fq=yr%3A1907&q=butmi

**Attention**: More URLs are examinated than showed. The are only showed as examples.

![title](URLs.png)

### 1. books.google.com

These URLs require a lot of transformations to be reduced.

In [22]:
list(df.loc[df['domain'] == 'books.google.com', 'URL_n'])[0:5]

['books.google.com/books?id=DlZAAAAAIAAJ&pg=PA1#v%3Donepage%26q%26f%3Dfalse',
 'books.google.com/?id=5367AAAAIAAJ',
 'books.google.com/books?id=-CojUQpSS6wC',
 'books.google.com/books?id=99IDuIGxckcC',
 'books.google.com/books?id=2KSbTcdYv9QC&pg=PA1426']

In [23]:
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('\?.*&id=', '?id=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('(\?id=.*?&)(.*)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('\?.*&q=', '?q=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('ngrams/graph\?.*&content=', 'ngrams/graph?content=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('&.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('/books.*\?id=', '/?id=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('/books.*\?vid=', '/?vid=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('/books.*\?q=', '/?q=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]
df.loc[df['domain'] == 'books.google.com', 'URL_n'] = [re.sub('#(v|search).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.com', 'URL_n']]

In [24]:
list(df[df['domain'] == 'books.google.com']['URL_n'])[0:5]

['books.google.com/?id=DlZAAAAAIAAJ',
 'books.google.com/?id=5367AAAAIAAJ',
 'books.google.com/?id=-CojUQpSS6wC',
 'books.google.com/?id=99IDuIGxckcC',
 'books.google.com/?id=2KSbTcdYv9QC']

### 2. archive.org

Some tags are removed.

In [25]:
list(df.loc[df['domain'] == 'archive.org', 'URL_n'])[0:5]

['archive.org/details/recordsofgeneral00gene/page/n7',
 'archive.org/details/eastasiacultural00ebre_0',
 'archive.org/details/womenpoliticsinc0000maci/page/242',
 'archive.org/details/beyondselfintere0000unse/page/133',
 'archive.org/details/stayingonalone00alic']

In [26]:
df.loc[df['domain'] == 'archive.org', 'URL_n'] = [re.sub('/page/.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'archive.org', 'URL_n']]
df.loc[df['domain'] == 'archive.org', 'URL_n'] = [re.sub('#(page|mode|start).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'archive.org', 'URL_n']]

In [27]:
list(df.loc[df['domain'] == 'archive.org', 'URL_n'])[0:5]

['archive.org/details/recordsofgeneral00gene',
 'archive.org/details/eastasiacultural00ebre_0',
 'archive.org/details/womenpoliticsinc0000maci',
 'archive.org/details/beyondselfintere0000unse',
 'archive.org/details/stayingonalone00alic']

### 3. nytimes.com

Removed after the .html.

In [28]:
list(df.loc[df['domain'] == 'nytimes.com', 'URL_n'])[0:5]

['nytimes.com/1997/07/27/magazine/the-thin-red-line.html?pagewanted=2',
 'nytimes.com/2005/07/19/arts/television/19heff.html',
 'nytimes.com/2009/09/10/science/10fish.html',
 'nytimes.com/1986/06/18/us/the-supreme-court-man-in-the-news-judge-with-tenacity-and-charm-antonin-scalia.html',
 'nytimes.com/2009/11/13/us/13sftattoo.html?pagewanted=all']

In [29]:
df.loc[df['domain'] == 'nytimes.com', 'URL_n'] = [re.sub('\.html\?.*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'nytimes.com', 'URL_n']]
df.loc[df['domain'] == 'nytimes.com', 'URL_n'] = [re.sub('\.html#.*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'nytimes.com', 'URL_n']]

In [30]:
list(df.loc[df['domain'] == 'nytimes.com', 'URL_n'])[0:5]

['nytimes.com/1997/07/27/magazine/the-thin-red-line.html',
 'nytimes.com/2005/07/19/arts/television/19heff.html',
 'nytimes.com/2009/09/10/science/10fish.html',
 'nytimes.com/1986/06/18/us/the-supreme-court-man-in-the-news-judge-with-tenacity-and-charm-antonin-scalia.html',
 'nytimes.com/2009/11/13/us/13sftattoo.html']

### 4. bbc.co.uk

Some old URLs can include parameters, but they cannot be removed.

In [31]:
list(df.loc[df['domain'] == 'bbc.co.uk', 'URL_n'])[0:5]

['bbc.co.uk/bbc7/comedy/progpages/pennydreadfuls.shtml',
 'bbc.co.uk/programmes/p0093q14',
 'bbc.co.uk/cult/buffy/angel/episodes/two/page12.shtml',
 'bbc.co.uk/programmes/b00vrt97',
 'bbc.co.uk/programmes/b04tlj7g']

In [32]:
df.loc[df['domain'] == 'bbc.co.uk', 'URL_n'] = [re.sub('\?(ocid|intlink|print|ns_).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'bbc.co.uk', 'URL_n']]
df.loc[df['domain'] == 'bbc.co.uk', 'URL_n'] = [re.sub('#TWEET.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'bbc.co.uk', 'URL_n']]

In [33]:
list(df.loc[df['domain'] == 'bbc.co.uk', 'URL_n'])[0:5]

['bbc.co.uk/bbc7/comedy/progpages/pennydreadfuls.shtml',
 'bbc.co.uk/programmes/p0093q14',
 'bbc.co.uk/cult/buffy/angel/episodes/two/page12.shtml',
 'bbc.co.uk/programmes/b00vrt97',
 'bbc.co.uk/programmes/b04tlj7g']

### 5. theguardian.com

Some parameters are not useful.

In [34]:
list(df.loc[df['domain'] == 'theguardian.com', 'URL_n'])[0:5]

['theguardian.com/world/2011/dec/04/slovenia-election-ljubljana-mayor-wins',
 'theguardian.com/artanddesign/2011/oct/17/glasgow-turner-prize',
 'theguardian.com/media/2004/jun/25/broadcasting.ITV',
 'theguardian.com/theguardian/2009/mar/09/archive-1973-london-ira-bombs',
 'theguardian.com/northerner/idx/0,,2292170,00.html']

In [35]:
df.loc[df['domain'] == 'theguardian.com', 'URL_n'] = [re.sub('\?(INTCMP=|newsfeed=|cmp=|feed=).*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'theguardian.com', 'URL_n']]
df.loc[df['domain'] == 'theguardian.com', 'URL_n'] = [re.sub('\?(INTCMP=|newsfeed=|cmp=|feed=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'theguardian.com', 'URL_n']]

In [36]:
list(df.loc[df['domain'] == 'theguardian.com', 'URL_n'])[0:5]

['theguardian.com/world/2011/dec/04/slovenia-election-ljubljana-mayor-wins',
 'theguardian.com/artanddesign/2011/oct/17/glasgow-turner-prize',
 'theguardian.com/media/2004/jun/25/broadcasting.ITV',
 'theguardian.com/theguardian/2009/mar/09/archive-1973-london-ira-bombs',
 'theguardian.com/northerner/idx/0,,2292170,00.html']

### 6. census.gov

No changes required.

In [37]:
list(df.loc[df['domain'] == 'census.gov', 'URL_n'])[0:5]

['census.gov/', 'census.gov/', 'census.gov/', 'census.gov/', 'census.gov/']

### 7. news.bbc.co.uk

Some parameters are not useful.

In [38]:
list(df.loc[df['domain'] == 'news.bbc.co.uk', 'URL_n'])[0:5]

['news.bbc.co.uk/1/hi/england/lancashire/8367849.stm',
 'news.bbc.co.uk/sport1/hi/football/teams/q/qpr/6238710.stm',
 'news.bbc.co.uk/1/hi/england/west_midlands/7278713.stm',
 'news.bbc.co.uk/1/hi/8670037.stm',
 'news.bbc.co.uk/1/hi/england/lincolnshire/6116660.stm']

In [39]:
df.loc[df['domain'] == 'news.bbc.co.uk', 'URL_n'] = [re.sub('\?([a-zA-Z]{1,3}$|rss=.*|from=.*)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'news.bbc.co.uk', 'URL_n']]

In [40]:
list(df.loc[df['domain'] == 'news.bbc.co.uk', 'URL_n'])[0:5]

['news.bbc.co.uk/1/hi/england/lancashire/8367849.stm',
 'news.bbc.co.uk/sport1/hi/football/teams/q/qpr/6238710.stm',
 'news.bbc.co.uk/1/hi/england/west_midlands/7278713.stm',
 'news.bbc.co.uk/1/hi/8670037.stm',
 'news.bbc.co.uk/1/hi/england/lincolnshire/6116660.stm']

### 8. billboard.com

chart/?f= is used to select charts, it cannot be removed. There are also other parameters, such as order, begin page or rank, that can alter the page.

In [41]:
list(df.loc[df['domain'] == 'billboard.com', 'URL_n'])[0:5]

['billboard.com/articles/business/chart-beat/8540112/babymetal-metal-galaxy-no-1-top-rock-albums',
 'billboard.com/articles/columns/k-town/8046914/bts-steve-aoki-k-pop-drop-mic-drop-remix-desiigner-edm',
 'billboard.com/articles/news/8496682/chrissy-teigen-kanye-john-legend-drama-behindthetweets-video',
 'billboard.com/biz/articles/news/global/5645344/imagem-music-uk-sign-cathy-dennis-to-global-deal',
 'billboard.com/biz/articles/news/global/5645344/imagem-music-uk-sign-cathy-dennis-to-global-deal']

### 9. youtube.com

Specific time parameter and unuseful parameters are removed.

In [42]:
list(df.loc[df['domain'] == 'youtube.com', 'URL_n'])[0:5]

['youtube.com/watch?v=b9iBulMMfLI',
 'youtube.com/watch?v=PSFUW_rwkr8',
 'youtube.com/watch?v=TG7KBuzLqfE',
 'youtube.com/watch?v=iP3MuvYnDok',
 'youtube.com/watch?v=CDoT2pmtQYA']

In [43]:
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('\?t=.*&', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('\?t=.*$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('\?.*&v=', '?v=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('\?.*&search_query=', '?search_query=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('&.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('\?(flow|feat).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]
df.loc[df['domain'] == 'youtube.com', 'URL_n'] = [re.sub('(;t=|#t|#at).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'youtube.com', 'URL_n']]

In [44]:
list(df.loc[df['domain'] == 'youtube.com', 'URL_n'])[0:5]

['youtube.com/watch?v=b9iBulMMfLI',
 'youtube.com/watch?v=PSFUW_rwkr8',
 'youtube.com/watch?v=TG7KBuzLqfE',
 'youtube.com/watch?v=iP3MuvYnDok',
 'youtube.com/watch?v=CDoT2pmtQYA']

### 10. news.google.com

In this case, there is not a clear parameter, because if sjid, nid or id are removed the URL not works. For example in news.google.com/newspapers?nid=1873&dat=19610313&id=qpkoAAAAIBAJ&sjid=h8wEAAAAIBAJ&pg=6269,2359408. But the language (hl) and highlighted terms (dq) can be removed.

In [45]:
list(df.loc[df['domain'] == 'news.google.com', 'URL_n'])[0:5]

['news.google.com/newspapers?dq=prince-william-of-baden&hl=en&id=xaU9AAAAIBAJ&pg=5655,5310963&sjid=DykDAAAAIBAJ',
 'news.google.com/newspapers?dat=19100515&hl=en&nid=0oeUc68sgesC&printsec=frontpage',
 'news.google.com/newspapers?id=WvZUAAAAIBAJ&pg=4936,3495665&sjid=eJIDAAAAIBAJ',
 'news.google.com/newspapers?id=o4lDAAAAIBAJ&pg=3185,3405383&sjid=da4MAAAAIBAJ',
 'news.google.com/newspapers?dq=columbus+panhandles&hl=en&id=mypKAAAAIBAJ&pg=4140,5163320&sjid=-YUMAAAAIBAJ']

In [46]:
df.loc[df['domain'] == 'news.google.com', 'URL_n'] = [re.sub('(hl=.*?&|&hl=[a-zA-Z-]{2,6}$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'news.google.com', 'URL_n']]
df.loc[df['domain'] == 'news.google.com', 'URL_n'] = [re.sub('\?dq=.*?&', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'news.google.com', 'URL_n']]

In [47]:
list(df.loc[df['domain'] == 'news.google.com', 'URL_n'])[0:5]

['news.google.com/newspapers?id=xaU9AAAAIBAJ&pg=5655,5310963&sjid=DykDAAAAIBAJ',
 'news.google.com/newspapers?dat=19100515&nid=0oeUc68sgesC&printsec=frontpage',
 'news.google.com/newspapers?id=WvZUAAAAIBAJ&pg=4936,3495665&sjid=eJIDAAAAIBAJ',
 'news.google.com/newspapers?id=o4lDAAAAIBAJ&pg=3185,3405383&sjid=da4MAAAAIBAJ',
 'news.google.com/newspapers?id=mypKAAAAIBAJ&pg=4140,5163320&sjid=-YUMAAAAIBAJ']

### 11. newspapers.com

In this case some URLs can include parameters

In [48]:
list(df.loc[df['domain'] == 'newspapers.com', 'URL_n'])[0:5]

['newspapers.com/image/53909207',
 'newspapers.com/clip/38636482/hollywood',
 'newspapers.com/clip/7518073/charles_monett_obituary_oct_10_1888',
 'newspapers.com/clip/10887596/death_announcement_and_life_story_of',
 'newspapers.com/clip/12596707/det_free_press_20_jun_1896_sat_p3death']

In [49]:
df.loc[df['domain'] == 'newspapers.com', 'URL_n'] = [re.sub('(/[0-9].*/)([a-zA-Z0-9?].*$)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'newspapers.com', 'URL_n']]
df.loc[df['domain'] == 'newspapers.com', 'URL_n'] = [re.sub('fcfToken=.*?&', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'newspapers.com', 'URL_n']]
df.loc[df['domain'] == 'newspapers.com', 'URL_n'] = [re.sub('&fcfToken=.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'newspapers.com', 'URL_n']]
df.loc[df['domain'] == 'newspapers.com', 'URL_n'] = [re.sub('/$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'newspapers.com', 'URL_n']]

In [50]:
list(df.loc[df['domain'] == 'newspapers.com', 'URL_n'])[0:5]

['newspapers.com/image/53909207',
 'newspapers.com/clip/38636482',
 'newspapers.com/clip/7518073',
 'newspapers.com/clip/10887596',
 'newspapers.com/clip/12596707']

### 12. allmusic.com

One tag is removed.

In [51]:
list(df.loc[df['domain'] == 'allmusic.com', 'URL_n'])[0:5]

['allmusic.com/artist/the-spencer-davis-group-mn0000482600',
 'allmusic.com/artist/terry-gerry-mn0001807180/biography',
 'allmusic.com/album/new-mother-mw0000256935/credits',
 'allmusic.com/artist/big-l-p144340/charts-awards/billboard-singles',
 'allmusic.com/artist/cyro-baptista-mn0000630439/credits']

In [52]:
df.loc[df['domain'] == 'allmusic.com', 'URL_n'] = [re.sub('#no-js$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'allmusic.com', 'URL_n']]

In [53]:
list(df.loc[df['domain'] == 'allmusic.com', 'URL_n'])[0:5]

['allmusic.com/artist/the-spencer-davis-group-mn0000482600',
 'allmusic.com/artist/terry-gerry-mn0001807180/biography',
 'allmusic.com/album/new-mother-mw0000256935/credits',
 'allmusic.com/artist/big-l-p144340/charts-awards/billboard-singles',
 'allmusic.com/artist/cyro-baptista-mn0000630439/credits']

### 13. nla.gov.au

Some parameters are only use to highlight some terms.

In [54]:
list(df.loc[df['domain'] == 'nla.gov.au', 'URL_n'])[0:5]

['nla.gov.au/nla.party-607386',
 'nla.gov.au/nla.news-article2215553',
 'nla.gov.au/nla.news-article225613522',
 'nla.gov.au/nla.news-article32142885',
 'nla.gov.au/nla.news-article37697593']

In [55]:
df.loc[df['domain'] == 'nla.gov.au', 'URL_n'] = [re.sub('(#page|#pstart).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'nla.gov.au', 'URL_n']]
df.loc[df['domain'] == 'nla.gov.au', 'URL_n'] = [re.sub('(\?search|\?zoom|/view).*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'nla.gov.au', 'URL_n']]
df.loc[df['domain'] == 'nla.gov.au', 'URL_n'] = [re.sub('(\?search|\?zoom|/view).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'nla.gov.au', 'URL_n']]

In [56]:
list(df.loc[df['domain'] == 'nla.gov.au', 'URL_n'])[0:5]

['nla.gov.au/nla.party-607386',
 'nla.gov.au/nla.news-article2215553',
 'nla.gov.au/nla.news-article225613522',
 'nla.gov.au/nla.news-article32142885',
 'nla.gov.au/nla.news-article37697593']

### 14. washingtonpost.com

Some parameters are removed.

In [57]:
list(df.loc[df['domain'] == 'washingtonpost.com', 'URL_n'])[0:5]

['washingtonpost.com/wp-dyn/content/article/2007/08/22/AR2007082200970.html',
 'washingtonpost.com/news/wonkblog/wp/2015/10/01/the-hidden-inequality-of-who-dies-in-car-crashes',
 'washingtonpost.com/wp-srv/local/daily/april99/supermax18.htm',
 'washingtonpost.com/wp-dyn/content/article/2006/05/28/AR2006052801090.html',
 'washingtonpost.com/blogs/football-insider/wp/2013/11/08/chris-thompson-headed-for-ir-with-shoulder-surgery']

In [58]:
df.loc[df['domain'] == 'washingtonpost.com', 'URL_n'] = [re.sub('(/?)\?(language|noredirect|wprss|tid|sid|hpid|nav|sub|referrer|arc404|utm_term|wp).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'washingtonpost.com', 'URL_n']]

In [59]:
list(df.loc[df['domain'] == 'washingtonpost.com', 'URL_n'])[0:5]

['washingtonpost.com/wp-dyn/content/article/2007/08/22/AR2007082200970.html',
 'washingtonpost.com/news/wonkblog/wp/2015/10/01/the-hidden-inequality-of-who-dies-in-car-crashes',
 'washingtonpost.com/wp-srv/local/daily/april99/supermax18.htm',
 'washingtonpost.com/wp-dyn/content/article/2006/05/28/AR2006052801090.html',
 'washingtonpost.com/blogs/football-insider/wp/2013/11/08/chris-thompson-headed-for-ir-with-shoulder-surgery']

### 15. telegraph.co.uk

Some parameters are removed.

In [60]:
list(df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n'])[0:5]

['telegraph.co.uk/journalists/bernadette-mcnulty/3561197/The-Streets-will-make-only-one-more-album-says-Mike-Skinner.html',
 'telegraph.co.uk/news/worldnews/asia/southkorea/7890316/South-Korea-targets-Japanese-collaborators-descendants.html',
 'telegraph.co.uk/news/2018/10/01/north-south-korea-begin-removing-landmines-along-fortified-border',
 'telegraph.co.uk/sport/football/teams/bolton-wanderers/9786281/Bolton-Wanderers-manager-Dougie-Freedman-reveals-Marvin-Sordells-Twitter-obsession.html',
 'telegraph.co.uk/comment/telegraph-view/8474987/William-Morris-and-the-joy-of-giving.html']

In [61]:
df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n'] = [re.sub('\.html\?(utm_|WT|li_medium).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n']]
df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n'] = [re.sub('[/]{0,1}\?(utm_|WT|li_medium).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n']]
df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n'] = [re.sub('#disqus_thread$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n']]

In [62]:
list(df.loc[df['domain'] == 'telegraph.co.uk', 'URL_n'])[0:5]

['telegraph.co.uk/journalists/bernadette-mcnulty/3561197/The-Streets-will-make-only-one-more-album-says-Mike-Skinner.html',
 'telegraph.co.uk/news/worldnews/asia/southkorea/7890316/South-Korea-targets-Japanese-collaborators-descendants.html',
 'telegraph.co.uk/news/2018/10/01/north-south-korea-begin-removing-landmines-along-fortified-border',
 'telegraph.co.uk/sport/football/teams/bolton-wanderers/9786281/Bolton-Wanderers-manager-Dougie-Freedman-reveals-Marvin-Sordells-Twitter-obsession.html',
 'telegraph.co.uk/comment/telegraph-view/8474987/William-Morris-and-the-joy-of-giving.html']

### 16. espncricinfo.com

In this case, if ".html" is removed, the URL does not work, but it allows to unify. Only specific parameters can be removed.

In [63]:
list(df.loc[df['domain'] == 'espncricinfo.com', 'URL_n'])[0:5]

['espncricinfo.com/ci/engine/match/65167.html',
 'espncricinfo.com/ci/engine/match/67264.html',
 'espncricinfo.com/magazine/content/story/438795.html',
 'espncricinfo.com/story/_/id/18289545/steven-smith-owns-edge-97',
 'espncricinfo.com/wcldiv4/content/story/473698.html?object=460181']

In [64]:
df.loc[df['domain'] == 'espncricinfo.com', 'URL_n'] = [re.sub('(\.html\?template|\.html\?view|\.html\?cmp|\.html\?version|\.html\?comments|\.html\?ex_cid|\.html\?innings|\.html\?index).*#', '.html#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'espncricinfo.com', 'URL_n']]
df.loc[df['domain'] == 'espncricinfo.com', 'URL_n'] = [re.sub('(\.html\?template|\.html\?view|\.html\?cmp|\.html\?version|\.html\?comments|\.html\?ex_cid|\.html\?innings|\.html\?index).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'espncricinfo.com', 'URL_n']]

In [65]:
list(df.loc[df['domain'] == 'espncricinfo.com', 'URL_n'])[0:5]

['espncricinfo.com/ci/engine/match/65167.html',
 'espncricinfo.com/ci/engine/match/67264.html',
 'espncricinfo.com/magazine/content/story/438795.html',
 'espncricinfo.com/story/_/id/18289545/steven-smith-owns-edge-97',
 'espncricinfo.com/wcldiv4/content/story/473698.html?object=460181']

### 17. imdb.com

Here sometimes the same resource are linked but pointing to different sections of it, for example a specific year of awards or the cast of a movie.

In [66]:
list(df.loc[df['domain'] == 'imdb.com', 'URL_n'])[0:5]

['imdb.com/title/tt0166593/awards',
 'imdb.com/title/tt6080232/?ref_=adv_li_tt',
 'imdb.com/title/tt6409234/?ref_=adv_li_tt',
 'imdb.com/title/tt5593606/?ref_=nm_flmg_dr_1',
 'imdb.com/title/tt5604272/combined']

In [67]:
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('\?(ref_|ref)=[a-z0-9_]+&', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('&(ref_|ref)=[a-z0-9_]+&', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('(\?|&)(ref_|ref)=[a-z0-9_]+#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('(\?|&)(ref_|ref)=[a-z0-9_]+$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('\?mode=[a-z]+&', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('&mode=[a-z]+&', '&', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('(\?|&)mode=[a-z]+#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('(\?|&)mode=[a-z]+$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]
df.loc[df['domain'] == 'imdb.com', 'URL_n'] = [re.sub('\?(&+)', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'imdb.com', 'URL_n']]

In [68]:
list(df.loc[df['domain'] == 'imdb.com', 'URL_n'])[0:5]

['imdb.com/title/tt0166593/awards',
 'imdb.com/title/tt6080232/',
 'imdb.com/title/tt6409234/',
 'imdb.com/title/tt5593606/',
 'imdb.com/title/tt5604272/combined']

### 18. independent.co.uk

If ".hmtl" is removed, in some cases, the URL does not work, but it makes possible to unify URL.

In [69]:
list(df.loc[df['domain'] == 'independent.co.uk', 'URL_n'])[0:5]

['independent.co.uk/arts-entertainment/books/news/delight-as-lost-enid-blyton-book-is-discovered-2222818.html',
 'independent.co.uk/news/education/education-news/new-head-master-of-eton-is-an-old-boy-678919.html',
 'independent.co.uk/sport/football/premier-league/newcastle-scramble-to-find-robson-replacement-558471.html',
 'independent.co.uk/sport/football/worldcup/stuart-pearce-takes-positives-from-under21-defeat-2136476.html']

In [70]:
df.loc[df['domain'] == 'independent.co.uk', 'URL_n'] = [re.sub('\.html\?(amp|CMP|print|origin|dkdkd|service|r=).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'independent.co.uk', 'URL_n']]

In [71]:
list(df.loc[df['domain'] == 'independent.co.uk', 'URL_n'])[0:5]

['independent.co.uk/arts-entertainment/books/news/delight-as-lost-enid-blyton-book-is-discovered-2222818.html',
 'independent.co.uk/news/education/education-news/new-head-master-of-eton-is-an-old-boy-678919.html',
 'independent.co.uk/sport/football/premier-league/newcastle-scramble-to-find-robson-replacement-558471.html',
 'independent.co.uk/sport/football/worldcup/stuart-pearce-takes-positives-from-under21-defeat-2136476.html']

### 19. variety.com

The use of parameters looks associated to an old web version. Parameters included in the *variety.com/index.asp* URL cannot be removed as well as category. In some cases if the category parameter is removed it doesn't work.

In [72]:
list(df.loc[df['domain'] == 'variety.com', 'URL_n'])[0:5]

['variety.com/review/VE1117934225.html?categoryid=31&cs=1',
 'variety.com/2017/film/news/2018-sag-award-nominations-list-nominees-1202638033',
 'variety.com/2017/film/news/steven-spielberg-tom-hanks-meryl-streep-oscars-1202392948',
 'variety.com/2018/film/news/saturn-awards-nominations-2018-black-panther-walking-dead-1202727752',
 'variety.com/2018/music/news/drake-scorpion-michael-jackson-vocal-newborn-son-1202861457']

In [None]:
#df.loc[df['domain'] == 'variety.com', 'URL_n'] = [re.sub('\.html\?.*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'variety.com', 'URL_n']]
#df.loc[df['domain'] == 'variety.com', 'URL_n'] = [re.sub('([0-9]|/)(\?.*)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'variety.com', 'URL_n']]
#df.loc[df['domain'] == 'variety.com', 'URL_n'] = [re.sub('\?(refcatid|categoryid|printerfriendly).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'variety.com', 'URL_n']]

In [73]:
list(df.loc[df['domain'] == 'variety.com', 'URL_n'])[0:5]

['variety.com/review/VE1117934225.html?categoryid=31&cs=1',
 'variety.com/2017/film/news/2018-sag-award-nominations-list-nominees-1202638033',
 'variety.com/2017/film/news/steven-spielberg-tom-hanks-meryl-streep-oscars-1202392948',
 'variety.com/2018/film/news/saturn-awards-nominations-2018-black-panther-walking-dead-1202727752',
 'variety.com/2018/music/news/drake-scorpion-michael-jackson-vocal-newborn-son-1202861457']

### 20. sports-reference.com

No changes required.

In [74]:
list(df.loc[df['domain'] == 'sports-reference.com', 'URL_n'])[0:5]

['sports-reference.com/olympics/countries/GDR/summer/1968/CAN',
 'sports-reference.com/olympics/countries/GBR/summer/1980/ROW',
 'sports-reference.com/olympics/athletes/gr/louis-gressier-1.html',
 'sports-reference.com/olympics/athletes/bo/marcel-boigegrain-1.html',
 'sports-reference.com/olympics/athletes/bo/renate-boy-garisch-culmberger-1.html']

### 21. itunes.apple.com

There are some parameters that could be included. mt and l (language) looks unuseful whereas i is for specific album songs. #see-all and #fullText would be removed.

In [75]:
list(df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'])[0:5]

['itunes.apple.com/us/album/son-of-a-pimp-pt-2/1106027496?i=1106028125',
 'itunes.apple.com/us/album/all-mine-single/1166331833',
 'itunes.apple.com/us/album/bad-bitch-commandments/1443985955',
 'itunes.apple.com/us/album/dreamland/1091810836',
 'itunes.apple.com/au/album/monstercat-5-year-anniversary/1130112867']

In [76]:
df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'] = [re.sub('(\?mt=|&mt=|\?l=|&l=).*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itunes.apple.com', 'URL_n']]
df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'] = [re.sub('(\?mt=|&mt=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itunes.apple.com', 'URL_n']]
df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'] = [re.sub('(\?l=|&l=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itunes.apple.com', 'URL_n']]
df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'] = [re.sub('\?app.*?&', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itunes.apple.com', 'URL_n']]
df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'] = [re.sub('\?app.*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itunes.apple.com', 'URL_n']]
df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'] = [re.sub('\?app.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itunes.apple.com', 'URL_n']]

In [77]:
list(df.loc[df['domain'] == 'itunes.apple.com', 'URL_n'])[0:5]

['itunes.apple.com/us/album/son-of-a-pimp-pt-2/1106027496?i=1106028125',
 'itunes.apple.com/us/album/all-mine-single/1166331833',
 'itunes.apple.com/us/album/bad-bitch-commandments/1443985955',
 'itunes.apple.com/us/album/dreamland/1091810836',
 'itunes.apple.com/au/album/monstercat-5-year-anniversary/1130112867']

### 22. tvbythenumbers.zap2it.com

No changes required.

In [78]:
list(df.loc[df['domain'] == 'tvbythenumbers.zap2it.com', 'URL_n'])[0:5]

['tvbythenumbers.zap2it.com/2010/09/10/nikita-takes-aim-at-record-ratings-for-the-cw-matching-all-time-highs-in-time-period-with-women-18-34/62886',
 'tvbythenumbers.zap2it.com/dvr-ratings/week-33-broadcast-live-7-ratings-may-1-7-2017',
 'tvbythenumbers.zap2it.com/2015/04/09/wednesday-final-ratings-survivor-the-voice-the-middle-criminal-minds-csicyber-and-the-goldbergs-adjusted-up/386711',
 'tvbythenumbers.zap2it.com/2015/06/02/sunday-cable-ratings-game-of-thrones-tops-night-keeping-up-with-the-kardashians-silicon-valley-naked-and-afraid-more/411032',
 'tvbythenumbers.zap2it.com/2016/05/09/broadcast-live-7-ratings-april-18-24-2016']

### 23. timesofindia.indiatimes.com

Some parameters are removed.

In [79]:
list(df.loc[df['domain'] == 'timesofindia.indiatimes.com', 'URL_n'])[0:5]

['timesofindia.indiatimes.com/entertainment/hindi/movie-details/3-dev/movieshow/63499260.cms',
 'timesofindia.indiatimes.com/entertainment/telugu/movie-reviews/neevevaro/movie-review/65525405.cms',
 'timesofindia.indiatimes.com/city/chennai/Bid-to-re-release-Sivaji-classic-ends-up-in-court/articleshow/15485560.cms',
 'timesofindia.indiatimes.com/entertainment/hindi/bollywood/news/Motherland-turns-25/articleshow/3090758.cms',
 'timesofindia.indiatimes.com/entertainment/telugu/movies/news/Preserving-Tollywoods-timeless-classics/articleshow/30226239.cms']

In [80]:
df.loc[df['domain'] == 'timesofindia.indiatimes.com', 'URL_n'] = [re.sub('\.cms\?(referral=|intenttarget=|null|from=).*', '.cms', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'timesofindia.indiatimes.com', 'URL_n']]
df.loc[df['domain'] == 'timesofindia.indiatimes.com', 'URL_n'] = [re.sub('\.cms(#write$|#ixzz.*)', '.cms', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'timesofindia.indiatimes.com', 'URL_n']]

In [81]:
list(df.loc[df['domain'] == 'timesofindia.indiatimes.com', 'URL_n'])[0:5]

['timesofindia.indiatimes.com/entertainment/hindi/movie-details/3-dev/movieshow/63499260.cms',
 'timesofindia.indiatimes.com/entertainment/telugu/movie-reviews/neevevaro/movie-review/65525405.cms',
 'timesofindia.indiatimes.com/city/chennai/Bid-to-re-release-Sivaji-classic-ends-up-in-court/articleshow/15485560.cms',
 'timesofindia.indiatimes.com/entertainment/hindi/bollywood/news/Motherland-turns-25/articleshow/3090758.cms',
 'timesofindia.indiatimes.com/entertainment/telugu/movies/news/Preserving-Tollywoods-timeless-classics/articleshow/30226239.cms']

### 24. baseball-reference.com

No changes required. Some # have change and they do not work.

In [82]:
list(df.loc[df['domain'] == 'baseball-reference.com', 'URL_n'])[0:5]

['baseball-reference.com/leagues/MLB/misc.shtml',
 'baseball-reference.com/players/j/johnssi01.shtml',
 'baseball-reference.com/players/t/taylosa02.shtml',
 'baseball-reference.com/bullpen/Clear_Lake_Fish_Eaters',
 'baseball-reference.com/bullpen/Pearisburg,_VA']

### 25. bbc.com

The main unuseful parameters are removed.

In [83]:
list(df.loc[df['domain'] == 'bbc.com', 'URL_n'])[0:5]

['bbc.com/news/blogs-trending-41384799',
 'bbc.com/news/health-52018477',
 'bbc.com/sport/0/football/34667042',
 'bbc.com/sport/football/16632432',
 'bbc.com/news/uk-england-beds-bucks-herts-42037832']

In [84]:
df.loc[df['domain'] == 'bbc.com', 'URL_n'] = [re.sub('\?(ocid|intlink|print|ns_).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'bbc.com', 'URL_n']]
df.loc[df['domain'] == 'bbc.com', 'URL_n'] = [re.sub('#TWEET.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'bbc.com', 'URL_n']]

In [85]:
list(df.loc[df['domain'] == 'bbc.com', 'URL_n'])[0:5]

['bbc.com/news/blogs-trending-41384799',
 'bbc.com/news/health-52018477',
 'bbc.com/sport/0/football/34667042',
 'bbc.com/sport/football/16632432',
 'bbc.com/news/uk-england-beds-bucks-herts-42037832']

### 26. stat.gov.pl

The index is fixed.

In [86]:
list(df.loc[df['domain'] == 'stat.gov.pl', 'URL_n'])[0:5] 

['stat.gov.pl/download/gfx/portalinformacyjny/pl/defaultaktualnosci/5468/6/21/1/stan_i_struktura_ludno_oraz_ruch_naturalny_w_przekroju_teryt_stan-na-31-12-2016.pdf',
 'stat.gov.pl/en/topics/population/population/population-size-and-structure-and-vital-statistics-in-poland-by-territorial-divison-in-2019-as-of-30th-june,3,26.html',
 'stat.gov.pl/en/topics/population/population/population-size-and-structure-and-vital-statistics-in-poland-by-territorial-divison-in-2019-as-of-30th-june,3,26.html',
 'stat.gov.pl/broker/access/prefile/listPreFiles.jspa',
 'stat.gov.pl/broker/access/prefile/listPreFiles.jspa']

In [87]:
df.loc[df['domain'] == 'stat.gov.pl', 'URL_n'] = [re.sub('\.html\?p_name.*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'stat.gov.pl', 'URL_n']]

In [88]:
list(df.loc[df['domain'] == 'stat.gov.pl', 'URL_n'])[0:5] 

['stat.gov.pl/download/gfx/portalinformacyjny/pl/defaultaktualnosci/5468/6/21/1/stan_i_struktura_ludno_oraz_ruch_naturalny_w_przekroju_teryt_stan-na-31-12-2016.pdf',
 'stat.gov.pl/en/topics/population/population/population-size-and-structure-and-vital-statistics-in-poland-by-territorial-divison-in-2019-as-of-30th-june,3,26.html',
 'stat.gov.pl/en/topics/population/population/population-size-and-structure-and-vital-statistics-in-poland-by-territorial-divison-in-2019-as-of-30th-june,3,26.html',
 'stat.gov.pl/broker/access/prefile/listPreFiles.jspa',
 'stat.gov.pl/broker/access/prefile/listPreFiles.jspa']

### 27. hollywoodreporter.com

URLs with *article_display.jsp* can not be transformed.

In [89]:
list(df.loc[df['domain'] == 'hollywoodreporter.com', 'URL_n'])[0:5]

['hollywoodreporter.com/hr/content_display/news/e3i06056b3e43453484224e31d8724c2a46',
 'hollywoodreporter.com/news/syfy-produce-killjoys-drama-canadas-700145',
 'hollywoodreporter.com/heat-vision/jungle-book-2-works-jon-882624',
 'hollywoodreporter.com/heat-vision/mulan-disney-casts-yoson-an-as-love-interest-live-action-movie-1116949',
 'hollywoodreporter.com/behind-screen/vfx-house-milk-raises-25-million-names-ivan-dunleavy-chairman-1014135']

In [90]:
df.loc[df['domain'] == 'hollywoodreporter.com', 'URL_n'] = [re.sub('([0-9a-zA-Z-]{4})(\?(mobile_redirect|facebook).*)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'hollywoodreporter.com', 'URL_n']]

In [91]:
list(df.loc[df['domain'] == 'hollywoodreporter.com', 'URL_n'])[0:5]

['hollywoodreporter.com/hr/content_display/news/e3i06056b3e43453484224e31d8724c2a46',
 'hollywoodreporter.com/news/syfy-produce-killjoys-drama-canadas-700145',
 'hollywoodreporter.com/heat-vision/jungle-book-2-works-jon-882624',
 'hollywoodreporter.com/heat-vision/mulan-disney-casts-yoson-an-as-love-interest-live-action-movie-1116949',
 'hollywoodreporter.com/behind-screen/vfx-house-milk-raises-25-million-names-ivan-dunleavy-chairman-1014135']

### 28. twitter.com

Some parameters are removed.

In [92]:
list(df.loc[df['domain'] == 'twitter.com', 'URL_n'])[0:5]

['twitter.com/braintreetownfc/status/1183642849692278785',
 'twitter.com/CrayValleyPM/status/1028652342479085568',
 'twitter.com/RoxannWhitebean/status/819790339581972481',
 'twitter.com/SpringHillEnt/status/1042488739832913920',
 'twitter.com/WORLDMUSICAWARD/status/987705869944217601']

In [93]:
df.loc[df['domain'] == 'twitter.com', 'URL_n'] = [re.sub('\?.*q=', '?q=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'twitter.com', 'URL_n']]
df.loc[df['domain'] == 'twitter.com', 'URL_n'] = [re.sub('\?(lang|ref|s=|p=|tw_e=|dec).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'twitter.com', 'URL_n']]
df.loc[df['domain'] == 'twitter.com', 'URL_n'] = [re.sub('\.html$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'twitter.com', 'URL_n']]

In [94]:
list(df.loc[df['domain'] == 'twitter.com', 'URL_n'])[0:5]

['twitter.com/braintreetownfc/status/1183642849692278785',
 'twitter.com/CrayValleyPM/status/1028652342479085568',
 'twitter.com/RoxannWhitebean/status/819790339581972481',
 'twitter.com/SpringHillEnt/status/1042488739832913920',
 'twitter.com/WORLDMUSICAWARD/status/987705869944217601']

### 29. deadline.com

Some parameters are removed.

In [95]:
list(df.loc[df['domain'] == 'deadline.com', 'URL_n'])[0:5]

['deadline.com/2013/12/legendary-acquires-asylum-entertainment',
 'deadline.com/2010/05/full-series-rankings-for-the-2009-10-broadcast-season-44277',
 'deadline.com/2016/12/fx-buys-autobiographical-comedy-robert-kelly-denis-leary-1201863758',
 'deadline.com/2017/06/lilly-singh-fahrenheit-451-cast-hbo-movie-1202114653',
 'deadline.com/2017/10/naomi-watts-the-wolf-hour-movie-alistair-banks-griffin-1202192036']

In [96]:
df.loc[df['domain'] == 'deadline.com', 'URL_n'] = [re.sub('(/?)\?(iframe|_escaped_fragment).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'deadline.com', 'URL_n']]

In [97]:
list(df.loc[df['domain'] == 'deadline.com', 'URL_n'])[0:5]

['deadline.com/2013/12/legendary-acquires-asylum-entertainment',
 'deadline.com/2010/05/full-series-rankings-for-the-2009-10-broadcast-season-44277',
 'deadline.com/2016/12/fx-buys-autobiographical-comedy-robert-kelly-denis-leary-1201863758',
 'deadline.com/2017/06/lilly-singh-fahrenheit-451-cast-hbo-movie-1202114653',
 'deadline.com/2017/10/naomi-watts-the-wolf-hour-movie-alistair-banks-griffin-1202192036']

### 30. animenewsnetwork.com

No changes required.

In [98]:
list(df.loc[df['domain'] == 'animenewsnetwork.com', 'URL_n'])[0:5]

['animenewsnetwork.com/review/witchblade/dvd-1',
 'animenewsnetwork.com/news/2002-04-19/media-blasters-acquires-babel-ii',
 'animenewsnetwork.com/news/2013-06-20/genshiken/2nd-season-manga-to-bundle-original-anime-dvd',
 'animenewsnetwork.com/news/2014-09-25/assassination-classroom-tv-anime-casts-jun-fukuyama-tomokazu-sugita-shizuka-itou/.79189',
 'animenewsnetwork.com/news/2015-05-10/undefeated-bahamut-chronicle-light-novel-series-gets-tv-anime/.88013']

### 31. reuters.com

Here is more complex to identify the key parameters. 

In [99]:
list(df.loc[df['domain'] == 'reuters.com', 'URL_n'])[0:5]

['reuters.com/article/2014/04/24/us-thailand-protest-idUSBREA3N0BJ20140424',
 'reuters.com/article/ousiv/idUSTRE57R3B920090828',
 'reuters.com/article/newsOne/idUSTRE50C5JX20090113',
 'reuters.com/article/us-apple-ipad/apple-ipad-reaches-1-million-sales-faster-than-iphone-idUSTRE64002T20100503',
 'reuters.com/article/2014/09/20/us-hongkong-china-idUSKBN0HF0MR20140920']

In [100]:
df.loc[df['domain'] == 'reuters.com', 'URL_n'] = [re.sub('(irpc=|sp=|rpc=|feedtype=|feedname=|il=|locale=|type=|virtualbrandchannel=|view=).*?(&|$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'reuters.com', 'URL_n']]
df.loc[df['domain'] == 'reuters.com', 'URL_n'] = [re.sub('(#[a-zA-Z0-9]+\.[0-9]{2}$|#targetText.*)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'reuters.com', 'URL_n']]
df.loc[df['domain'] == 'reuters.com', 'URL_n'] = [re.sub('(\?$|\?&$|&$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'reuters.com', 'URL_n']]

In [101]:
list(df.loc[df['domain'] == 'reuters.com', 'URL_n'])[0:5]

['reuters.com/article/2014/04/24/us-thailand-protest-idUSBREA3N0BJ20140424',
 'reuters.com/article/ousiv/idUSTRE57R3B920090828',
 'reuters.com/article/newsOne/idUSTRE50C5JX20090113',
 'reuters.com/article/us-apple-ipad/apple-ipad-reaches-1-million-sales-faster-than-iphone-idUSTRE64002T20100503',
 'reuters.com/article/2014/09/20/us-hongkong-china-idUSKBN0HF0MR20140920']

### 32. thehindu.com

Some parameters are removed.

In [102]:
list(df.loc[df['domain'] == 'thehindu.com', 'URL_n'])[0:5]

['thehindu.com/entertainment/music/living-his-dream/article19209189.ece',
 'thehindu.com/news/cities/Kochi/when-kochi-developed-these-banking-institutions-stood-as-pillars-of-support/article4526767.ece',
 'thehindu.com/sci-tech/health/breakthrough-dentistry/article5076022.ece',
 'thehindu.com/todays-paper/tp-features/tp-cinemaplus/why-i-like-server-sundaram/article3021498.ece',
 'thehindu.com/features/cinema/wellserved/article5512081.ece']

In [103]:
df.loc[df['domain'] == 'thehindu.com', 'URL_n'] = [re.sub('\?(homepage|css|sec|test|ref|_escaped_fragment_).*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'thehindu.com', 'URL_n']]
df.loc[df['domain'] == 'thehindu.com', 'URL_n'] = [re.sub('\?(homepage|css|sec|test|ref|_escaped_fragment_).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'thehindu.com', 'URL_n']]

In [104]:
list(df.loc[df['domain'] == 'thehindu.com', 'URL_n'])[0:5]

['thehindu.com/entertainment/music/living-his-dream/article19209189.ece',
 'thehindu.com/news/cities/Kochi/when-kochi-developed-these-banking-institutions-stood-as-pillars-of-support/article4526767.ece',
 'thehindu.com/sci-tech/health/breakthrough-dentistry/article5076022.ece',
 'thehindu.com/todays-paper/tp-features/tp-cinemaplus/why-i-like-server-sundaram/article3021498.ece',
 'thehindu.com/features/cinema/wellserved/article5512081.ece']

### 33. cricketarchive.com

A very few URLs include parameters, but it appears be the same.

In [105]:
list(df.loc[df['domain'] == 'cricketarchive.com', 'URL_n'])[0:5]

['cricketarchive.com/Archive/Scorecards/21/21454.html',
 'cricketarchive.com/Archive/Scorecards/15/15483.html',
 'cricketarchive.com/Archive/Articles/0/792.html',
 'cricketarchive.com/Archive/Scorecards/14/14110.html',
 'cricketarchive.com/Archive/Players/1350/1350473/1350473.html']

### 34. articles.latimes.com

No changes required.

In [106]:
list(df.loc[df['domain'] == 'articles.latimes.com', 'URL_n'])[0:5]

['articles.latimes.com/1997-02-19/food/fo-30082_1_italian-wines',
 'articles.latimes.com/2000/may/02/news/mn-25793',
 'articles.latimes.com/1989-04-30/news/mn-2867_1_fsx-japan-s-ambassador-nobuo-matsunaga-japan-s-mitsubishi-heavy-industries',
 'articles.latimes.com/1996-03-13/entertainment/ca-46373_1_swift-justice',
 'articles.latimes.com/1996-11-26/sports/sp-2970_1_pitt-coach']

### 35. discogs.com

The main parameters are anv, filter_anv, noanv (these are used to link with alternative names) and release.

In [107]:
list(df.loc[df['domain'] == 'discogs.com', 'URL_n'])[0:5]

['discogs.com/Mariah-Carey-All-I-Want-For-Christmas-Is-You/release/2504461',
 'discogs.com/Elton-John-Captain-Fantastic-And-The-Brown-Dirt-Cowboy/release/2175776',
 'discogs.com/Ilaiyaraaja-Raja-Paarvai/release/10735362',
 'discogs.com/release/10776167-American-Dream/images',
 'discogs.com/Hooton-Tennis-Club-Kathleen-Sat-On-The-Arm-Of-Her-Favourite-Chair/release/6920006']

In [108]:
df.loc[df['domain'] == 'discogs.com', 'URL_n'] = [re.sub('(layout=).*?(&|$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'discogs.com', 'URL_n']]
df.loc[df['domain'] == 'discogs.com', 'URL_n'] = [re.sub('(\?$|\?&$|&$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'discogs.com', 'URL_n']]

In [109]:
list(df.loc[df['domain'] == 'discogs.com', 'URL_n'])[0:5]

['discogs.com/Mariah-Carey-All-I-Want-For-Christmas-Is-You/release/2504461',
 'discogs.com/Elton-John-Captain-Fantastic-And-The-Brown-Dirt-Cowboy/release/2175776',
 'discogs.com/Ilaiyaraaja-Raja-Paarvai/release/10735362',
 'discogs.com/release/10776167-American-Dream/images',
 'discogs.com/Hooton-Tennis-Club-Kathleen-Sat-On-The-Arm-Of-Her-Favourite-Chair/release/6920006']

### 36. officialcharts.com

No changes required.

In [110]:
list(df.loc[df['domain'] == 'officialcharts.com', 'URL_n'])[0:5]

['officialcharts.com/artists',
 'officialcharts.com/charts/albums-chart/19780702/7502',
 'officialcharts.com/artist/16720/buzzcocks',
 'officialcharts.com/search/singles/going%20down%20to%20liverpool',
 'officialcharts.com/charts/physical-singles-chart/20100418/1']

### 37. metacritic.com

There are several parameters (filter, q, sort, page, ref, ftag, part, tag...) but only a few can be removed.

In [111]:
list(df.loc[df['domain'] == 'metacritic.com', 'URL_n'])[0:5]

['metacritic.com/movie/x-men-origins-wolverine',
 'metacritic.com/game/3ds/brain-age-concentration-training',
 'metacritic.com/game/pc/warcraft-iii-reign-of-chaos',
 'metacritic.com/game/pc/doki-doki-literature-club!',
 'metacritic.com/movie/miss-americana']

In [112]:
df.loc[df['domain'] == 'metacritic.com', 'URL_n'] = [re.sub('\?q=.*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'metacritic.com', 'URL_n']]
df.loc[df['domain'] == 'metacritic.com', 'URL_n'] = [re.sub('\?(q=|part=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'metacritic.com', 'URL_n']]

In [113]:
list(df.loc[df['domain'] == 'metacritic.com', 'URL_n'])[0:5]

['metacritic.com/movie/x-men-origins-wolverine',
 'metacritic.com/game/3ds/brain-age-concentration-training',
 'metacritic.com/game/pc/warcraft-iii-reign-of-chaos',
 'metacritic.com/game/pc/doki-doki-literature-club!',
 'metacritic.com/movie/miss-americana']

### 38. abc.net.au

Some parameters are removed.

In [114]:
list(df.loc[df['domain'] == 'abc.net.au', 'URL_n'])[0:5]

["abc.net.au/news/2016-08-30/protesters-arrested-in-kalgoorlie-riot-after-teen's-death/7797804",
 'abc.net.au/lateline/interview-mark-tedeschi-qc,-new-south-wales/6906896',
 'abc.net.au/news/2011-06-10/spinnaker-island-weed-burn/2754288',
 'abc.net.au/news/2012-08-23/government-announces-increase-in-humanitarian-intake/4217962',
 "abc.net.au/news/2016-09-01/plans-for-new-suburb-in-melbourne's-inner-north-unveiled/7804530"]

In [115]:
df.loc[df['domain'] == 'abc.net.au', 'URL_n'] = [re.sub('\.htm\?(site|section).*#', '.htm#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'abc.net.au', 'URL_n']]
df.loc[df['domain'] == 'abc.net.au', 'URL_n'] = [re.sub('\.htm\?(site|section).*', '.htm', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'abc.net.au', 'URL_n']]
df.loc[df['domain'] == 'abc.net.au', 'URL_n'] = [re.sub('(/\?|\?)(site|section).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'abc.net.au', 'URL_n']]

In [116]:
list(df.loc[df['domain'] == 'abc.net.au', 'URL_n'])[0:5]

["abc.net.au/news/2016-08-30/protesters-arrested-in-kalgoorlie-riot-after-teen's-death/7797804",
 'abc.net.au/lateline/interview-mark-tedeschi-qc,-new-south-wales/6906896',
 'abc.net.au/news/2011-06-10/spinnaker-island-weed-burn/2754288',
 'abc.net.au/news/2012-08-23/government-announces-increase-in-humanitarian-intake/4217962',
 "abc.net.au/news/2016-09-01/plans-for-new-suburb-in-melbourne's-inner-north-unveiled/7804530"]

### 39. books.google.co.uk

These URLs require a lot of transformations to be reduced.

In [117]:
list(df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'])[0:5]

['books.google.co.uk/books?id=Qn6vBwAAQBAJ&pg=PA116#v%3Donepage%26q%26f%3Dfalse',
 'books.google.co.uk/books?id=WSLUAwAAQBAJ&pg=PA5',
 'books.google.co.uk/books?id=Pv4BuiVPe_YC&pg=PA31',
 'books.google.co.uk/books?id=TsrT1et_iCAC&pg=PA133',
 'books.google.co.uk/books?id=b_kTU8LfcqAC&lpg=PA335&pg=PA335']

In [118]:
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('\?.*&id=', '?id=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('(\?id=.*?&)(.*)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('\?.*&q=', '?q=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('ngrams/graph\?.*&content=', 'ngrams/graph?content=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('&.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('/books.*\?id=', '/?id=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('/books.*\?vid=', '/?vid=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('/books.*\?q=', '/?q=', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]
df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'] = [re.sub('#(v|search).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'books.google.co.uk', 'URL_n']]

In [119]:
list(df.loc[df['domain'] == 'books.google.co.uk', 'URL_n'])[0:5]

['books.google.co.uk/?id=Qn6vBwAAQBAJ',
 'books.google.co.uk/?id=WSLUAwAAQBAJ',
 'books.google.co.uk/?id=Pv4BuiVPe_YC',
 'books.google.co.uk/?id=TsrT1et_iCAC',
 'books.google.co.uk/?id=b_kTU8LfcqAC']

### 40. facebook.com

Some parameters are removed.

In [120]:
list(df.loc[df['domain'] == 'facebook.com', 'URL_n'])[0:5]

['facebook.com/NowThisPolitics/videos/2058599324171547/UzpfSTEwMDAxNjUyOTIwNTg3MzoyNTA1Njc2Nzg4Mzc1Mzc/?id=100016529205873',
 'facebook.com/teamarmstrongcurling/posts/1061917074201489?__tn__=-R',
 'facebook.com/groups/FriendsofMike',
 'facebook.com/themeparkarchive/photos/a.755965204471431.1073741829.754391131295505/1159264867474794/?theater&type=1',
 'facebook.com/fcpafos/photos/a.386976711496854.1073741829.386700621524463/850246115169909/?theater&type=3']

In [121]:
df.loc[df['domain'] == 'facebook.com', 'URL_n'] = [re.sub('\?(theater|theatre|stream_).*?(&|$)', '?', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'facebook.com', 'URL_n']]
df.loc[df['domain'] == 'facebook.com', 'URL_n'] = [re.sub('(__tn__=|pnref=|fref=|type=|notif_t=|permpage=|total_comments=).*?(&|$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'facebook.com', 'URL_n']]
df.loc[df['domain'] == 'facebook.com', 'URL_n'] = [re.sub('(\?$|\?&$|&$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'facebook.com', 'URL_n']]

In [122]:
list(df.loc[df['domain'] == 'facebook.com', 'URL_n'])[0:5]

['facebook.com/NowThisPolitics/videos/2058599324171547/UzpfSTEwMDAxNjUyOTIwNTg3MzoyNTA1Njc2Nzg4Mzc1Mzc/?id=100016529205873',
 'facebook.com/teamarmstrongcurling/posts/1061917074201489',
 'facebook.com/groups/FriendsofMike',
 'facebook.com/themeparkarchive/photos/a.755965204471431.1073741829.754391131295505/1159264867474794/',
 'facebook.com/fcpafos/photos/a.386976711496854.1073741829.386700621524463/850246115169909/']

### 41. cbc.ca

Some parameters are removed.

In [123]:
list(df.loc[df['domain'] == 'cbc.ca', 'URL_n'])[0:5]

['cbc.ca/news/canada/montreal/montreal-street-to-be-named-daisy-peterson-sweeney-1.4250647',
 'cbc.ca/sports/soccer/whitecaps-fire-head-coach-martin-rennie-1.2274991',
 'cbc.ca/news/canada/saskatchewan/moe-federal-election-1.5282354?__vfz=medium=sharebar',
 'cbc.ca/news/canada/toronto/line-1-subway-update-1.4861277',
 'cbc.ca/news/canada/story/2011/07/07/f-vp-valpy-young-royals.html']

In [124]:
df.loc[df['domain'] == 'cbc.ca', 'URL_n'] = [re.sub('#ixzz.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'cbc.ca', 'URL_n']]
df.loc[df['domain'] == 'cbc.ca', 'URL_n'] = [re.sub('\.html\?(__vfz=|autoplay=|rss=|cmp=|ref=|print=|r=).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'cbc.ca', 'URL_n']]
df.loc[df['domain'] == 'cbc.ca', 'URL_n'] = [re.sub('\?(__vfz=|autoplay=|rss=|cmp=|ref=|print=|r=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'cbc.ca', 'URL_n']]

In [125]:
list(df.loc[df['domain'] == 'cbc.ca', 'URL_n'])[0:5]

['cbc.ca/news/canada/montreal/montreal-street-to-be-named-daisy-peterson-sweeney-1.4250647',
 'cbc.ca/sports/soccer/whitecaps-fire-head-coach-martin-rennie-1.2274991',
 'cbc.ca/news/canada/saskatchewan/moe-federal-election-1.5282354',
 'cbc.ca/news/canada/toronto/line-1-subway-update-1.4861277',
 'cbc.ca/news/canada/story/2011/07/07/f-vp-valpy-young-royals.html']

### 42. amazon.com

ref= and keywords= can be removed not in all cases.

In [126]:
list(df.loc[df['domain'] == 'amazon.com', 'URL_n'])[0:5]

['amazon.com/Sympathy-for-the-Devil-HD/dp/B003XI83H8',
 'amazon.com/Family-Business-Carl-Weber/dp/1601627092/ref=la_B001IQZOJ2_1_2?ie=UTF8&qid=1456022103&s=books&sr=1-2',
 'amazon.com/Chickenshit-Club-Department-Prosecute-Executives/dp/1501121367',
 'amazon.com/Crapitalism-Liberals-Millions-Swiping-Dollars/dp/1476750424/ref=sr_1_1?ie=UTF8&keywords=crapitalism+liberals+who+make+millions+swiping+your+tax+dollars&qid=1505862438&s=books&sr=1-1',
 'amazon.com/Spook-Mary-Roach-Sep-2006/dp/B00D820F2A/ref=sr_1_2?ie=UTF8&keywords=spook+roach&qid=1533166658&s=books&sr=1-2']

In [127]:
df.loc[df['domain'] == 'amazon.com', 'URL_n'] = [re.sub('(\?ie=|\?_encoding=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'amazon.com', 'URL_n']]
df.loc[df['domain'] == 'amazon.com', 'URL_n'] = [re.sub('([0-9a-z]{2})(/ref=.*)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'amazon.com', 'URL_n']]

In [128]:
list(df.loc[df['domain'] == 'amazon.com', 'URL_n'])[0:5]

['amazon.com/Sympathy-for-the-Devil-HD/dp/B003XI83H8',
 'amazon.com/Family-Business-Carl-Weber/dp/1601627092',
 'amazon.com/Chickenshit-Club-Department-Prosecute-Executives/dp/1501121367',
 'amazon.com/Crapitalism-Liberals-Millions-Swiping-Dollars/dp/1476750424',
 'amazon.com/Spook-Mary-Roach-Sep-2006/dp/B00D820F2A']

### 43. espn.com

No changes required.

In [129]:
list(df.loc[df['domain'] == 'espn.com', 'URL_n'])[0:5]

['espn.com/blog/nflnation/post/_/id/220441/raiders-prove-they-are-the-real-deal-with-thumping-of-broncos',
 'espn.com/golf/story/_/id/22996264/drive-chip-putt-winners-crowned-augusta-national-ahead-masters',
 'espn.com/mens-college-basketball/story/_/id/15160931/georgia-tech-yellow-jackets-hire-josh-pastner-head-coach',
 'espn.com/mlb/story/_/id/25938532/reliever-brad-boxberger-agrees-deal-kansas-city-royals',
 'espn.com/nba/playoffs/2016/story/_/id/16351029/lebron-james-cleveland-cavaliers-named-unanimous-nba-finals-mvp']

### 44. latimes.com

Some parameters are removed.

In [130]:
list(df.loc[df['domain'] == 'latimes.com', 'URL_n'])[0:5]

['latimes.com/local/lanow/la-me-ln-kansas-swatting-records-20180126-story.html',
 'latimes.com/world/asia/la-fg-japan-abe-wins-downbeat-election-20141214-story.html#page%3D1',
 'latimes.com/entertainment/envelope/tv/la-et-st-emmy-awards-2015-list-nominees-winners-story.html',
 'latimes.com/local/obituaries/la-me-foch7-2008dec07-story.html',
 'latimes.com/news/nationworld/world/la-fg-israel-katsav-20101231,0,4453115.story']

In [131]:
df.loc[df['domain'] == 'latimes.com', 'URL_n'] = [re.sub('(#axzz|#ixzz).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'latimes.com', 'URL_n']]
df.loc[df['domain'] == 'latimes.com', 'URL_n'] = [re.sub('\.html\?(col|track|cset|_amp|outputType|barc|dlvrit).*#pag', '.html#pag', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'latimes.com', 'URL_n']]
df.loc[df['domain'] == 'latimes.com', 'URL_n'] = [re.sub('\.html\?(col|track|cset|_amp|outputType|barc|dlvrit).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'latimes.com', 'URL_n']]
df.loc[df['domain'] == 'latimes.com', 'URL_n'] = [re.sub('(\?|\.html\?)(col|track|cset|_amp|outputType|barc|dlvrit).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'latimes.com', 'URL_n']]

In [132]:
list(df.loc[df['domain'] == 'latimes.com', 'URL_n'])[0:5]

['latimes.com/local/lanow/la-me-ln-kansas-swatting-records-20180126-story.html',
 'latimes.com/world/asia/la-fg-japan-abe-wins-downbeat-election-20141214-story.html#page%3D1',
 'latimes.com/entertainment/envelope/tv/la-et-st-emmy-awards-2015-list-nominees-winners-story.html',
 'latimes.com/local/obituaries/la-me-foch7-2008dec07-story.html',
 'latimes.com/news/nationworld/world/la-fg-israel-katsav-20101231,0,4453115.story']

### 45. usatoday.com 

Some parameters are removed.

In [133]:
list(df.loc[df['domain'] == 'usatoday.com', 'URL_n'])[0:5]

['usatoday.com/story/money/cars/2014/08/08/gm-victims-fund-death-claims/13792833',
 'usatoday.com/sports/soccer/2004-11-11-asked-harkes_x.htm',
 'usatoday.com/story/sports/nfl/redskins/2014/09/14/robert-griffin-iii-rg3-rgiii-left-ankle-cast-jaguars-washington/15629039',
 'usatoday.com/news/washington/2011-01-12-defense12_ST_N.htm',
 'usatoday.com/sports/olympics/2009-06-12-1009615793_x.htm']

In [134]:
df.loc[df['domain'] == 'usatoday.com', 'URL_n'] = [re.sub('\.htm\?(POE=|loc|csp=|dlvrit=).*#', '.htm#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'usatoday.com', 'URL_n']]
df.loc[df['domain'] == 'usatoday.com', 'URL_n'] = [re.sub('\.htm\?(POE=|loc|csp=|dlvrit=).*', '.htm', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'usatoday.com', 'URL_n']]
df.loc[df['domain'] == 'usatoday.com', 'URL_n'] = [re.sub('(/\?|/1\?)(POE=|loc|csp=|dlvrit=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'usatoday.com', 'URL_n']]

In [135]:
list(df.loc[df['domain'] == 'usatoday.com', 'URL_n'])[0:5]

['usatoday.com/story/money/cars/2014/08/08/gm-victims-fund-death-claims/13792833',
 'usatoday.com/sports/soccer/2004-11-11-asked-harkes_x.htm',
 'usatoday.com/story/sports/nfl/redskins/2014/09/14/robert-griffin-iii-rg3-rgiii-left-ankle-cast-jaguars-washington/15629039',
 'usatoday.com/news/washington/2011-01-12-defense12_ST_N.htm',
 'usatoday.com/sports/olympics/2009-06-12-1009615793_x.htm']

### 46. rollingstone.com

Similar to hollywoodreporter.

In [136]:
list(df.loc[df['domain'] == 'rollingstone.com', 'URL_n'])[0:5]

['rollingstone.com/news/story/5938755/the_vines/print',
 'rollingstone.com/music/albumreviews/carly-simon-19710401',
 'rollingstone.com/music/albumreviews/the-open-door-20061005',
 'rollingstone.com/music/features/how-weezers-pinkerton-went-from-embarrassing-to-essential-w441144',
 'rollingstone.com/music/features/the-rolling-stone-interview-keith-richard-20150123']

In [137]:
df.loc[df['domain'] == 'rollingstone.com', 'URL_n'] = [re.sub('#ixzz.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'rollingstone.com', 'URL_n']]
df.loc[df['domain'] == 'rollingstone.com', 'URL_n'] = [re.sub('\?source.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'rollingstone.com', 'URL_n']]

In [138]:
list(df.loc[df['domain'] == 'rollingstone.com', 'URL_n'])[0:5]

['rollingstone.com/news/story/5938755/the_vines/print',
 'rollingstone.com/music/albumreviews/carly-simon-19710401',
 'rollingstone.com/music/albumreviews/the-open-door-20061005',
 'rollingstone.com/music/features/how-weezers-pinkerton-went-from-embarrassing-to-essential-w441144',
 'rollingstone.com/music/features/the-rolling-stone-interview-keith-richard-20150123']

### 47. smh.com.au

Some parameters are removed.

In [139]:
list(df.loc[df['domain'] == 'smh.com.au', 'URL_n'])[0:5]

['smh.com.au/small-business/startup/making-sweet-sweet-music-20130711-2pt59.html',
 'smh.com.au/world/denmarks-princess-alexandra-to-remarry-20070208-gdpfh4.html',
 'smh.com.au/entertainment/music/national-treasure-made-music-until-the-end-20100715-10clx.html',
 'smh.com.au/environment/whale-watch/another-ocean-giant-meets-a-tragic-end-20120208-1rkfj.html',
 'smh.com.au/national/a-star-is-forlorn-ageing-theatre-queen-cries-out-for-a-little-makeup-20080808-3sem.html']

In [140]:
df.loc[df['domain'] == 'smh.com.au', 'URL_n'] = [re.sub('#ixzz.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'smh.com.au', 'URL_n']]
df.loc[df['domain'] == 'smh.com.au', 'URL_n'] = [re.sub('\.html\?(from=|skin=|oneclick=|s_cid=|autostart=|rand=|feed=|fbclid=|deviceType=|ref=).*#', '.html#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'smh.com.au', 'URL_n']]
df.loc[df['domain'] == 'smh.com.au', 'URL_n'] = [re.sub('\.html\?(from=|skin=|oneclick=|s_cid=|autostart=|rand=|feed=|fbclid=|deviceType=|ref=).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'smh.com.au', 'URL_n']]

In [141]:
list(df.loc[df['domain'] == 'smh.com.au', 'URL_n'])[0:5]

['smh.com.au/small-business/startup/making-sweet-sweet-music-20130711-2pt59.html',
 'smh.com.au/world/denmarks-princess-alexandra-to-remarry-20070208-gdpfh4.html',
 'smh.com.au/entertainment/music/national-treasure-made-music-until-the-end-20100715-10clx.html',
 'smh.com.au/environment/whale-watch/another-ocean-giant-meets-a-tragic-end-20120208-1rkfj.html',
 'smh.com.au/national/a-star-is-forlorn-ageing-theatre-queen-cries-out-for-a-little-makeup-20080808-3sem.html']

### 48. int.soccerway.com

One parameter is removed.

In [142]:
list(df.loc[df['domain'] == 'int.soccerway.com', 'URL_n'])[0:5]

['int.soccerway.com/matches/2016/09/03/england/conference-n--s/boston-united-fc/fc-united-of-manchester/2264825',
 'int.soccerway.com/matches/2016/11/09/england/football-league-trophy/carlisle-united-fc/fleetwood-town-fc/2319442',
 'int.soccerway.com/players/frazer-shaw/412906',
 'int.soccerway.com/matches/2010/03/20/england/league-two/hereford-united-fc/bradford-city-afc/783838',
 'int.soccerway.com/players/stefan-mols/454421']

In [143]:
df.loc[df['domain'] == 'int.soccerway.com', 'URL_n'] = [re.sub('/\?icid=.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'int.soccerway.com', 'URL_n']]

In [144]:
list(df.loc[df['domain'] == 'int.soccerway.com', 'URL_n'])[0:5]

['int.soccerway.com/matches/2016/09/03/england/conference-n--s/boston-united-fc/fc-united-of-manchester/2264825',
 'int.soccerway.com/matches/2016/11/09/england/football-league-trophy/carlisle-united-fc/fleetwood-town-fc/2319442',
 'int.soccerway.com/players/frazer-shaw/412906',
 'int.soccerway.com/matches/2010/03/20/england/league-two/hereford-united-fc/bradford-city-afc/783838',
 'int.soccerway.com/players/stefan-mols/454421']

### 49. forbes.com

Some paremeters such as list= can be removed but the page changes.

In [145]:
list(df.loc[df['domain'] == 'forbes.com', 'URL_n'])[0:5]

['forbes.com/sites/halahtouryalai/2011/10/12/volcker-rule-is-out-how-much-will-it-hurt',
 'forbes.com/enterprisetech/2004/08/04/cz_dl_0804sco.html',
 'forbes.com/sites/matthewnewton/2011/09/08/hypebeast-founder-talks-influencers-and-authenticity/#45acafe87f69',
 'forbes.com/sites/andygreenberg/2011/02/11/palantir-apologizes-for-wikileaks-attack-proposal-cuts-ties-with-hbgary',
 'forbes.com/2008/02/28/most-expensive-watches-forbeslife-time08-cx_nr_0229watch_slide.html']

In [146]:
df.loc[df['domain'] == 'forbes.com', 'URL_n'] = [re.sub('(/*)#[a-z0-9]{12}.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'forbes.com', 'URL_n']]
df.loc[df['domain'] == 'forbes.com', 'URL_n'] = [re.sub('\.html\?(thisSpeed=|c=|boxes=).*#', '.html#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'forbes.com', 'URL_n']]
df.loc[df['domain'] == 'forbes.com', 'URL_n'] = [re.sub('\.html\?(thisSpeed=|c=|boxes=).*', '.html', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'forbes.com', 'URL_n']]
df.loc[df['domain'] == 'forbes.com', 'URL_n'] = [re.sub('(/*)\?(thisSpeed=|c=|boxes=).*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'forbes.com', 'URL_n']]
df.loc[df['domain'] == 'forbes.com', 'URL_n'] = [re.sub('(/*)\?(thisSpeed=|c=|boxes=).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'forbes.com', 'URL_n']]

In [147]:
list(df.loc[df['domain'] == 'forbes.com', 'URL_n'])[0:5]

['forbes.com/sites/halahtouryalai/2011/10/12/volcker-rule-is-out-how-much-will-it-hurt',
 'forbes.com/enterprisetech/2004/08/04/cz_dl_0804sco.html',
 'forbes.com/sites/matthewnewton/2011/09/08/hypebeast-founder-talks-influencers-and-authenticity',
 'forbes.com/sites/andygreenberg/2011/02/11/palantir-apologizes-for-wikileaks-attack-proposal-cuts-ties-with-hbgary',
 'forbes.com/2008/02/28/most-expensive-watches-forbeslife-time08-cx_nr_0229watch_slide.html']

### 50. cnn.com

Some parameters are removed.

In [148]:
list(df.loc[df['domain'] == 'cnn.com', 'URL_n'])[0:5]

['cnn.com/SHOWBIZ/9605/29/imposssibles',
 'cnn.com/videos/tv/2019/03/02/exp-gps-0303-kroenig-on-trumps-approach.cnn',
 'cnn.com/2004/LAW/10/26/malvo.plea/index.html',
 'cnn.com/2017/06/16/politics/john-dowd-lawyer-donald-trump/index.html',
 'cnn.com/2015/08/11/health/trumps-comb-over/index.html']

In [149]:
df.loc[df['domain'] == 'cnn.com', 'URL_n'] = [re.sub('(\?|&)hpt=[a-z0-9_]+$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'cnn.com', 'URL_n']]
df.loc[df['domain'] == 'cnn.com', 'URL_n'] = [re.sub('(\?|&)(eref|iref|section|_S)=.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'cnn.com', 'URL_n']]

In [150]:
list(df.loc[df['domain'] == 'cnn.com', 'URL_n'])[0:5]

['cnn.com/SHOWBIZ/9605/29/imposssibles',
 'cnn.com/videos/tv/2019/03/02/exp-gps-0303-kroenig-on-trumps-approach.cnn',
 'cnn.com/2004/LAW/10/26/malvo.plea/index.html',
 'cnn.com/2017/06/16/politics/john-dowd-lawyer-donald-trump/index.html',
 'cnn.com/2015/08/11/health/trumps-comb-over/index.html']

### 51. wikidata.org

Minor changes related to the language.

In [151]:
list(df.loc[df['domain'] == 'wikidata.org', 'URL_n'])[0:5]

['wikidata.org/wiki/Q63848025',
 'wikidata.org/wiki/Help:Sources#Scientific%2C_newspaper_or_magazine_article',
 'wikidata.org/wiki/Q3554335',
 'wikidata.org/wiki/Property:P2026',
 'wikidata.org/wiki/Q54899015']

In [152]:
df.loc[df['domain'] == 'wikidata.org', 'URL_n'] = [re.sub('\?uselang=.*#', '#', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'wikidata.org', 'URL_n']]
df.loc[df['domain'] == 'wikidata.org', 'URL_n'] = [re.sub('\?uselang=.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'wikidata.org', 'URL_n']]

In [153]:
list(df.loc[df['domain'] == 'wikidata.org', 'URL_n'])[0:5]

['wikidata.org/wiki/Q63848025',
 'wikidata.org/wiki/Help:Sources#Scientific%2C_newspaper_or_magazine_article',
 'wikidata.org/wiki/Q3554335',
 'wikidata.org/wiki/Property:P2026',
 'wikidata.org/wiki/Q54899015']

### 52. worldcat.org

Some tags are removed.

In [154]:
list(df[df['domain'] == 'worldcat.org']['URL_n'])[0:5]

['worldcat.org/oclc/40856843',
 'worldcat.org/title/suma-povrs-i-zupci-u-hercegovini/oclc/28403830',
 'worldcat.org/title/regensburger-domorganisten-zum-150-todestag-von-carl-proske-1794-1861-und-zum-80-geburtstag-von-eberhard-kraus-1931-2003-ausstellung-in-der-bischoflichen-zentralbibliothek-regensburg-st-petersweg-11-13-20-mai-bis-22-juli-2011/oclc/734051627&referer=brief_results',
 'worldcat.org/oclc/884013323',
 'worldcat.org/oclc/61758538']

In [155]:
df.loc[df['domain'] == 'worldcat.org', 'URL_n'] = [re.sub('(referer=|tab=|ht=|qt=|submit=).*?(&|$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'worldcat.org', 'URL_n']]
df.loc[df['domain'] == 'worldcat.org', 'URL_n'] = [re.sub('(\?$|\?&+$|&+$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'worldcat.org', 'URL_n']]

In [156]:
list(df[df['domain'] == 'worldcat.org']['URL_n'])[0:5]

['worldcat.org/oclc/40856843',
 'worldcat.org/title/suma-povrs-i-zupci-u-hercegovini/oclc/28403830',
 'worldcat.org/title/regensburger-domorganisten-zum-150-todestag-von-carl-proske-1794-1861-und-zum-80-geburtstag-von-eberhard-kraus-1931-2003-ausstellung-in-der-bischoflichen-zentralbibliothek-regensburg-st-petersweg-11-13-20-mai-bis-22-juli-2011/oclc/734051627',
 'worldcat.org/oclc/884013323',
 'worldcat.org/oclc/61758538']

### 53. viaf.org

Some tags are removed.

In [157]:
list(df[df['domain'] == 'viaf.org']['URL_n'])[0:5]

['viaf.org/viaf/9770413/#Kim%2C_My%C5%8Fng-ch%CA%BB%C5%8Fl%2C_1944-',
 'viaf.org/viaf/315862217',
 'viaf.org/viaf/14920296',
 'viaf.org/viaf/44363103',
 'viaf.org/viaf/311398419/#Galula-Ericson%2C_Magda_%281929-_%29']

In [158]:
df.loc[df['domain'] == 'viaf.org', 'URL_n'] = [re.sub('&sortkeys=.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'viaf.org', 'URL_n']]
df.loc[df['domain'] == 'viaf.org', 'URL_n'] = [re.sub('/#.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'viaf.org', 'URL_n']]

In [159]:
list(df[df['domain'] == 'viaf.org']['URL_n'])[0:5]

['viaf.org/viaf/9770413',
 'viaf.org/viaf/315862217',
 'viaf.org/viaf/14920296',
 'viaf.org/viaf/44363103',
 'viaf.org/viaf/311398419']

### 54. jstor.org

Some tags are removed.

In [160]:
list(df.loc[df['domain'] == 'jstor.org', 'URL_n'])[0:5]

['jstor.org/stable/j.ctv75db7c',
 'jstor.org/',
 'jstor.org/stable/3661092',
 'jstor.org/stable/pdf/43462546.pdf?seq=1',
 'jstor.org/action/doBasicSearch?Query=au:%22Joseph+D.+Reid%22+&acc=off&fc=off&group=none&wc=on']

In [161]:
df.loc[df['domain'] == 'jstor.org', 'URL_n'] = [re.sub('([0-9])(\?(seq=|sid=|cookieSet=|origin=|ref=).*)', r'\1', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'jstor.org', 'URL_n']]
df.loc[df['domain'] == 'jstor.org', 'URL_n'] = [re.sub('#(page|meta|fndtn).*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'jstor.org', 'URL_n']]

In [162]:
list(df.loc[df['domain'] == 'jstor.org', 'URL_n'])[0:5]

['jstor.org/stable/j.ctv75db7c',
 'jstor.org/',
 'jstor.org/stable/3661092',
 'jstor.org/stable/pdf/43462546.pdf?seq=1',
 'jstor.org/action/doBasicSearch?Query=au:%22Joseph+D.+Reid%22+&acc=off&fc=off&group=none&wc=on']

### 55. id.loc.gov

No changes required.

In [163]:
list(df.loc[df['domain'] == 'id.loc.gov', 'URL_n'])[0:5]

['id.loc.gov/authorities/names/nr97035180.html',
 'id.loc.gov/authorities/names/n79052939.html',
 'id.loc.gov/authorities/names/nr2002015127.html',
 'id.loc.gov/authorities/names/n84198664.html',
 'id.loc.gov/authorities/names/n88030903.html']

### 56. ssd.jpl.nasa.gov

No changes required.

In [164]:
list(df.loc[df['domain'] == 'ssd.jpl.nasa.gov', 'URL_n'])[0:5]

['ssd.jpl.nasa.gov/sbdb.cgi?sstr=610;cad=1',
 'ssd.jpl.nasa.gov/sbdb.cgi?sstr=2015DQ224;cad=1#cad',
 'ssd.jpl.nasa.gov/sbdb.cgi?sstr=yi-SWAN;orb=1;cov=0;log=0;cad=0#orb',
 'ssd.jpl.nasa.gov/sbdb.cgi?sstr=126',
 'ssd.jpl.nasa.gov/sbdb.cgi?sstr=C/2007+K5']

### 57. minorplanetcenter.net

One minor tag removed.

In [165]:
list(df.loc[df['domain'] == 'minorplanetcenter.net', 'URL_n'])[0:5]

['minorplanetcenter.net/db_search/show_object?object_id=52246',
 'minorplanetcenter.net/iau/ECS/MPCArchive/MPCArchive_TBL.html',
 'minorplanetcenter.net/iau/ECS/MPCArchive/MPCArchive_TBL.html',
 'minorplanetcenter.net/db_search/show_object?object_id=51985',
 'minorplanetcenter.net/mpec/K18/K18O11.html']

In [166]:
df.loc[df['domain'] == 'minorplanetcenter.net', 'URL_n'] = [re.sub('commit=show&', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'minorplanetcenter.net', 'URL_n']]

In [167]:
list(df.loc[df['domain'] == 'minorplanetcenter.net', 'URL_n'])[0:5]

['minorplanetcenter.net/db_search/show_object?object_id=52246',
 'minorplanetcenter.net/iau/ECS/MPCArchive/MPCArchive_TBL.html',
 'minorplanetcenter.net/iau/ECS/MPCArchive/MPCArchive_TBL.html',
 'minorplanetcenter.net/db_search/show_object?object_id=51985',
 'minorplanetcenter.net/mpec/K18/K18O11.html']

### 58. isni.org

No changes required.

In [168]:
list(df.loc[df['domain'] == 'isni.org', 'URL_n'])[0:5]

['isni.org/isni/0000000079077342',
 'isni.org/isni/0000000045012479',
 'isni.org/content/isni-members',
 'isni.org/isni/0000000122839155',
 'isni.org/isni_and_orcid']

### 59. ncbi.nlm.nih.gov

One minor tag removed

In [169]:
list(df.loc[df['domain'] == 'ncbi.nlm.nih.gov', 'URL_n'])[0:5]

['ncbi.nlm.nih.gov/books/NBK470280',
 'ncbi.nlm.nih.gov/books/NBK21054',
 'ncbi.nlm.nih.gov/books/NBK201460',
 'ncbi.nlm.nih.gov/pubmed/14602247',
 'ncbi.nlm.nih.gov/pmc/articles/PMC6784102']

In [170]:
df.loc[df['domain'] == 'ncbi.nlm.nih.gov', 'URL_n'] = [re.sub('/\?page.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'ncbi.nlm.nih.gov', 'URL_n']]

In [171]:
list(df.loc[df['domain'] == 'ncbi.nlm.nih.gov', 'URL_n'])[0:5]

['ncbi.nlm.nih.gov/books/NBK470280',
 'ncbi.nlm.nih.gov/books/NBK21054',
 'ncbi.nlm.nih.gov/books/NBK201460',
 'ncbi.nlm.nih.gov/pubmed/14602247',
 'ncbi.nlm.nih.gov/pmc/articles/PMC6784102']

### 60. d-nb.info

No changes required.

In [172]:
list(df.loc[df['domain'] == 'd-nb.info', 'URL_n'])[0:5]

['d-nb.info/97558300X/about/html',
 'd-nb.info/946113483',
 'd-nb.info/gnd/124696945',
 'd-nb.info/1026265320/34',
 'd-nb.info/750425016']

### 61. gbif.org

No changes required.

In [173]:
list(df.loc[df['domain'] == 'gbif.org', 'URL_n'])[0:5]

['gbif.org/species/8964145',
 'gbif.org/species/156101076',
 'gbif.org/species/2118627',
 'gbif.org/species/6003314',
 'gbif.org/species/2020839']

### 62. api.semanticscholar.org

No changes required.

In [174]:
list(df.loc[df['domain'] == 'api.semanticscholar.org', 'URL_n'])[0:5]

['api.semanticscholar.org/CorpusID:33224883',
 'api.semanticscholar.org/CorpusID:1749330',
 'api.semanticscholar.org/CorpusID:120313863']

### 63. musicbrainz.org

No changes required.

In [175]:
list(df.loc[df['domain'] == 'musicbrainz.org', 'URL_n'])[0:5]

['musicbrainz.org/release/dca8e8d9-7ff2-4b0b-9520-52158724e0e0',
 'musicbrainz.org/release/dd1e6496-4112-4196-98d0-5c49dddebb88',
 'musicbrainz.org/work/7acb03e5-c341-41bf-a6e4-86cfa8ee5f06',
 'musicbrainz.org/release/aeb18fcf-8450-4358-8fa7-ba6a909c3ce8.html',
 'musicbrainz.org/release/45d067fe-d523-4e7a-b9f3-79ae8ca9c12b']

### 64. data.bnf.fr

No changes required.

In [176]:
list(df.loc[df['domain'] == 'data.bnf.fr', 'URL_n'])[0:5]

['data.bnf.fr/17056759/gustave_flaxland_editeur',
 'data.bnf.fr/en/14852692/amelie_legallois',
 'data.bnf.fr/10720282/edmond_de_martimprey',
 'data.bnf.fr/11117206/yves_le_trocquer',
 'data.bnf.fr/fr/11900968/bernard_dufosse']

### 65. catalogue.bnf.fr

No changes required.

In [177]:
list(df.loc[df['domain'] == 'catalogue.bnf.fr', 'URL_n'])[0:5]

['catalogue.bnf.fr/ark:/12148/cb12237668c',
 'catalogue.bnf.fr/ark:/12148/cb34418441f',
 'catalogue.bnf.fr/ark:/12148/cb125334311',
 'catalogue.bnf.fr/ark:/12148/cb11907656t',
 'catalogue.bnf.fr/ark:/12148/cb322671006/ISBD']

### 66. npgallery.nps.gov

One minor tag removed

In [178]:
list(df.loc[df['domain'] == 'npgallery.nps.gov', 'URL_n'])[0:5]

['npgallery.nps.gov/AssetDetail/NRIS/02000094',
 'npgallery.nps.gov/AssetDetail/NRIS/85001968',
 'npgallery.nps.gov/AssetDetail/6196891a-c658-40aa-9b52-ec622dae4202',
 'npgallery.nps.gov/GetAsset?assetID=8bd4eb28-fafe-4ed8-8c5a-c0b07e883c88',
 'npgallery.nps.gov/AssetDetail/NRIS/08001327%22']

In [179]:
df.loc[df['domain'] == 'npgallery.nps.gov', 'URL_n'] = [re.sub('#page.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'npgallery.nps.gov', 'URL_n']]

In [180]:
list(df.loc[df['domain'] == 'npgallery.nps.gov', 'URL_n'])[0:5]

['npgallery.nps.gov/AssetDetail/NRIS/02000094',
 'npgallery.nps.gov/AssetDetail/NRIS/85001968',
 'npgallery.nps.gov/AssetDetail/6196891a-c658-40aa-9b52-ec622dae4202',
 'npgallery.nps.gov/GetAsset?assetID=8bd4eb28-fafe-4ed8-8c5a-c0b07e883c88',
 'npgallery.nps.gov/AssetDetail/NRIS/08001327%22']

### 67. amigo.geneontology.org

One minor error is fixed.

In [181]:
list(df.loc[df['domain'] == 'amigo.geneontology.org', 'URL_n'])[0:5]

['amigo.geneontology.org/amigo/term/GO:0004252',
 'amigo.geneontology.org/',
 'amigo.geneontology.org/cgi-bin/amigo/blast.cgi',
 'amigo.geneontology.org/amigo/gene_product/UniProtKB:P68032',
 'amigo.geneontology.org/cgi-bin/amigo/term-assoc.cgi?action=filter&evcode=all&gptype=all&speciesdb=all&taxid=9606&term=GO:0035253&term_assocs=all']

In [182]:
df.loc[df['domain'] == 'amigo.geneontology.org', 'URL_n'] = [re.sub(',$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'amigo.geneontology.org', 'URL_n']]

In [183]:
list(df.loc[df['domain'] == 'amigo.geneontology.org', 'URL_n'])[0:5]

['amigo.geneontology.org/amigo/term/GO:0004252',
 'amigo.geneontology.org/',
 'amigo.geneontology.org/cgi-bin/amigo/blast.cgi',
 'amigo.geneontology.org/amigo/gene_product/UniProtKB:P68032',
 'amigo.geneontology.org/cgi-bin/amigo/term-assoc.cgi?action=filter&evcode=all&gptype=all&speciesdb=all&taxid=9606&term=GO:0035253&term_assocs=all']

### 68. irmng.org

No changes required.

In [184]:
list(df.loc[df['domain'] == 'irmng.org', 'URL_n'])[0:5]

['irmng.org/aphia.php?id=10234335&p=taxdetails',
 'irmng.org/aphia.php?id=1110082&p=taxdetails',
 'irmng.org/aphia.php?id=111246&p=taxdetails',
 'irmng.org/',
 'irmng.org/aphia.php?id=1297141&p=taxdetails']

### 69. ui.adsabs.harvard.edu

By default it goes to "abstract".

In [185]:
list(df.loc[df['domain'] == 'ui.adsabs.harvard.edu', 'URL_n'])[0:5]

['ui.adsabs.harvard.edu/abs/1993A&A...268..714C',
 'ui.adsabs.harvard.edu/2018AGUFM.P12A..06H/abstract',
 'ui.adsabs.harvard.edu/2013EGUGA..1512869C/abstract',
 'ui.adsabs.harvard.edu/abs/1913HarCi.179....1L/abstract',
 'ui.adsabs.harvard.edu/2003LRR.....6....5S/abstract']

In [186]:
df.loc[df['domain'] == 'ui.adsabs.harvard.edu', 'URL_n'] = [re.sub('/abstract$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'ui.adsabs.harvard.edu', 'URL_n']]

In [187]:
list(df.loc[df['domain'] == 'ui.adsabs.harvard.edu', 'URL_n'])[0:5]

['ui.adsabs.harvard.edu/abs/1993A&A...268..714C',
 'ui.adsabs.harvard.edu/2018AGUFM.P12A..06H',
 'ui.adsabs.harvard.edu/2013EGUGA..1512869C',
 'ui.adsabs.harvard.edu/abs/1913HarCi.179....1L',
 'ui.adsabs.harvard.edu/2003LRR.....6....5S']

### 70. inaturalist.org

No changes required.

In [188]:
list(df.loc[df['domain'] == 'inaturalist.org', 'URL_n'])[0:5]

['inaturalist.org/observations?place_id=1&taxon_id=123634',
 'inaturalist.org/observations/1630409',
 'inaturalist.org/taxa/141102-Dioscorea-orizabensis',
 'inaturalist.org/taxa/634314-Pyrops-karenius',
 'inaturalist.org/assessments/474-hippocampus-histrix']

### 71. eol.org

One parameter is removed.

In [189]:
list(df.loc[df['domain'] == 'eol.org', 'URL_n'])[0:5]

['eol.org/pages/80348/overview',
 'eol.org/pages/3775992/overview',
 'eol.org/pages/3200189/overview',
 'eol.org/pages/226633/overview',
 'eol.org/pages/3832567/overview']

In [190]:
df.loc[df['domain'] == 'eol.org', 'URL_n'] = [re.sub('\?category_id.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'eol.org', 'URL_n']]

In [191]:
list(df.loc[df['domain'] == 'eol.org', 'URL_n'])[0:5]

['eol.org/pages/80348/overview',
 'eol.org/pages/3775992/overview',
 'eol.org/pages/3200189/overview',
 'eol.org/pages/226633/overview',
 'eol.org/pages/3832567/overview']

### 72. idref.fr

No changes required.

In [192]:
list(df.loc[df['domain'] == 'idref.fr', 'URL_n'])[0:5]

['idref.fr/026930528',
 'idref.fr/03111170X',
 'idref.fr/073946540',
 'idref.fr/030573068',
 'idref.fr/056983204']

### 73. data.bibliotheken.nl

No changes required.

In [193]:
list(df.loc[df['domain'] == 'data.bibliotheken.nl', 'URL_n'])[0:5]

['data.bibliotheken.nl/doc/thes/p069373124',
 'data.bibliotheken.nl/doc/thes/p070549419']

### 74. id.worldcat.org

No changes required.

In [194]:
list(df.loc[df['domain'] == 'id.worldcat.org', 'URL_n'])[0:5]

['id.worldcat.org/fast/1532152']

### 75. aleph.nkp.cz

No changes required.

In [195]:
list(df.loc[df['domain'] == 'aleph.nkp.cz', 'URL_n'])[0:5]

['aleph.nkp.cz/F/?ccl_term=ica=mzk2009502406&func=find-c&local_base=aut',
 'aleph.nkp.cz/F/?CON_LNG=ENG&ccl_term=ica=ola2012681060&func=find-c&local_base=aut',
 'aleph.nkp.cz/F/1A499YXG83AQYGS6GL8T6XVVNGI11LQDMDX2UKVNMHQ4ETXT9N-12379?func=short-jump&jump=000001',
 'aleph.nkp.cz/F/Q1R11TLHQUE9GRN8DS5LLCKB7BXMG8NNNEKTTMKCBFL4DRXIFU-21478?acc_sequence=000233700&func=accref',
 'aleph.nkp.cz/F/?ccl_term=wau=jk01151786+or+wkw=jk01151786&func=find-c&local_base=nkc']

### 76. itis.gov

Some parameters are removed.

In [196]:
list(df.loc[df['domain'] == 'itis.gov', 'URL_n'])[0:5]

['itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=128079',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=108321',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=771729',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=771749',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=933828']

In [197]:
df.loc[df['domain'] == 'itis.gov', 'URL_n'] = [re.sub('#null$', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itis.gov', 'URL_n']]
df.loc[df['domain'] == 'itis.gov', 'URL_n'] = [re.sub('(print_version=prt&|&source=(to_print|from_print|html)$)', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'itis.gov', 'URL_n']]

In [198]:
list(df.loc[df['domain'] == 'itis.gov', 'URL_n'])[0:5]

['itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=128079',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=108321',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=771729',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=771749',
 'itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=933828']

### 77. geonames.usgs.gov

Tags are removed.

In [199]:
list(df.loc[df['domain'] == 'geonames.usgs.gov', 'URL_n'])[0:5]

['geonames.usgs.gov/pls/gnispublic/f?p=132:3:4314539870447431::NO:3:P3_FID,P3_TITLE:1533909',
 'geonames.usgs.gov/pls/gnispublic/f?p=132:3:3417380616610539::NO:3:P3_FID,P3_TITLE:1506304',
 'geonames.usgs.gov/pls/gnispublic/f?p=132:3:3439465086258793::NO:3:P3_FID,P3_TITLE:1507015',
 'geonames.usgs.gov/pls/gnispublic/f?p=132:3:3439465086258793::NO::P3_FID,P3_TITLE:1524711',
 'geonames.usgs.gov/pls/gnispublic/f?p=gnispq:3:::NO::P3_FID:1199873']

In [200]:
df.loc[df['domain'] == 'geonames.usgs.gov', 'URL_n'] = [re.sub('#.*', '', x, flags=re.IGNORECASE) for x in df.loc[df['domain'] == 'geonames.usgs.gov', 'URL_n']]

In [201]:
list(df.loc[df['domain'] == 'geonames.usgs.gov', 'URL_n'])[0:5]

['geonames.usgs.gov/pls/gnispublic/f?p=132:3:4314539870447431::NO:3:P3_FID,P3_TITLE:1533909',
 'geonames.usgs.gov/pls/gnispublic/f?p=132:3:3417380616610539::NO:3:P3_FID,P3_TITLE:1506304',
 'geonames.usgs.gov/pls/gnispublic/f?p=132:3:3439465086258793::NO:3:P3_FID,P3_TITLE:1507015',
 'geonames.usgs.gov/pls/gnispublic/f?p=132:3:3439465086258793::NO::P3_FID,P3_TITLE:1524711',
 'geonames.usgs.gov/pls/gnispublic/f?p=gnispq:3:::NO::P3_FID:1199873']

### 78. geohack.toolforge.org

In [202]:
list(df.loc[df['domain'] == 'geohack.toolforge.org', 'URL_n'])[0:5]

[]

### 79. historicengland.org.uk

No changes required.

In [203]:
list(df.loc[df['domain'] == 'historicengland.org.uk', 'URL_n'])[0:5]

['historicengland.org.uk/listing/what-is-designation/listed-buildings',
 'historicengland.org.uk/listing/what-is-designation/listed-buildings',
 'historicengland.org.uk/listing/what-is-designation/listed-buildings',
 'historicengland.org.uk/listing/what-is-designation/listed-buildings',
 'historicengland.org.uk/listing/what-is-designation/listed-buildings']

### Final data processing

In [204]:
df['URL_n'] = [re.sub('(/+$|&+$|\?+$)', '', x) for x in df['URL_n']]
df['URL_n'] = [re.sub('(/+$|&+$|\?+$)', '', x) for x in df['URL_n']]
df['URL_n'] = [re.sub('(/+$|&+$|\?+$)', '', x) for x in df['URL_n']]

In [206]:
df = df.rename(columns={'id': 'page_id'})
df.head()

Unnamed: 0,page_id,URL,URL_n,domain
0,39495284,http://www.columbia.edu/cu/lweb/digital/collec...,columbia.edu/cu/lweb/digital/collections/cul/t...,columbia.edu
1,3528721,http://allafrica.com/stories/200512190886.html,allafrica.com/stories/200512190886.html,allafrica.com
3,8552398,https://www.nytimes.com/1997/07/27/magazine/th...,nytimes.com/1997/07/27/magazine/the-thin-red-l...,nytimes.com
4,23782313,https://www.nytimes.com/2005/07/19/arts/televi...,nytimes.com/2005/07/19/arts/television/19heff....,nytimes.com
6,17288329,http://www.ancientlibrary.com/smith-bio/0075.html,ancientlibrary.com/smith-bio/0075.html,ancientlibrary.com


There are 18,501,868 unique URLs.

In [207]:
len(df.groupby('URL_n').count().index)

18501868

In [208]:
df.to_csv('references/url_ref_norm.tsv', sep='\t', index=False)