# Example: find websites of National Statistical Institutes (NSIs) 

In [None]:
# Note: you need to have installed the urlfinding module in Anaconda, see Readme.txt
import urlfinding as uf
import pandas as pd

# You can ignore a FutureWarning about sklearn.metrics.classification being deprecated 

In [2]:
# Load the example input data and preview the first rows.
nsis = pd.read_csv('data/NSIs.csv', sep=';')
nsis.head()

Unnamed: 0,id,tradename,legalname,address,postalcode,locality,country,phone
0,1,Danmarks Statistik,Danmarks Statistik,"Sejrøgade 11, Postboks 2550",2100,KØBENHAVN Ø,Denmark,(45) 3917 3917
1,2,Statistics Estonia,Statistics Estonia,Tatari 51,10134,Tallinn,Estonia,(372) 6259 300
2,3,"INSEE, Direction générale","INSEE, Direction générale",88 avenue Verdier - CS 70058,92541,Montrouge Cedex.,France,(33) 1 87 69 50 00
3,4,ISTAT,Istituto Nazionale di Statistica,Via Cesare Balbo,184,Roma,Italy,(39) 06 4673 2243-2244
4,5,Statistical Service of Cyprus,Statistical Service of Cyprus,Michalakis Karaolis street,1444,Nicosia,Cyprus,(357) 22 602 102


## Search via Google API:

In [None]:
# Note 1: This takes a while (5 to 10 minutes for this example).
#         For each record 6 queries are fired, with an idle time in between.
#         While running, the search function displays which record is being processed.
# Note 2: A 403 error means you have reached the maximum number of Google queries
#         for your API key.
# Note 3: As documented in the API section of the readme, the search function
#         continues at the record following the last one of the previous session.
#         To start from scratch, either remove the maxrownum file or set the
#         number in that file to 0.

googleconfig = './config/config.yml'
blacklist = './data/blacklist.txt'
base_file = './data/NSIs.csv'
nrows = nsis.shape[0]  # maximum number of rows to search

uf.search(base_file, googleconfig, blacklist, nrows)


## Extract features from the search results:

In [None]:
blacklist = './data/blacklist.txt'
date = '20200115'

# Replace this with the file created in the previous (search) step.
# After a multi-search session you can list several searchResult files here.
data_files = ['./data/20200115searchResult.csv']

uf.extract(date, data_files, blacklist)

## Predict urls:

In [None]:
# Note: a UserWarning about different versions of pickle estimators may appear;
#       it can safely be ignored.

base_file = './data/NSIs.csv'
feature_file = './data/20200115features_agg.csv'
model_file = './data/model.pkl'

uf.predict(feature_file, model_file, base_file)

In [None]:
# All results. Meaning of the displayed prediction columns:
#   host   - the predicted domain (at the moment only two-level domains are supported)
#   eqPred - whether the predicted url is the right one
#   pTrue  - the confidence of the prediction
df = pd.read_csv('data/NSIs_url.csv', sep=';')
df.loc[:, ['tradename', 'country', 'host', 'eqPred', 'pTrue']]

In [None]:
# Search results can differ depending on the date and the location you search from.
# The file below holds previously saved results from a session executed on
# 20200115 from Statistics Netherlands:
df = pd.read_csv('data/NSIs_url_20200115.csv', sep=';')
df.loc[:, ['tradename', 'country', 'host', 'eqPred', 'pTrue']]


## Train a model:

In [None]:
# This will be added later