# Example: find websites of National Statistical Institutes (NSIs) 

In [1]:
# Note: you need to have installed the urlfinding module in Anaconda, see Readme.txt
import urlfinding as uf
import pandas as pd

# You can ignore a FutureWarning about sklearn.metrics.classification being deprecated 

# Paths to the input data and configuration files used throughout this notebook.
population_path = './data/NSIs.csv'        # input population: one NSI per row
googleconfig    = './config/config.yml'    # Google API settings (API key, search engine id)
mappings        = './config/mappings.yml'
blacklist       = './data/blacklist.txt'
# NOTE: nrows (the maximum number of rows to search) is assigned in the search
# cell, after the population has been loaded into `nsis`. Evaluating len(nsis)
# here raised a NameError on a fresh kernel because `nsis` does not exist yet.

In [2]:
# Load the population of NSIs; the input file is semicolon-separated.
nsis = pd.read_csv(population_path, sep=';')
# Preview the first rows as the cell output.
nsis.head()

Unnamed: 0,id,tradename,legalname,address,postalcode,locality,country,phone
0,1,Danmarks Statistik,Danmarks Statistik,"Sejrøgade 11, Postboks 2550",2100,KØBENHAVN Ø,Denmark,(45) 3917 3917
1,2,Statistics Estonia,Statistics Estonia,Tatari 51,10134,Tallinn,Estonia,(372) 6259 300
2,3,"INSEE, Direction générale","INSEE, Direction générale",88 avenue Verdier - CS 70058,92541,Montrouge Cedex.,France,(33) 1 87 69 50 00
3,4,ISTAT,Istituto Nazionale di Statistica,Via Cesare Balbo,184,Roma,Italy,(39) 06 4673 2243-2244
4,5,Statistical Service of Cyprus,Statistical Service of Cyprus,Michalakis Karaolis street,1444,Nicosia,Cyprus,(357) 22 602 102


## Search via Google API:

In [4]:
# Note 1: First add your Google API key and search engine id to /examples/config/config.yml.
# Note 2: This takes a while (5 to 10 minutes for this example).
#         For each record 6 queries are fired, with an idle time in between.
#         During execution the search function displays which record is being processed.
# Note 3: If you see a 403 error, you have reached the maximum number of Google queries for your API key.
# Note 4: As documented in the API section of the readme, this search function resumes at the record
#         following the last one processed in the previous session. To start from scratch, either
#         remove the maxrownum file or set the number in that file to 0.

nrows        = len(nsis)       # maximum number of rows to search: here every row of the population

# Build the finder from the configuration, mapping, population and blacklist paths.
urlfinder = uf.UrlFinder.from_paths(
    url_finder_config_path=googleconfig,
    mappings_path=mappings,
    population_path=population_path,
    working_directory=None,
    classifier_path=None,
    url_blacklist_path=blacklist
)

# Run the search for (at most) nrows records of the population file.
urlfinder.searcher.run(nrows, population_path)




## Extract features from the search results:

In [6]:
# Date tag: used to build the search-result file name below and passed on to the extractor.
date       = '20251010'
data_files = [f'./data/{date}searchResult.csv'] # change this to the file created in the previous step;
                                                 # after a multi-session search you can list several searchResult files here

# Extract features from the listed search-result files.
# NOTE(review): the meaning of the third positional argument (False) is not visible in this notebook;
#               confirm it against the urlfinding API.
urlfinder.extractor.run(date, data_files, False)

Created feature file D:\Scrape\urlfinding\examples\data\20251009features.csv


## Predict URLs:

In [None]:
# Note: you might get a UserWarning about different versions of pickle estimators, which you can safely ignore.
# Caution: the model file is a pickle; only use a model.pkl that comes from a source you trust.

# Reuse `date` from the feature-extraction cell instead of hard-coding another
# date here. The recorded extraction output shows the feature file is written as
# ./data/<date>features.csv (presumably using the `date` passed to the extractor),
# so a different date in this cell points the prediction at a file that the
# previous step did not create.
model_file   = './data/model.pkl'
feature_file = f'./data/{date}features.csv'
results = urlfinder.url_classifier.predict(feature_file, model_file, population_path)
results