# [NUTS statistical regions of Italy](https://en.wikipedia.org/wiki/NUTS_statistical_regions_of_Italy)

## Importing libraries

In [19]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from time import sleep

import requests
from bs4 import BeautifulSoup as sp

## Fetching the table and links

In [20]:
url = 'https://en.wikipedia.org/wiki/NUTS_statistical_regions_of_Italy'

In [22]:
r = requests.get(url)
r

<Response [200]>

In [23]:
soup = sp(r.content, 'html')

In [24]:
table = soup.find('div', {'class' : 'mw-parser-output'}).find_all('table')[1]

In [25]:
df = pd.DataFrame(columns = ['NUTS_1', 'Code_1', 'NUTS_2', 'Code_2', 'NUTS_3', 'Code_3', 'NUTS_2_Link', 'NUTS_3_Link'])

for idx, rows in enumerate(table.find_all('tr')):
    if idx != 0:
        if len(rows.find_all('td')) == 6:
            tetx_list, links_list = [], []
            
            for txt in rows.find_all('td'):
                tetx_list.append(txt.text.strip())
            for links in rows.find_all('a'):
                links_list.append(links['href'])
            tetx_list.extend([links_list[1], links_list[-2]])
            df.loc[len(df)] = tetx_list
            
        if len(rows.find_all('td')) == 2:
            tetx_list, links_list = [np.nan, np.nan, np.nan, np.nan], []
            
            for txt in rows.find_all('td'):
                tetx_list.append(txt.text.strip())
            for links in rows.find_all('a'):
                links_list.append(links['href'])
            tetx_list.extend([np.nan, links_list[0]])
            df.loc[len(df)] = tetx_list
            
        if len(rows.find_all('td')) == 4:
            tetx_list, links_list = [np.nan, np.nan], []
            
            for txt in rows.find_all('td'):
                tetx_list.append(txt.text.strip())
            for links in rows.find_all('a'):
                links_list.append(links['href'])
            tetx_list.extend([links_list[0], links_list[-2]])
            df.loc[len(df)] = tetx_list

df.head(5)

Unnamed: 0,NUTS_1,Code_1,NUTS_2,Code_2,NUTS_3,Code_3,NUTS_2_Link,NUTS_3_Link
0,Northwest Italy,ITC,Piemonte,ITC1,Torino,ITC11,/wiki/File:Flag_of_Piedmont.svg,/wiki/File:Flag_of_the_Province_of_Turin.svg
1,,,,,Vercelli,ITC12,,/wiki/File:Image_unavailable.png
2,,,,,Biella,ITC13,,/wiki/File:Image_unavailable.png
3,,,,,Verbano-Cusio-Ossola,ITC14,,/wiki/File:Flag_of_the_Province_of_Verbano-Cus...
4,,,,,Novara,ITC15,,/wiki/File:Flag_of_the_Province_of_Novara.gif


In [26]:
df1 = df.ffill().copy()
df1

Unnamed: 0,NUTS_1,Code_1,NUTS_2,Code_2,NUTS_3,Code_3,NUTS_2_Link,NUTS_3_Link
0,Northwest Italy,ITC,Piemonte,ITC1,Torino,ITC11,/wiki/File:Flag_of_Piedmont.svg,/wiki/File:Flag_of_the_Province_of_Turin.svg
1,Northwest Italy,ITC,Piemonte,ITC1,Vercelli,ITC12,/wiki/File:Flag_of_Piedmont.svg,/wiki/File:Image_unavailable.png
2,Northwest Italy,ITC,Piemonte,ITC1,Biella,ITC13,/wiki/File:Flag_of_Piedmont.svg,/wiki/File:Image_unavailable.png
3,Northwest Italy,ITC,Piemonte,ITC1,Verbano-Cusio-Ossola,ITC14,/wiki/File:Flag_of_Piedmont.svg,/wiki/File:Flag_of_the_Province_of_Verbano-Cus...
4,Northwest Italy,ITC,Piemonte,ITC1,Novara,ITC15,/wiki/File:Flag_of_Piedmont.svg,/wiki/File:Flag_of_the_Province_of_Novara.gif
...,...,...,...,...,...,...,...,...
104,Central Italy,ITI,Lazio,ITI4,Viterbo,ITI41,/wiki/File:Flag_of_Lazio.svg,/wiki/File:Flag_of_the_Province_of_Viterbo.gif
105,Central Italy,ITI,Lazio,ITI4,Rieti,ITI42,/wiki/File:Flag_of_Lazio.svg,/wiki/File:Flag_of_the_province_of_Rieti.svg
106,Central Italy,ITI,Lazio,ITI4,Roma,ITI43,/wiki/File:Flag_of_Lazio.svg,/wiki/File:Flag_of_the_Province_of_Rome.svg
107,Central Italy,ITI,Lazio,ITI4,Latina,ITI44,/wiki/File:Flag_of_Lazio.svg,/wiki/File:Flag_of_Latina.png


In [28]:
df = df.ffill().copy()

BASE_URL = 'https://en.wikipedia.org/'
df.NUTS_2_Link = BASE_URL + df.NUTS_2_Link.map(str)
df.NUTS_3_Link = BASE_URL + df.NUTS_3_Link.map(str)

df.head(50)

Unnamed: 0,NUTS_1,Code_1,NUTS_2,Code_2,NUTS_3,Code_3,NUTS_2_Link,NUTS_3_Link
0,Northwest Italy,ITC,Piemonte,ITC1,Torino,ITC11,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Flag_of_th...
1,Northwest Italy,ITC,Piemonte,ITC1,Vercelli,ITC12,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Image_unav...
2,Northwest Italy,ITC,Piemonte,ITC1,Biella,ITC13,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Image_unav...
3,Northwest Italy,ITC,Piemonte,ITC1,Verbano-Cusio-Ossola,ITC14,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Flag_of_th...
4,Northwest Italy,ITC,Piemonte,ITC1,Novara,ITC15,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Flag_of_th...
5,Northwest Italy,ITC,Piemonte,ITC1,Cuneo,ITC16,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Flag_of_th...
6,Northwest Italy,ITC,Piemonte,ITC1,Asti,ITC17,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Flag_of_th...
7,Northwest Italy,ITC,Piemonte,ITC1,Alessandria,ITC18,https://en.wikipedia.org//wiki/File:Flag_of_Pi...,https://en.wikipedia.org//wiki/File:Flag_of_th...
8,Northwest Italy,ITC,Valle d'Aosta,ITC2,Aosta,ITC20,https://en.wikipedia.org//wiki/File:Flag_of_Va...,https://en.wikipedia.org//wiki/File:Flag_of_Va...
9,Northwest Italy,ITC,Liguria,ITC3,Imperia,ITC31,https://en.wikipedia.org//wiki/File:Flag_of_Li...,https://en.wikipedia.org//wiki/File:Flag_of_th...


## Fetching downloadable image urls

In [30]:
NUTS_2_df = pd.DataFrame(columns = ['NUTS_2_Link', 'NUTS_2_Images'])
for link in tqdm(df.NUTS_2_Link.unique()):
    r = requests.get(link)
    soup = sp(r.content, 'html')
    try:
        img_link  = 'https:' + soup.find('div', {'class' : 'fullImageLink'}).find('a')['href']
    except:
        img_link = np.nan
    NUTS_2_df.loc[len(NUTS_2_df)] = [link, img_link]
    sleep(3)

100%|██████████| 20/20 [01:18<00:00,  3.91s/it]


In [34]:
imgs = NUTS_2_df['NUTS_2_Images'].tolist()
imgs

['https://upload.wikimedia.org/wikipedia/commons/b/b9/Flag_of_Piedmont.svg',
 'https://upload.wikimedia.org/wikipedia/commons/9/90/Flag_of_Valle_d%27Aosta.svg',
 'https://upload.wikimedia.org/wikipedia/commons/8/88/Flag_of_Liguria.svg',
 'https://upload.wikimedia.org/wikipedia/commons/e/ea/Flag_of_Lombardy.svg',
 'https://upload.wikimedia.org/wikipedia/commons/4/45/Flag_of_Abruzzo.svg',
 'https://upload.wikimedia.org/wikipedia/commons/8/84/Flag_of_Molise.svg',
 'https://upload.wikimedia.org/wikipedia/commons/c/c5/Flag_of_Campania.svg',
 'https://upload.wikimedia.org/wikipedia/commons/b/b8/Flag_of_Apulia.svg',
 'https://upload.wikimedia.org/wikipedia/commons/8/8e/Flag_of_Basilicata.svg',
 'https://upload.wikimedia.org/wikipedia/commons/8/8b/Flag_of_Calabria.svg',
 'https://upload.wikimedia.org/wikipedia/commons/8/84/Sicilian_Flag.svg',
 'https://upload.wikimedia.org/wikipedia/commons/4/4e/Flag_of_Sardinia%2C_Italy.svg',
 'https://upload.wikimedia.org/wikipedia/commons/7/7f/Flag_of_Trent

In [36]:
NUTS_3_df = pd.DataFrame(columns = ['NUTS_3_Link', 'NUTS_3_Images'])
for link in tqdm(df.NUTS_3_Link.unique()):
    r = requests.get(link)
    soup = sp(r.content, 'html')
    try:
        img_link  = 'https:' + soup.find('div', {'class' : 'fullImageLink'}).find('a')['href']
    except:
        img_link = np.nan
    NUTS_3_df.loc[len(NUTS_3_df)] = [link, img_link]
    sleep(3)

100%|██████████| 94/94 [05:46<00:00,  3.68s/it]


In [39]:
NUTS_3_df['NUTS_3_Images'].to_list()

['https://upload.wikimedia.org/wikipedia/commons/e/eb/Flag_of_the_Province_of_Turin.svg',
 'https://upload.wikimedia.org/wikipedia/commons/6/62/Image_unavailable.png',
 'https://upload.wikimedia.org/wikipedia/commons/9/99/Flag_of_the_Province_of_Verbano-Cusio-Ossola.gif',
 'https://upload.wikimedia.org/wikipedia/commons/4/49/Flag_of_the_Province_of_Novara.gif',
 'https://upload.wikimedia.org/wikipedia/commons/c/c8/Flag_of_the_Province_of_Cuneo.svg',
 'https://upload.wikimedia.org/wikipedia/commons/b/b4/Flag_of_the_Province_of_Asti.svg',
 'https://upload.wikimedia.org/wikipedia/commons/d/db/Flag_of_the_Province_of_Alessandria.svg',
 'https://upload.wikimedia.org/wikipedia/commons/9/90/Flag_of_Valle_d%27Aosta.svg',
 'https://upload.wikimedia.org/wikipedia/commons/9/9b/Flag_of_the_Province_of_Imperia.svg',
 'https://upload.wikimedia.org/wikipedia/commons/6/6f/Flag_of_the_Province_of_Savona.gif',
 'https://upload.wikimedia.org/wikipedia/commons/7/73/Flag_of_Genoa.svg',
 'https://upload.wik

## Merging image links to the table dataframe

In [10]:
df = df.merge(NUTS_2_df, how = 'right').copy()

In [11]:
df = df.merge(NUTS_3_df, how = 'right').copy()

## Filling missing values manually

In [12]:
df[df.isnull().any(axis = 1)]

Unnamed: 0,NUTS_1,Code_1,NUTS_2,Code_2,NUTS_3,Code_3,NUTS_2_Link,NUTS_3_Link,NUTS_2_Images,NUTS_3_Images
71,Northeast Italy,ITH,Veneto,ITH3,Belluno,ITH33,https://en.wikipedia.org//wiki/File:Flag_of_Ve...,https://en.wikipedia.org//wiki/Province_of_Bel...,https://upload.wikimedia.org/wikipedia/commons...,
79,Northeast Italy,ITH,Friuli-Venezia Giulia,ITH4,Trieste,ITH44,https://en.wikipedia.org//wiki/File:Flag_of_Fr...,https://en.wikipedia.org//wiki/Province_of_Tri...,https://upload.wikimedia.org/wikipedia/commons...,


In [13]:
df.loc[71].NUTS_3_Images = 'https://upload.wikimedia.org/wikipedia/en/6/65/Provincia_di_Belluno-Stemma.png'

In [14]:
df[df.isnull().any(axis = 1)]

Unnamed: 0,NUTS_1,Code_1,NUTS_2,Code_2,NUTS_3,Code_3,NUTS_2_Link,NUTS_3_Link,NUTS_2_Images,NUTS_3_Images
79,Northeast Italy,ITH,Friuli-Venezia Giulia,ITH4,Trieste,ITH44,https://en.wikipedia.org//wiki/File:Flag_of_Fr...,https://en.wikipedia.org//wiki/Province_of_Tri...,https://upload.wikimedia.org/wikipedia/commons...,


## Saving to CSV file

In [15]:
df.to_csv('NUTS_Code_RAW.csv', index = False)
df[['NUTS_1', 'Code_1', 'NUTS_2', 'Code_2', 'NUTS_3', 'Code_3', 'NUTS_2_Images', 'NUTS_3_Images']].to_csv('NUTS_Code_Processed.csv', index = False)

# Contributer info

<p><strong>Jishnu S G</strong></p>

<ul>
  <li><p><a href = 'https://linkedin.com/in/jishnukoliyadan/'>Linkedin</a></p></li>
  <li><p><a href = 'https://github.com/jishnukoliyadan'>GitHub</a></p></li>
</ul>  

<p>
    <a href = 'https://www.buymeacoffee.com/jishnukoliyadan'>
    <img src = 'https://www.buymeacoffee.com/assets/img/guidelines/download-assets-sm-1.svg' alt = 'https://www.buymeacoffee.com/jishnukoliyadan' width = 11%></a>
</p>