## Creating functions from my eda to use within the final scripting

### packages

In [16]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### scraping function

In [17]:
def scraper(code: str) -> pd.DataFrame:

    try:
        # fetch HTML content
        code = code.lower()
        url = f"https://service.unece.org/trade/locode/{code}.htm"
        r = requests.get(url)
        r.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(r.text, 'html.parser')

        # Find all tables
        tables = soup.find_all('table')

        # Extracting table data
        data = []
        for table in tables:
            for row in table.find_all('tr'):
                row_data = []
                for td in row.find_all('td'):
                    row_data.append(td.text.strip())
                if row_data:
                    data.append(row_data)

        # Create the Data Frame
        df = pd.DataFrame(data)
        print(df)

        return df
    except Exception as e:
        print(f"An error occured: {e}")
        return pd.DataFrame()


In [18]:
df = scraper("NO")

                                                     0               1   \
0                                                        United Nations   
1     Code for Trade and Transport Locations  (UN/LO...            None   
2                                          (NO)  NORWAY            None   
3                                                    Ch          LOCODE   
4                                                               NO  AAA   
...                                                 ...             ...   
1143                                                            NO  VOS   
1144                                                            NO  VYG   
1145                                                            NO  VRE   
1146                                                            NO  YTR   
1147                                                            NO  YTO   

                2                 3       4         5       6     7     8   \
0             None   

### Adapting scraper function so that it can download a using a Dictionary

In [13]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def scrape_and_save_tables(country_dict: dict) -> None:
    # Create a folder to store the CSV files
    if not os.path.exists("raw_country_data"):
        os.makedirs("raw_country_data")
    
    for country_code, country in tqdm(country_dict.items(), desc="Progress", total=len(country_dict)):
        country_code = country_code.lower()
        country = country.lower()
        url = f"https://service.unece.org/trade/locode/{country_code}.htm"
        response = requests.get(url)
        if response.status_code == 200:
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all tables
            tables = soup.find_all('table')
            if tables:
                # Extracting table data
                data = []
                for table in tables:
                    for row in table.find_all('tr'):
                        row_data = []
                        for td in row.find_all('td'):
                            row_data.append(td.text.strip())
                        if row_data:
                            data.append(row_data)

                # Create the Data Frame
                df = pd.DataFrame(data)

                filename = f"raw_country_data/{country_code}_{country}.csv"
                df.to_csv(filename, index=False)
                print(f"Table scraped and saved for {country} as {filename}")
            else:
                print(f"No table found for {country}")
        else:
            print(f"Failed to fetch data for {country}")

### Creating processing functions

These can be within a class as static methods then can be used within the main.py script

In [61]:
from LatLon23 import LatLon
import re
import pandas as pd
from geopy.geocoders import Nominatim

In [49]:
df = pd.read_csv("src/raw_country_data/no_norway.csv")

In [51]:
def format_table(df: pd.DataFrame) -> pd.DataFrame:
    # Drop rows and set columns based on your existing logic
    df = df.drop(df.index[:3], inplace=False)
    df.columns = df.iloc[0]
    df = df.drop(3)
    
    # Filter rows where 'Function' column contains '1'
    df = df[df["Function"].str.contains("1")]
    
    # Reset index
    df.reset_index(drop=True, inplace=True)

    # Extract country code and location code from LOCODE column
    df[['Country_code', 'Location_code']] = df['LOCODE'].str.extract(r'^\s*(\S+)\s+(.*)$')
    
    # Removing columns that are not needed
    df = df[df.columns[~df.columns.isin(["Ch", "SubDiv", "Function", "Date", "Remarks", "IATA", "LOCODE"])]]

    # Reordering the columns
    country_code_col = df.pop('Country_code')
    location_code_col = df.pop('Location_code')
    df.insert(0, 'Country_code', country_code_col)
    df.insert(1, 'Location_code', location_code_col)
    
    return df


In [52]:
df = format_table(df)

In [53]:
df

3,Country_code,Location_code,Name,NameWoDiacritics,Status,Coordinates
0,NO,AAA,Å i Lofoten,A i Lofoten,AA,6753N 01259E
1,NO,ABE,Abelnes,Abelnes,AA,5814N 00640E
2,NO,ABV,Abelvær,Abelvar,AA,6444N 01111E
3,NO,AAF,Åfjord,Afjord,AI,
4,NO,AGD,Agdenes,Agdenes,AI,
...,...,...,...,...,...,...
735,NO,VDL,Vistdal,Vistdal,RQ,6243N 00756E
736,NO,VDA,Volda,Volda,AA,6208N 00603E
737,NO,VLL,Voll,Voll,RL,6812N 01341E
738,NO,SAG,Vormedal,Vormedal,RL,5921N 00519E


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Country_code      740 non-null    object
 1   Location_code     740 non-null    object
 2   Name              740 non-null    object
 3   NameWoDiacritics  740 non-null    object
 4   Status            738 non-null    object
 5   Coordinates       605 non-null    object
dtypes: object(6)
memory usage: 34.8+ KB


## Creating function to convert Coordinates to Lat and Lon

In [55]:
from LatLon23 import LatLon
import re

def parse_coordinate(coord_str):
    
    
    if  isinstance(coord_str, str):
        pattern = r'(\d{2})(\d{2})([NS])\s*(\d{3})(\d{2})([EW])'
        match = re.match(pattern, coord_str)
        if match:

            lat_deg, lat_min, lat_dir, lon_deg, lon_min, lon_dir = match.groups()
            lat = int(lat_deg) + int(lat_min) / 60 * (-1 if lat_dir == 'S' else 1)
            lon = int(lon_deg) + int(lon_min) / 60 * (-1 if lon_dir == 'W' else 1)
            
            # Use LatLon23 to convert decimal degrees
            latlon = LatLon(lat, lon)
            
            return latlon.lat.decimal_degree, latlon.lon.decimal_degree
        return None, None

In [56]:
# Apply the parse_coordinate function to extract latitude and longitude
coordinates_parsed = df['Coordinates'].apply(parse_coordinate).apply(pd.Series)

# Add the parsed coordinates to the DataFrame
df[['Latitude', 'Longitude']] = coordinates_parsed

# Filter out non-string values in the 'Coordinates' column
non_string_indices = df[df['Coordinates'].apply(lambda x: not isinstance(x, str))].index

# Assign None to Latitude and Longitude for rows with non-string coordinates
df.loc[non_string_indices, ['Latitude', 'Longitude']] = None

In [57]:
df.head()

3,Country_code,Location_code,Name,NameWoDiacritics,Status,Coordinates,Latitude,Longitude
0,NO,AAA,Å i Lofoten,A i Lofoten,AA,6753N 01259E,67.883333,12.983333
1,NO,ABE,Abelnes,Abelnes,AA,5814N 00640E,58.233333,6.666667
2,NO,ABV,Abelvær,Abelvar,AA,6444N 01111E,64.733333,11.183333
3,NO,AAF,Åfjord,Afjord,AI,,,
4,NO,AGD,Agdenes,Agdenes,AI,,,


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country_code      740 non-null    object 
 1   Location_code     740 non-null    object 
 2   Name              740 non-null    object 
 3   NameWoDiacritics  740 non-null    object 
 4   Status            738 non-null    object 
 5   Coordinates       605 non-null    object 
 6   Latitude          605 non-null    float64
 7   Longitude         605 non-null    float64
dtypes: float64(2), object(6)
memory usage: 46.4+ KB


## creating geo location function

In [68]:
def geolocate(df:pd.DataFrame):
    
    geolocator = Nominatim(user_agent="Harbour_locations", timeout=10)

    # creating dataframe with nan values in lat and lon columns
    missing_coord = df[df["Latitude"].isnull() | df["Longitude"].isnull()].copy()

    # combining Name and Lo columns to create a location query
    missing_coord["query"] = missing_coord["Name"] + ", " + missing_coord["Country_code"]

    # Apply geocoding
    missing_coord["coords"] = missing_coord["query"].apply(geolocator.geocode)

    #Extract Lat and Lon from coords
    missing_coord["Lat"] = missing_coord["coords"].apply(lambda x: x.latitude if x else None)
    missing_coord["Lon"] = missing_coord["coords"].apply(lambda x: x.longitude if x else None)

    # Merge the missing coord with the df based on Name
    merged_df = pd.merge(df, missing_coord[["Name", "Lat", "Lon"]], on="Name", how="left", suffixes=('', '_missing'))

    # Update Latitude and Longitude columns in the original dataframe
    df['Latitude'] = merged_df['Lat'].fillna(df['Latitude']).astype(float)
    df['Longitude'] = merged_df['Lon'].fillna(df['Longitude']).astype(float)


    return df

In [69]:
df = geolocate(df)
df.head()

3,Country_code,Location_code,Name,NameWoDiacritics,Status,Coordinates,Latitude,Longitude
0,NO,AAA,Å i Lofoten,A i Lofoten,AA,6753N 01259E,67.883333,12.983333
1,NO,ABE,Abelnes,Abelnes,AA,5814N 00640E,58.233333,6.666667
2,NO,ABV,Abelvær,Abelvar,AA,6444N 01111E,64.733333,11.183333
3,NO,AAF,Åfjord,Afjord,AI,,64.083563,10.219908
4,NO,AGD,Agdenes,Agdenes,AI,,63.583247,9.515259


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country_code      740 non-null    object 
 1   Location_code     740 non-null    object 
 2   Name              740 non-null    object 
 3   NameWoDiacritics  740 non-null    object 
 4   Status            738 non-null    object 
 5   Coordinates       605 non-null    object 
 6   Latitude          739 non-null    float64
 7   Longitude         739 non-null    float64
dtypes: float64(2), object(6)
memory usage: 46.4+ KB


In [72]:
from ipyleaflet import Map, basemaps, Marker, Popup
from ipywidgets import HTML
import numpy as np

valid_lat = df["Latitude"].dropna()
valid_long = df["Longitude"].dropna()

average_latitude = np.nanmean(valid_lat)
average_longitude = np.nanmean(valid_long)


center = (average_latitude, average_longitude)
zoom = 4

m= Map(basemap= basemaps.Esri.WorldStreetMap, center=center, zoom=zoom, scroll_wheel_zoom= True)

for index, row in df.iterrows():
    marker = Marker(location=(row["Latitude"], row["Longitude"],), draggable=False)
    popup_content = "<b>{}</b><br>Coordinates: ({})<br>Latitude: {}<br>Longitude: ".format(row['Name'], row['Coordinates'], row['Latitude'], row['Longitude'])
    html = HTML(value=popup_content)
    popup = Popup(
        location=(row["Latitude"], row["Longitude"]),
        child=html
    )
    marker.popup = popup
    m.add_layer(marker)
m



Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant
  content = self.pack(content)


Map(center=[63.48836162728913, 10.836725240535102], controls=(ZoomControl(options=['position', 'zoom_in_text',…