## Creating functions from my eda to use within the final scripting

### packages

In [16]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### scraping function

In [17]:
def scraper(code: str) -> pd.DataFrame:

    try:
        # fetch HTML content
        code = code.lower()
        url = f"https://service.unece.org/trade/locode/{code}.htm"
        r = requests.get(url)
        r.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(r.text, 'html.parser')

        # Find all tables
        tables = soup.find_all('table')

        # Extracting table data
        data = []
        for table in tables:
            for row in table.find_all('tr'):
                row_data = []
                for td in row.find_all('td'):
                    row_data.append(td.text.strip())
                if row_data:
                    data.append(row_data)

        # Create the Data Frame
        df = pd.DataFrame(data)
        print(df)

        return df
    except Exception as e:
        print(f"An error occured: {e}")
        return pd.DataFrame()


In [18]:
df = scraper("NO")

                                                     0               1   \
0                                                        United Nations   
1     Code for Trade and Transport Locations  (UN/LO...            None   
2                                          (NO)  NORWAY            None   
3                                                    Ch          LOCODE   
4                                                               NO  AAA   
...                                                 ...             ...   
1143                                                            NO  VOS   
1144                                                            NO  VYG   
1145                                                            NO  VRE   
1146                                                            NO  YTR   
1147                                                            NO  YTO   

                2                 3       4         5       6     7     8   \
0             None   

### Adapting scraper function so that it can download a using a Dictionary

In [13]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def scrape_and_save_tables(country_dict: dict) -> None:
    # Create a folder to store the CSV files
    if not os.path.exists("raw_country_data"):
        os.makedirs("raw_country_data")
    
    for country_code, country in tqdm(country_dict.items(), desc="Progress", total=len(country_dict)):
        country_code = country_code.lower()
        country = country.lower()
        url = f"https://service.unece.org/trade/locode/{country_code}.htm"
        response = requests.get(url)
        if response.status_code == 200:
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all tables
            tables = soup.find_all('table')
            if tables:
                # Extracting table data
                data = []
                for table in tables:
                    for row in table.find_all('tr'):
                        row_data = []
                        for td in row.find_all('td'):
                            row_data.append(td.text.strip())
                        if row_data:
                            data.append(row_data)

                # Create the Data Frame
                df = pd.DataFrame(data)

                filename = f"raw_country_data/{country_code}_{country}.csv"
                df.to_csv(filename, index=False)
                print(f"Table scraped and saved for {country} as {filename}")
            else:
                print(f"No table found for {country}")
        else:
            print(f"Failed to fetch data for {country}")

### Creating processing functions

These can be within a class as static methods then can be used within the main.py script

In [19]:
from LatLon23 import LatLon
import re
import pandas as pd

In [134]:
df = pd.read_csv("src/raw_country_data/no_norway.csv")

In [135]:
def format_table(df: pd.DataFrame) -> pd.DataFrame:
    # Drop rows and set columns based on your existing logic
    df = df.drop(df.index[:3], inplace=False)
    df.columns = df.iloc[0]
    df = df.drop(3)
    
    # Filter rows where 'Function' column contains '1'
    df = df[df["Function"].str.contains("1")]
    
    # Reset index
    df.reset_index(drop=True, inplace=True)

    # Extract country code and location code from LOCODE column
    df[['country_code', 'location_code']] = df['LOCODE'].str.extract(r'^\s*(\S+)\s+(.*)$')
    
    # Removing columns that are not needed
    df = df[df.columns[~df.columns.isin(["Ch", "SubDiv", "Function", "Date", "Remarks", "IATA", "LOCODE"])]]

    # Reordering the columns
    country_code_col = df.pop('country_code')
    location_code_col = df.pop('location_code')
    df.insert(0, 'country_code', country_code_col)
    df.insert(1, 'location_code', location_code_col)
    
    return df


In [136]:
df = format_table(df)

In [137]:
df

3,country_code,location_code,Name,NameWoDiacritics,Status,Coordinates
0,NO,AAA,Å i Lofoten,A i Lofoten,AA,6753N 01259E
1,NO,ABE,Abelnes,Abelnes,AA,5814N 00640E
2,NO,ABV,Abelvær,Abelvar,AA,6444N 01111E
3,NO,AAF,Åfjord,Afjord,AI,
4,NO,AGD,Agdenes,Agdenes,AI,
...,...,...,...,...,...,...
735,NO,VDL,Vistdal,Vistdal,RQ,6243N 00756E
736,NO,VDA,Volda,Volda,AA,6208N 00603E
737,NO,VLL,Voll,Voll,RL,6812N 01341E
738,NO,SAG,Vormedal,Vormedal,RL,5921N 00519E


In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   country_code      740 non-null    object
 1   location_code     740 non-null    object
 2   Name              740 non-null    object
 3   NameWoDiacritics  740 non-null    object
 4   Status            738 non-null    object
 5   Coordinates       605 non-null    object
dtypes: object(6)
memory usage: 34.8+ KB


## Creating function to convert Coordinates to Lat and Lon

In [139]:
from LatLon23 import LatLon
import re

def parse_coordinate(df: pd.DataFrame) -> pd.DataFrame:
    coord_str= df['Coordinates']
    def parse(coord_str:str) -> tuple:    
        
        if coord_str:
            pattern = r'(\d{2})(\d{2})([NS])\s*(\d{3})(\d{2})([EW])'
            match = re.match(pattern, coord_str)
            if match:

                lat_deg, lat_min, lat_dir, lon_deg, lon_min, lon_dir = match.groups()
                lat = int(lat_deg) + int(lat_min) / 60 * (-1 if lat_dir == 'S' else 1)
                lon = int(lon_deg) + int(lon_min) / 60 * (-1 if lon_dir == 'W' else 1)
                
                # Use LatLon23 to convert decimal degrees
                latlon = LatLon(lat, lon)
                
                return latlon.lat.decimal_degree, latlon.lon.decimal_degree
            else:
                return None, None
        else:
            return None, None
        
    df[["Latitude", "Longitude"]] = df["Coordinates"].apply(parse).apply(pd.Series)
    return df

In [140]:
df = parse_coordinate(df)

TypeError: expected string or bytes-like object