In [2]:
pip install pandas shapely folium

Collecting shapely
  Downloading shapely-2.1.1-cp313-cp313-win_amd64.whl.metadata (7.0 kB)
Collecting folium
  Downloading folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting branca>=0.6.0 (from folium)
  Downloading branca-0.8.1-py3-none-any.whl.metadata (1.5 kB)
Downloading shapely-2.1.1-cp313-cp313-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 51.4 MB/s eta 0:00:00
Downloading folium-0.20.0-py2.py3-none-any.whl (113 kB)
Downloading branca-0.8.1-py3-none-any.whl (26 kB)
Installing collected packages: shapely, branca, folium

   ---------------------------------------- 0/3 [shapely]
   ---------------------------------------- 0/3 [shapely]
   ---------------------------------------- 0/3 [shapely]
   ---------------------------------------- 0/3 [shapely]
   ---------------------------------------- 0/3 [shapely]
   ---------------------------------------- 0/3 [shapely]
   ---

**Most frequent job archetype**

**Per capita**

In [7]:
import pandas as pd
import io
import json
import folium
from folium.features import GeoJsonTooltip
import sys
import branca.colormap as cm

# --- Configuration ---
# Input data files read by create_interactive_map.
JOB_LISTINGS_FILE = "master_enriched_job_listings_dataset.csv"
POPULATION_FILE = "Population ONS.csv"
UK_BOUNDARIES_FILE = "uk_boundaries.geojson"

# HTML file the finished map is written to.
OUTPUT_MAP_FILE = "uk_job_archetypes_map_normalized.html"

# Key inside each GeoJSON feature's "properties" that holds the region name.
REGION_NAME_COLUMN = 'ITL121NM'

# Rows whose 'kmeans_job_archetype' equals this value are dropped before counting.
EXCLUDE_JOB_ARCHETYPE = 'Noise / General Business Role'

def create_interactive_map(job_listings_filepath, population_filepath, uk_boundaries_filepath,
                           output_map_filepath=None):
    """
    Creates an interactive Folium map showing the most frequent job archetype per UK region,
    together with that archetype's normalized count (jobs per 1,000 people).

    Rows whose archetype equals EXCLUDE_JOB_ARCHETYPE are ignored, and the region name
    'East of England' is rewritten to 'East' in both CSV inputs before joining.

    Args:
        job_listings_filepath (str): Path to the job listings CSV. Must contain the
            'region' and 'kmeans_job_archetype' columns.
        population_filepath (str): Path to the population CSV. Must contain the
            'Region' and 'Population' columns ('Population' may use thousands commas).
        uk_boundaries_filepath (str): Path to the UK boundaries GeoJSON file. Region names
            are read from each feature's properties under REGION_NAME_COLUMN.
        output_map_filepath (str, optional): Where to save the HTML map. Defaults to the
            module-level OUTPUT_MAP_FILE, looked up when the function is called.

    Returns:
        str: A status message. Errors are not raised to the caller; they are caught and
        reported in the returned message.
    """
    try:
        # --- 1. Process Job Listings Data ---
        print(f"Processing job listings data from {job_listings_filepath}...")
        job_listings_df = pd.read_csv(job_listings_filepath)

        if 'region' not in job_listings_df.columns or 'kmeans_job_archetype' not in job_listings_df.columns:
            raise ValueError("Job listings CSV must contain 'region' and 'kmeans_job_archetype' columns.")

        # --- Standardize Region Names in Job Listings ---
        initial_east_of_england_count = (job_listings_df['region'] == 'East of England').sum()
        job_listings_df['region'] = job_listings_df['region'].replace('East of England', 'East')
        print(f"Standardized region names: Replaced 'East of England' with 'East' for {initial_east_of_england_count} entries.")

        # --- Exclude the archetype configured in EXCLUDE_JOB_ARCHETYPE ---
        initial_rows = len(job_listings_df)
        job_listings_df = job_listings_df[job_listings_df['kmeans_job_archetype'] != EXCLUDE_JOB_ARCHETYPE]
        filtered_rows = len(job_listings_df)
        print(f"Excluded '{EXCLUDE_JOB_ARCHETYPE}' category. Removed {initial_rows - filtered_rows} rows.")

        # --- 2. Process Population Data ---
        print(f"Processing population data from {population_filepath}...")
        df_population = pd.read_csv(population_filepath)

        # Fail with a descriptive message instead of a bare KeyError further down.
        if 'Region' not in df_population.columns or 'Population' not in df_population.columns:
            raise ValueError("Population CSV must contain 'Region' and 'Population' columns.")

        # Clean and convert the 'Population' column (strip thousands separators first)
        df_population['Population'] = df_population['Population'].astype(str).str.replace(',', '', regex=False)
        df_population['Population'] = pd.to_numeric(df_population['Population'])

        # --- Standardize Region Names in Population Data to match Job Listings ---
        initial_east_of_england_pop_count = (df_population['Region'] == 'East of England').sum()
        df_population['Region'] = df_population['Region'].replace('East of England', 'East')
        print(f"Standardized population region names: Replaced 'East of England' with 'East' for {initial_east_of_england_pop_count} entries.")

        # --- 3. Calculate Normalized Metrics per Archetype and Region ---
        print("Calculating normalized job archetype counts...")
        # Group by region and archetype to get raw counts
        job_archetype_counts = job_listings_df.groupby(['region', 'kmeans_job_archetype']).size().reset_index(name='raw_count')

        # Merge the counts with the population data
        merged_df = pd.merge(job_archetype_counts, df_population[['Region', 'Population']], left_on='region', right_on='Region', how='left')

        # Normalized count (per 1,000 people). A missing or non-positive population yields
        # NaN rather than being replaced by 1, which would report raw_count * 1000 as if it
        # were a per-capita figure.
        valid_population = merged_df['Population'].where(merged_df['Population'] > 0)
        merged_df['normalized_count'] = (merged_df['raw_count'] / valid_population) * 1000

        regions_without_population = sorted(
            merged_df.loc[valid_population.isna(), 'region'].astype(str).unique()
        )
        if regions_without_population:
            print(
                f"Warning: No usable population figure for region(s): {', '.join(regions_without_population)}. "
                "Their normalized job count is left empty.",
                file=sys.stderr
            )

        # Pick the winning archetype per region from the raw count. Within a region every
        # row is divided by the same population (assuming one population row per region),
        # so this selects the same archetype as ranking by normalized count, and it also
        # works for regions whose normalized count is NaN.
        most_frequent_archetype = merged_df.loc[merged_df.groupby('region')['raw_count'].idxmax()]

        # Prepare the data for the map
        job_archetype_map = most_frequent_archetype.set_index('region')[['kmeans_job_archetype', 'normalized_count']].to_dict('index')
        print("Job archetypes processed using normalized data.")

        # --- 4. Process UK Boundaries Data from GeoJSON ---
        print(f"Processing UK boundaries data from {uk_boundaries_filepath}...")
        with open(uk_boundaries_filepath, 'r', encoding='utf-8') as f:
            geojson_data = json.load(f)

        if "features" not in geojson_data or not isinstance(geojson_data["features"], list):
            raise ValueError("Invalid GeoJSON structure: 'features' array not found.")

        # --- 5. Merge Data into GeoJSON Properties ---
        print("Merging data into GeoJSON properties...")
        processed_features_count = 0
        for feature in geojson_data["features"]:
            properties = feature.get("properties")
            if not isinstance(properties, dict):
                # Covers a missing key as well as an explicit "properties": null.
                properties = {}
                feature["properties"] = properties

            # Every feature gets the derived keys, so styling never hits a missing key.
            properties["Most Frequent Job Archetype"] = "N/A"
            properties["Normalized Job Count (per 1k)"] = 0

            if REGION_NAME_COLUMN not in properties:
                print(f"Warning: Feature missing 'properties' or '{REGION_NAME_COLUMN}' key. Skipping feature.", file=sys.stderr)
                continue

            archetype_data = job_archetype_map.get(properties[REGION_NAME_COLUMN], None)
            if archetype_data:
                properties["Most Frequent Job Archetype"] = archetype_data['kmeans_job_archetype']
                normalized_count = archetype_data['normalized_count']
                # None (JSON null) marks "population unknown" instead of a made-up number.
                properties["Normalized Job Count (per 1k)"] = (
                    None if pd.isna(normalized_count) else round(normalized_count, 2)
                )
            processed_features_count += 1

        print(f"Merged data for {processed_features_count} GeoJSON features.")

        # --- 6. Create Interactive Folium Map ---
        print("Creating Folium map...")
        uk_center = [55.0, -3.0]
        m = folium.Map(location=uk_center, zoom_start=6)

        # Create a color map for job archetypes
        unique_archetypes = sorted(list(set(item['kmeans_job_archetype'] for item in job_archetype_map.values()) - {'N/A'}))
        colors = [
            '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
            '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99', '#b15928'
        ]
        if len(unique_archetypes) > len(colors):
            print(f"Warning: Not enough distinct colors for all {len(unique_archetypes)} archetypes. Colors will repeat.", file=sys.stderr)

        archetype_color_map = {archetype: colors[i % len(colors)] for i, archetype in enumerate(unique_archetypes)}
        archetype_color_map['N/A'] = '#cccccc'

        colormap = cm.StepColormap(
            colors=list(archetype_color_map.values()),
            index=[i for i in range(len(archetype_color_map))],
            vmin=0, vmax=len(archetype_color_map),
            caption='Most Frequent Job Archetype (Normalized)'
        )
        colormap.add_to(m)

        folium.GeoJson(
            geojson_data,
            name='UK Regions with Normalized Job Archetypes',
            tooltip=GeoJsonTooltip(
                fields=[REGION_NAME_COLUMN, 'Most Frequent Job Archetype'],
                aliases=['Region:', 'Most Frequent Job Archetype:'],
                localize=True,
                sticky=False,
                labels=True,
                style="""
                    background-color: #F0EFEF;
                    color: #333333;
                    font-family: sans-serif;
                    font-size: 12px;
                    padding: 8px;
                    border-radius: 5px;
                    box-shadow: 0 2px 5px rgba(0,0,0,0.2);
                """
            ),
            # .get() lookups fall back to the grey "N/A" colour instead of raising.
            style_function=lambda x: {
                'fillColor': archetype_color_map.get(
                    (x.get('properties') or {}).get('Most Frequent Job Archetype'), '#cccccc'
                ),
                'color': 'black',
                'weight': 1,
                'fillOpacity': 0.7
            },
            highlight_function=lambda x: {
                'fillColor': '#FFC107',
                'color': 'black',
                'weight': 2,
                'fillOpacity': 0.7
            }
        ).add_to(m)

        folium.LayerControl().add_to(m)

        print("Map created successfully.")
        # Resolve the default at call time; an explicit argument always wins over the global.
        output_path = OUTPUT_MAP_FILE if output_map_filepath is None else output_map_filepath
        m.save(output_path)
        return f"Interactive map saved to {output_path}. Please open this file in your web browser to view the map."

    except FileNotFoundError as e:
        # Report the paths that were actually passed in, not the module-level defaults.
        return (
            "Error: One of the data files was not found. Please check that all required files "
            f"('{job_listings_filepath}', '{population_filepath}', and '{uk_boundaries_filepath}') "
            f"exist at the given paths. Details: {e}"
        )
    except json.JSONDecodeError as e:
        return f"Error: Could not parse the GeoJSON file '{uk_boundaries_filepath}'. Please ensure it's valid GeoJSON. Details: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"

# --- Build the archetype map from the configured files and report the outcome ---
if __name__ == '__main__':
    print(create_interactive_map(JOB_LISTINGS_FILE, POPULATION_FILE, UK_BOUNDARIES_FILE))

Processing job listings data from master_enriched_job_listings_dataset.csv...
Standardized region names: Replaced 'East of England' with 'East' for 271 entries.
Excluded 'Noise / General Business Role' category. Removed 1814 rows.
Processing population data from Population ONS.csv...
Standardized population region names: Replaced 'East of England' with 'East' for 1 entries.
Calculating normalized job archetype counts...
Job archetypes processed using normalized data.
Processing UK boundaries data from uk_boundaries.geojson...
Merging data into GeoJSON properties...
Merged data for 12 GeoJSON features.
Creating Folium map...
Map created successfully.
Interactive map saved to uk_job_archetypes_map_normalized.html. Please open this file in your web browser to view the map.


**Wage heatwave (choropleth of median annual salary by region)**

In [2]:
import pandas as pd
import io
import json
import folium
from folium.features import GeoJsonTooltip
import sys
import branca.colormap as cm

# --- Configuration ---
# Input data files read by create_salary_heatwave_map.
JOB_LISTINGS_FILE = "master_enriched_job_listings_dataset.csv"
UK_BOUNDARIES_FILE = "uk_boundaries.geojson"  # GeoJSON file with the UK region boundaries

# HTML file the finished map is written to.
OUTPUT_MAP_FILE = "uk_salary_heatwave_map.html"

# Lookup keys: region name inside each GeoJSON feature's properties, and the
# job-listings column that carries the salary figure.
REGION_NAME_COLUMN = 'ITL121NM'
SALARY_COLUMN = 'median_annual_salary_gbp'


def create_salary_heatwave_map(job_listings_filepath, uk_boundaries_filepath, output_map_filepath=None):
    """
    Creates an interactive Folium choropleth ("heatwave") map of salary per UK region.

    For every region the median of SALARY_COLUMN over that region's job listings is
    computed; it is shown on the map under the label 'Average Annual Salary (GBP)'.
    The region name 'East of England' is rewritten to 'East' before aggregating, and
    rows whose salary cannot be parsed as a number are dropped.

    Args:
        job_listings_filepath (str): Path to the job listings CSV. Must contain the
            'region' column and the column named by SALARY_COLUMN.
        uk_boundaries_filepath (str): Path to the UK boundaries GeoJSON file. Region names
            are read from each feature's properties under REGION_NAME_COLUMN.
        output_map_filepath (str, optional): Where to save the HTML map. Defaults to the
            module-level OUTPUT_MAP_FILE, looked up when the function is called.

    Returns:
        str: A status message. Errors are not raised to the caller; they are caught and
        reported in the returned message.
    """
    try:
        # --- 1. Process Job Listings Data ---
        print(f"Processing job listings data from {job_listings_filepath}...")
        job_listings_df = pd.read_csv(job_listings_filepath)

        if 'region' not in job_listings_df.columns or SALARY_COLUMN not in job_listings_df.columns:
            raise ValueError(f"Job listings CSV must contain 'region' and '{SALARY_COLUMN}' columns.")

        # --- Standardize Region Names ---
        # Replace "East of England" with "East" in the 'region' column for consistency with GeoJSON
        initial_east_of_england_count = (job_listings_df['region'] == 'East of England').sum()
        job_listings_df['region'] = job_listings_df['region'].replace('East of England', 'East')
        print(f"Standardized region names: Replaced 'East of England' with 'East' for {initial_east_of_england_count} entries.")

        # --- Calculate the per-region salary figure ---
        # Coerce unparseable salaries to NaN, then drop those rows.
        job_listings_df[SALARY_COLUMN] = pd.to_numeric(job_listings_df[SALARY_COLUMN], errors='coerce')
        job_listings_df = job_listings_df.dropna(subset=[SALARY_COLUMN])

        # The statistic is the median; the "Average ..." column name is the display label.
        average_salaries = (
            job_listings_df.groupby('region')[SALARY_COLUMN]
            .median()
            .reset_index()
            .rename(columns={SALARY_COLUMN: 'Average Annual Salary (GBP)'})
        )
        salary_map = average_salaries.set_index('region')['Average Annual Salary (GBP)'].to_dict()
        print("Average salaries per region calculated.")

        # --- 2. Process UK Boundaries Data from GeoJSON ---
        print(f"Processing UK boundaries data from {uk_boundaries_filepath}...")
        with open(uk_boundaries_filepath, 'r', encoding='utf-8') as f:
            geojson_data = json.load(f)

        if "features" not in geojson_data or not isinstance(geojson_data["features"], list):
            raise ValueError("Invalid GeoJSON structure: 'features' array not found.")

        # `or {}` also covers features whose "properties" is an explicit null.
        if not any(REGION_NAME_COLUMN in (feature.get("properties") or {}) for feature in geojson_data["features"]):
            print(f"Warning: '{REGION_NAME_COLUMN}' not found in properties of any GeoJSON feature. "
                  "Please ensure this is the correct key for region names in your GeoJSON.", file=sys.stderr)

        # --- 3. Merge Salary Data into GeoJSON Properties ---
        print("Merging salary data into GeoJSON properties...")
        processed_features_count = 0
        for feature in geojson_data["features"]:
            properties = feature.get("properties")
            if not isinstance(properties, dict):
                properties = {}
                feature["properties"] = properties

            if REGION_NAME_COLUMN not in properties:
                # Still give the feature the tooltip key so every feature has the same shape.
                properties["Average Annual Salary (GBP)"] = None
                print(f"Warning: Feature missing 'properties' or '{REGION_NAME_COLUMN}' key. Skipping feature.", file=sys.stderr)
                continue

            # None (JSON null) when the region has no salary data.
            properties["Average Annual Salary (GBP)"] = salary_map.get(properties[REGION_NAME_COLUMN], None)
            processed_features_count += 1

        print(f"Merged data for {processed_features_count} GeoJSON features.")

        # --- 4. Create Interactive Folium Map with Choropleth ---
        print("Creating Folium heatwave map...")
        uk_center = [55.0, -3.0]
        m = folium.Map(location=uk_center, zoom_start=6)

        has_salary_data = any(
            feature["properties"].get("Average Annual Salary (GBP)") is not None
            for feature in geojson_data["features"]
        )
        if not has_salary_data:
            print("No valid salary data found to create a color scale. Map will not show color variations.", file=sys.stderr)

        # The Choropleth layer draws its own legend (captioned by legend_name) on the same
        # 'YlOrRd' scale it uses to fill the regions. No separate hand-built colormap is
        # added: a second legend with different colours would not match the fill.
        folium.Choropleth(
            geo_data=geojson_data,
            name='Average Salary Heatmap',
            data=average_salaries,
            columns=['region', 'Average Annual Salary (GBP)'],
            key_on=f'feature.properties.{REGION_NAME_COLUMN}',
            fill_color='YlOrRd',
            fill_opacity=0.7,
            line_opacity=0.2,
            legend_name='Average Annual Salary (GBP)',
            highlight=True  # Highlight regions on hover
        ).add_to(m)

        # Transparent overlay whose only job is to carry the tooltips and hover highlight.
        folium.GeoJson(
            geojson_data,
            name='Region Info',
            tooltip=GeoJsonTooltip(
                fields=[REGION_NAME_COLUMN, 'Average Annual Salary (GBP)'],
                aliases=['Region:', 'Avg. Salary (GBP):'],
                localize=True,
                sticky=False,
                labels=True,
                style="""
                    background-color: #F0EFEF;
                    color: #333333;
                    font-family: sans-serif;
                    font-size: 12px;
                    padding: 8px;
                    border-radius: 5px;
                    box-shadow: 0 2px 5px rgba(0,0,0,0.2);
                """
            ),
            # Set a transparent style for the GeoJson layer itself, as Choropleth handles fill
            style_function=lambda x: {
                'fillColor': '#ffffff00',  # Transparent fill
                'color': 'black',
                'weight': 0.5,
                'fillOpacity': 0
            },
            highlight_function=lambda x: {  # Keep highlight for visual feedback on hover
                'fillColor': '#FFC107',
                'color': 'black',
                'weight': 2,
                'fillOpacity': 0.7
            }
        ).add_to(m)

        folium.LayerControl().add_to(m)

        print("Heatwave map created successfully.")
        # Resolve the default at call time; an explicit argument always wins over the global.
        output_path = OUTPUT_MAP_FILE if output_map_filepath is None else output_map_filepath
        m.save(output_path)
        return f"Interactive heatwave map saved to {output_path}. Please open this file in your web browser to view the map."

    except FileNotFoundError as e:
        # Report the paths that were actually passed in, not the module-level defaults.
        return (
            "Error: One of the data files was not found. Please check that "
            f"'{job_listings_filepath}' and '{uk_boundaries_filepath}' exist at the given paths. "
            f"Details: {e}"
        )
    except json.JSONDecodeError as e:
        return f"Error: Could not parse the GeoJSON file '{uk_boundaries_filepath}'. Please ensure it's valid GeoJSON. Details: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"

# --- Build the salary map from the configured files and report the outcome ---
if __name__ == '__main__':
    print(create_salary_heatwave_map(JOB_LISTINGS_FILE, UK_BOUNDARIES_FILE))


Processing job listings data from master_enriched_job_listings_dataset.csv...
Standardized region names: Replaced 'East of England' with 'East' for 271 entries.
Average salaries per region calculated.
Processing UK boundaries data from uk_boundaries.geojson...
Merging salary data into GeoJSON properties...
Merged data for 12 GeoJSON features.
Creating Folium heatwave map...
Heatwave map created successfully.
Interactive heatwave map saved to uk_salary_heatwave_map.html. Please open this file in your web browser to view the map.
