In [16]:
import json

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [13]:
# URL prefix for the scraped pages; the scraping loop appends a numeric page id
# (str(i)) to this string to form each request URL.
base_link='https://election.news.sky.com/elections/general-election-2024/'

In [10]:
def json_to_dataframe(data, i):
    """Flatten one page's parsed ``__NEXT_DATA__`` payload into a DataFrame.

    Parameters
    ----------
    data : dict
        Parsed JSON payload. The function reads
        ``props.pageProps.fallbackData.votingArea`` (``nominations``,
        ``electorate``, ``turnoutPercentage``, ``electedCandidate`` and
        ``previouslyElectedCandidate``) and
        ``props.pageProps.components.analytics.name``.
    i : int
        Page id used to fetch the payload; stored in the ``index`` column.

    Returns
    -------
    pandas.DataFrame
        The normalized ``nominations`` records (nested keys joined with
        ``_``), plus the same constituency-level values repeated on every row.

    Raises
    ------
    KeyError
        If any of the expected keys is missing from ``data`` or if the
        normalized nominations have no ``party_name`` column.
    """
    page_props = data['props']['pageProps']
    voting_area = page_props['fallbackData']['votingArea']

    # Nested dicts inside each nomination become `<outer>_<inner>` columns.
    frame = pd.json_normalize(voting_area['nominations'], sep='_')

    frame['constituency'] = page_props['components']['analytics']['name']
    # Strip Windows-style line breaks embedded in party names.
    frame['party_name'] = frame['party_name'].str.replace('\r\n', '')

    # Scalars describing the whole voting area, broadcast to every row.
    area_level_values = {
        'Electorate_size': voting_area['electorate'],
        'Turnout_perc': voting_area['turnoutPercentage'],
        'new_party': voting_area['electedCandidate']['party']['name'],
        'old_party': voting_area['previouslyElectedCandidate']['party']['name'],
    }
    for column_name, value in area_level_values.items():
        frame[column_name] = value

    frame['index'] = i
    return frame

In [11]:
# Columns expected from the flattened per-candidate nomination records.
candidate_columns = [
    'voteCount',
    'votePercentage',
    'votePercentageChange',
    'candidate_firstName',
    'candidate_surname',
    'party_abbreviation',
    'party_name',
    'party_displayName',
]

# Columns that json_to_dataframe adds with one value per scraped page.
page_columns = [
    'constituency',
    'Electorate_size',
    'Turnout_perc',
    'new_party',
    'old_party',
    'index',
]

# Full schema of the combined results table, in display order.
columns = candidate_columns + page_columns

# Empty frame carrying that schema; the scraping loop further down fills `df`.
df = pd.DataFrame(columns=columns)

In [20]:
# Page ids appended to `base_link`; 1..650 inclusive are requested.
FIRST_PAGE_ID, LAST_PAGE_ID = 1, 650
# requests applies no timeout by default, so a stalled connection would
# otherwise block the whole scrape indefinitely.
REQUEST_TIMEOUT_S = 30

# Per-page frames are gathered here and concatenated once after the loop.
# Growing `df` with an outer merge on every iteration is quadratic, and it
# joins rather than appends rows that happen to match on every shared column.
frames = []

for i in range(FIRST_PAGE_ID, LAST_PAGE_ID + 1):
    url = base_link + str(i)

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as exc:
        # Report and move on, matching how HTTP-level failures are handled.
        print(f"Failed to retrieve data from URL: {url}. Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data from URL: {url}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # The page data is read from the JSON inside <script id="__NEXT_DATA__">.
    script_tag = soup.find('script', id='__NEXT_DATA__')
    if not script_tag:
        print(f"Script tag with id='__NEXT_DATA__' not found on {url}")
        continue

    data_dict = json.loads(script_tag.string)
    frames.append(json_to_dataframe(data_dict, i))

if frames:
    scraped = pd.concat(frames, ignore_index=True)
    # Keep the declared column order first (absent columns become NaN), then
    # any columns the pages contained that the schema did not anticipate.
    extra_columns = [name for name in scraped.columns if name not in columns]
    df = scraped.reindex(columns=list(columns) + extra_columns)


In [22]:
# Check the scraped frame's column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4515 entries, 0 to 4514
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   voteCount             4515 non-null   int64  
 1   votePercentage        4515 non-null   float64
 2   votePercentageChange  4515 non-null   float64
 3   candidate_firstName   4515 non-null   object 
 4   candidate_surname     4515 non-null   object 
 5   party_abbreviation    4515 non-null   object 
 6   party_name            4515 non-null   object 
 7   party_displayName     4515 non-null   object 
 8   constituency          4515 non-null   object 
 9   Electorate_size       4515 non-null   int64  
 10  Turnout_perc          4515 non-null   float64
 11  new_party             4515 non-null   object 
 12  old_party             4515 non-null   object 
 13  index                 4515 non-null   int64  
dtypes: float64(3), int64(3), object(8)
memory usage: 494.0+ KB


In [140]:
# List the distinct constituency names to eyeball them for oddities.
df['constituency'].unique()

array(['Aberafan Maesteg', 'Aberdeen North', 'Aberdeen South',
       'Aberdeenshire N & Moray E', 'Aberdeenshire W & Kincardine',
       'Airdrie & Shotts', 'Aldershot', 'Aldridge-Brownhills',
       'Alloa & Grangemouth', 'Altrincham & Sale West',
       'Alyn and Deeside', 'Amber Valley', 'Angus & Perthshire Glens',
       'Antrim East', 'Antrim North', 'Antrim South',
       'Arbroath & Broughty Ferry', 'Argyll, Bute & S Lochaber',
       'Arundel & South Downs', 'Ashfield', 'Ashford',
       'Ashton-under-Lyne', 'Aylesbury', 'Ayr, Carrick & Cumnock',
       'Ayrshire Central', 'Ayrshire North & Arran', 'Banbury',
       'Bangor Aberconwy', 'Barking', 'Barnsley North', 'Barnsley South',
       'Barrow & Furness', 'Basildon & Billericay',
       'Basildon South & E Thurrock', 'Basingstoke', 'Bassetlaw', 'Bath',
       'Bathgate & Linlithgow', 'Battersea', 'Beaconsfield',
       'Beckenham & Penge', 'Bedford', 'Bedfordshire Mid',
       'Bedfordshire North', 'Belfast East', 'Belfast 

In [23]:
# Count distinct constituencies; 650 page ids were requested, so a lower
# number would indicate failed or duplicated pages.
df['constituency'].nunique()

650

In [142]:
# Spot-check five randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(5)

Unnamed: 0,voteCount,votePercentage,votePercentageChange,candidate_firstName,candidate_surname,party_abbreviation,party_name,party_displayName,constituency,Electorate_size,Turnout_perc,new_party,old_party
2507,3826,9.145015,5.808735,Sharmen,Rahman,Green,Green,Green,Leicester South,70867,59.035941,Independent,Labour
1193,2151,4.557493,2.66292,Te Ata,Browne,Green,Green,Green,Crewe & Nantwich,78423,60.182599,Labour,Conservative
3068,1336,3.81638,0.063795,David,Schmitz,LD,Liberal Democrat,Lib Dem,Nottingham North & Kimberley,73768,47.455536,Labour,Labour
1934,1036,2.829132,-2.094683,John,Lawson,LD,Liberal Democrat,Lib Dem,Great Grimsby & Cleethorpes,77050,47.526282,Labour,Conservative
3847,650,1.519757,1.519757,Alexander,Bramham,Soc Dem,Social Democratic Party,SDP,"Stone, Gt Wyrley & Penkridge",69378,61.647785,Conservative,Conservative
