# 7. Function Application

### Examples

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load sample datasets
tips = sns.load_dataset('tips')
flights = sns.load_dataset('flights')
titanic = sns.load_dataset('titanic')
iris = sns.load_dataset('iris')

print("Datasets loaded:")
print(f"Tips shape: {tips.shape}")
print(f"Flights shape: {flights.shape}")
print(f"Titanic shape: {titanic.shape}")
print(f"Iris shape: {iris.shape}")

Datasets loaded:
Tips shape: (244, 7)
Flights shape: (144, 3)
Titanic shape: (891, 15)
Iris shape: (150, 5)


In [3]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# Apply single function to all numeric columns
numeric_cols = tips.select_dtypes(include=[np.number])

print("Mean of numeric columns:")
print(numeric_cols.apply(np.mean))


Mean of numeric columns:
total_bill    19.785943
tip            2.998279
size           2.569672
dtype: float64


In [5]:
print("\nStandard deviation of numeric columns:")
print(numeric_cols.apply(np.std))


Standard deviation of numeric columns:
total_bill    8.884151
tip           1.380800
size          0.949149
dtype: float64


In [6]:
# Apply custom function
def coefficient_of_variation(series):
    """Calculate coefficient of variation (CV = std/mean)"""
    return series.std() / series.mean()

print("\nCoefficient of Variation:")
print(numeric_cols.apply(coefficient_of_variation))


Coefficient of Variation:
total_bill    0.449936
tip           0.461478
size          0.370125
dtype: float64


In [7]:
# Apply lambda function
print("\nRange (max - min):")
print(numeric_cols.apply(lambda x: x.max() - x.min()))


Range (max - min):
total_bill    47.74
tip            9.00
size           5.00
dtype: float64


In [8]:
# Multiple functions at once
print("\nMultiple statistics:")
print(numeric_cols.apply(['mean', 'std', 'min', 'max']))


Multiple statistics:
      total_bill        tip      size
mean   19.785943   2.998279  2.569672
std     8.902412   1.383638  0.951100
min     3.070000   1.000000  1.000000
max    50.810000  10.000000  6.000000


In [9]:
# Apply to each column
column_stats = titanic.apply(lambda x: x.nunique(), axis=0)
print("Unique values per column:")
print(column_stats)


Unique values per column:
survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64


In [10]:
# Multiple functions at once
print("\nMultiple statistics:")
print(numeric_cols.apply(['mean', 'std', 'min', 'max']))


Multiple statistics:
      total_bill        tip      size
mean   19.785943   2.998279  2.569672
std     8.902412   1.383638  0.951100
min     3.070000   1.000000  1.000000
max    50.810000  10.000000  6.000000


### Work Sheet 

In [11]:
file = r'/Users/teslim/TeslimWorkSpace/TheData/The_data_grp1.xlsx'

In [12]:
uk_pg4 = pd.read_excel(file, sheet_name='uk_polls', engine='openpyxl')
uk_pg4.head()

Unnamed: 0,Constituency code,Constituency,Sub-region,Region,GE2019 Implied results: MP name,GE2019 Implied results: Party,GE2019 Implied results: Electorate,GE2019 Implied results: Majority,Poll: Environment/climate change important issues at next election (%),Local green economy: GVA (¬£m),...,Tree canopy cover: Proportion of constituency (%),Tree canopy cover: Neighbourhoods with less than 10% tree canopy (No.),Tree canopy cover: Neighbourhoods with less than 10% tree canopy (%),Households without access to private vehicles (No.),Households without access to private vehicles (%),Bus services: Frequency per hour (2010),Bus services: Frequency per hour (2023),Bus services: Change in frequency of service 2010-2023 (%),Sewage overflows: Total number of spills within constituency,Sewage overflows: Total duration of spills within constituency (hrs)
0,E14001063,Aldershot,"Berkshire, Hampshire, and Surrey",South East,Leo Docherty,CON,77531,18174,0.71,152.4,...,0.31308,7.0,0.101449,7080,0.152219,33.55,9.09,-0.7291,64,366.75
1,E14001064,Aldridge-Brownhills,Staffordshire and the Black Country,West Midlands,Wendy Morton,CON,72512,22282,0.61,54.3,...,0.133292,28.0,0.466667,6663,0.163533,22.72,14.66,-0.3547,96,299.62
2,E14001065,Altrincham and Sale West,Greater Manchester,North West,Graham Brady,CON,73107,6138,0.64,150.1,...,0.135244,22.0,0.372881,6211,0.149217,15.66,9.55,-0.3901,1053,5740.32
3,E14001066,Amber Valley,Derbyshire,East Midlands,Nigel Mills,CON,70045,16897,0.66,53.1,...,0.140897,18.0,0.315789,7478,0.181898,25.5,9.97,-0.609,2322,15596.86
4,E14001067,Arundel and South Downs,East Sussex and West Sussex,South East,Andrew Griffith,CON,76681,22986,0.77,58.6,...,0.281761,6.0,0.101695,4071,0.093275,4.76,2.56,-0.4641,2282,34098.61


In [13]:
import sys 

# Step 1: View Python's default module search paths
print("üîç Current Python Module Search Paths:")
for path in sys.path:
    print(path)
 
# Step 2: Add your custom module directory to sys.path  
custom_module_path = '/Users/teslim/MyModules/teslim_data_utils'

# Check to avoid adding duplicates
if custom_module_path not in sys.path:
    sys.path.append(custom_module_path)
    print(f"\n‚úÖ Custom module path added: {custom_module_path}")
else:
    print(f"\n‚ö†Ô∏è Path already exists: {custom_module_path}")


üîç Current Python Module Search Paths:
/Users/teslim/anaconda3/envs/machine-learning-env/lib/python310.zip
/Users/teslim/anaconda3/envs/machine-learning-env/lib/python3.10
/Users/teslim/anaconda3/envs/machine-learning-env/lib/python3.10/lib-dynload

/Users/teslim/anaconda3/envs/machine-learning-env/lib/python3.10/site-packages
/Users/teslim/anaconda3/envs/machine-learning-env/lib/python3.10/site-packages/setuptools/_vendor

‚úÖ Custom module path added: /Users/teslim/MyModules/teslim_data_utils


In [14]:
import sys
sys.path.append('/Users/teslim/MyModules')

# Now try importing
from teslim_data_utils import (
    clean_column_headers,
    clean_numeric_column, 
    remove_duplicates,
    fill_missing,
    remove_outliers_iqr,
    clean_text_column
)

In [15]:
# Clean messy column names (removes spaces, special chars, etc.)
df_clean = clean_column_headers(uk_pg4)
df_clean.columns.tolist()

['constituency_code',
 'constituency',
 'subregion',
 'region',
 'ge2019_implied_results_mp_name',
 'ge2019_implied_results_party',
 'ge2019_implied_results_electorate',
 'ge2019_implied_results_majority',
 'poll_environmentclimate_change_important_issues_at_next_election',
 'local_green_economy_gva_m',
 'local_green_economy_of_total_gva',
 'local_green_economy_fte_jobs',
 'local_green_economy_fte_jobs_1',
 'energy_crisis_hotspots_neighbourhoods_no',
 'energy_crisis_hotspots_neighbourhoods',
 'energy_crisis_hotspots_average_energy_bill',
 'epcs_homes_in_bands_abc',
 'epcs_homes_in_bands_defg',
 'epcs_homes_in_bands_abc_no',
 'epcs_homes_in_bands_defg_no',
 'epcs_loft_insulation_recommendations',
 'epcs_cavity_wall_insulation_recommendations',
 'epcs_solid_wall_insulation_recommendations',
 'epcs_loft_insulation_recommendations_no',
 'epcs_cavity_insulation_recommendations_no',
 'epcs_solid_insulation_recommendations_no',
 'respiratory_disease_prevalence',
 'air_pollution_neighbourhoods

In [16]:
df_clean

Unnamed: 0,constituency_code,constituency,subregion,region,ge2019_implied_results_mp_name,ge2019_implied_results_party,ge2019_implied_results_electorate,ge2019_implied_results_majority,poll_environmentclimate_change_important_issues_at_next_election,local_green_economy_gva_m,...,tree_canopy_cover_proportion_of_constituency,tree_canopy_cover_neighbourhoods_with_less_than_10_tree_canopy_no,tree_canopy_cover_neighbourhoods_with_less_than_10_tree_canopy,households_without_access_to_private_vehicles_no,households_without_access_to_private_vehicles,bus_services_frequency_per_hour_2010,bus_services_frequency_per_hour_2023,bus_services_change_in_frequency_of_service_20102023,sewage_overflows_total_number_of_spills_within_constituency,sewage_overflows_total_duration_of_spills_within_constituency_hrs
0,E14001063,Aldershot,"Berkshire, Hampshire, and Surrey",South East,Leo Docherty,CON,77531,18174,0.71,152.4,...,0.313080,7.0,0.101449,7080,0.152219,33.55,9.09,-0.7291,64,366.75
1,E14001064,Aldridge-Brownhills,Staffordshire and the Black Country,West Midlands,Wendy Morton,CON,72512,22282,0.61,54.3,...,0.133292,28.0,0.466667,6663,0.163533,22.72,14.66,-0.3547,96,299.62
2,E14001065,Altrincham and Sale West,Greater Manchester,North West,Graham Brady,CON,73107,6138,0.64,150.1,...,0.135244,22.0,0.372881,6211,0.149217,15.66,9.55,-0.3901,1053,5740.32
3,E14001066,Amber Valley,Derbyshire,East Midlands,Nigel Mills,CON,70045,16897,0.66,53.1,...,0.140897,18.0,0.315789,7478,0.181898,25.50,9.97,-0.6090,2322,15596.86
4,E14001067,Arundel and South Downs,East Sussex and West Sussex,South East,Andrew Griffith,CON,76681,22986,0.77,58.6,...,0.281761,6.0,0.101695,4071,0.093275,4.76,2.56,-0.4641,2282,34098.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,W07000108,Swansea West,Wales,Wales,Geraint Davies,LAB,73780,11508,0.68,79.4,...,,,,14476,0.312346,32.89,21.40,-0.3493,998,4209.50
571,W07000109,Torfaen,Wales,Wales,Nick Thomas-Symonds,LAB,70218,3392,0.62,72.3,...,,,,7897,0.196248,22.95,10.39,-0.5469,3182,19765.50
572,W07000110,Vale of Glamorgan,Wales,Wales,Alun Cairns,CON,70148,2666,0.72,37.6,...,,,,7028,0.171089,17.32,8.17,-0.5284,2049,25562.50
573,W07000111,Wrexham,Wales,Wales,Sarah Atherton,CON,70870,4091,0.58,143.0,...,,,,8035,0.190016,19.72,11.34,-0.4250,1665,10547.75


In [17]:
df_clean = clean_text_column(df_clean)

TypeError: clean_text_column() missing 1 required positional argument: 'column_name'

In [43]:
def change_column_header(df):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(" ", "_")
    df.columns = df.columns.str.strip()
    return df


# calling the function 
change_column_header(uk_pg4)

Unnamed: 0,constituency_code,constituency,sub-region,region,ge2019_implied_results:_mp_name,ge2019_implied_results:_party,ge2019_implied_results:_electorate,ge2019_implied_results:_majority,poll:_environment/climate_change_important_issues_at_next_election_(%),local_green_economy:_gva_(¬£m),local_green_economy:_%_of_total_gva,local_green_economy:_fte_jobs,local_green_economy:_fte_jobs_(%),energy_crisis_hotspots:_neighbourhoods_(no.),energy_crisis_hotspots:_neighbourhoods_(%),energy_crisis_hotspots:_average_energy_bill_(¬£),epcs:_homes_in_bands_a/b/c_(%),epcs:_homes_in_bands_d/e/f/g_(%),epcs:_homes_in_bands_a/b/c_(no.),epcs:_homes_in_bands_d/e/f/g_(no.),epcs:_loft_insulation_recommendations_(%),epcs:_cavity_wall_insulation_recommendations_(%),epcs:_solid_wall_insulation_recommendations_(%),epcs:_loft_insulation_recommendations_(no.),epcs:_cavity_insulation_recommendations_(no.),epcs:_solid_insulation_recommendations_(no.),respiratory_disease_prevalence_(%),air_pollution:_neighbourhoods_where_no2_above_who_guidelines_(no.),air_pollution:_neighbourhoods_where_pm2.5_above_who_guidelines_(no.),air_pollution:_neighbourhoods_where_no2_above_who_guidelines_(%),air_pollution:_neighbourhoods_where_pm2.5_above_who_guidelines_(%),extreme_heat:_neighbourhoods_at_risk_(no.),extreme_heat:_neighbourhoods_at_risk_(%),flooding:_neighbourhoods_at_risk_(no.),flooding:_neighbourhoods_at_risk_(%),accessible_green_space:_m2_per_person,accessible_green_space:_rating_(compared_across_all_constituencies),tree_canopy_cover:_hectares,tree_canopy_cover:_proportion_of_constituency_(%),tree_canopy_cover:_neighbourhoods_with_less_than_10%_tree_canopy_(no.),tree_canopy_cover:_neighbourhoods_with_less_than_10%_tree_canopy_(%),households_without_access_to_private_vehicles_(no.),households_without_access_to_private_vehicles_(%),bus_services:_frequency_per_hour_(2010),bus_services:_frequency_per_hour_(2023),bus_services:_change_in_frequency_of_service_2010-2023_(%),sewage_overflows:_total_number_of_spills_within_constituency,sewage_overflows:_total_duration_of_spills_within_constituency_(hrs)
0,E14001063,Aldershot,"Berkshire, Hampshire, and Surrey",South East,Leo Docherty,CON,77531,18174,0.71,152.4,0.0223,1675.4,0.0339,12,0.173913,2490.0,0.505071,0.494929,24264,23777,0.173847,0.201805,0.117448,8350,9690,5640,0.076794,61,69,0.884058,1.000000,69,1.000000,69,1.000000,40.4,reasonable,1787.406293,0.313080,7.0,0.101449,7080,0.152219,33.55,9.09,-0.7291,64,366.75
1,E14001064,Aldridge-Brownhills,Staffordshire and the Black Country,West Midlands,Wendy Morton,CON,72512,22282,0.61,54.3,0.0314,789.7,0.0251,25,0.403226,2560.0,0.378564,0.621359,15372,25232,0.227811,0.232959,0.129371,9250,9460,5250,0.093871,60,60,1.000000,1.000000,2,0.033333,60,1.000000,37.0,reasonable,709.509779,0.133292,28.0,0.466667,6663,0.163533,22.72,14.66,-0.3547,96,299.62
2,E14001065,Altrincham and Sale West,Greater Manchester,North West,Graham Brady,CON,73107,6138,0.64,150.1,0.0376,1871.1,0.0309,5,0.083333,2540.0,0.368482,0.631518,15534,26624,0.190543,0.273677,0.175645,8030,11540,7400,0.087378,58,59,0.983051,1.000000,0,0.000000,53,0.898305,13.9,poor,688.806604,0.135244,22.0,0.372881,6211,0.149217,15.66,9.55,-0.3901,1053,5740.32
3,E14001066,Amber Valley,Derbyshire,East Midlands,Nigel Mills,CON,70045,16897,0.66,53.1,0.0265,772.2,0.0249,36,0.620690,2480.0,0.381681,0.618319,16103,26087,0.166121,0.116682,0.289384,7010,4920,12210,0.098224,12,57,0.210526,1.000000,4,0.070175,57,1.000000,61.5,good,1759.004203,0.140897,18.0,0.315789,7478,0.181898,25.50,9.97,-0.6090,2322,15596.86
4,E14001067,Arundel and South Downs,East Sussex and West Sussex,South East,Andrew Griffith,CON,76681,22986,0.77,58.6,0.0252,825.1,0.0249,7,0.116667,2520.0,0.368691,0.631309,16822,28803,0.248449,0.236407,0.157492,11340,10790,7190,0.084989,0,59,0.000000,1.000000,47,0.810345,58,1.000000,467.5,good,23545.789447,0.281761,6.0,0.101695,4071,0.093275,4.76,2.56,-0.4641,2282,34098.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,W07000108,Swansea West,Wales,Wales,Geraint Davies,LAB,73780,11508,0.68,79.4,0.0313,916.6,0.0230,12,0.179104,2370.0,0.391266,0.608734,19663,30592,0.226403,0.262919,0.299667,11380,13210,15060,,1,67,0.014925,1.000000,0,0.000000,61,0.910448,,,,,,,14476,0.312346,32.89,21.40,-0.3493,998,4209.50
571,W07000109,Torfaen,Wales,Wales,Nick Thomas-Symonds,LAB,70218,3392,0.62,72.3,0.0422,754.1,0.0239,25,0.416667,2390.0,0.495972,0.504028,21098,21440,0.186824,0.201707,0.150708,7950,8580,6410,,0,60,0.000000,1.000000,0,0.000000,57,0.950000,,,,,,,7897,0.196248,22.95,10.39,-0.5469,3182,19765.50
572,W07000110,Vale of Glamorgan,Wales,Wales,Alun Cairns,CON,70148,2666,0.72,37.6,0.0258,539.4,0.0227,8,0.142857,2350.0,0.467042,0.532958,20225,23079,0.175515,0.188311,0.214508,7600,8150,9290,,2,56,0.035714,1.000000,0,0.000000,48,0.857143,,,,,,,7028,0.171089,17.32,8.17,-0.5284,2049,25562.50
573,W07000111,Wrexham,Wales,Wales,Sarah Atherton,CON,70870,4091,0.58,143.0,0.0569,1388.6,0.0351,18,0.285714,2430.0,0.443197,0.556803,19541,24551,0.141807,0.116747,0.196037,6250,5150,8640,,0,61,0.000000,0.968254,0,0.000000,60,0.952381,,,,,,,8035,0.190016,19.72,11.34,-0.4250,1665,10547.75


In [44]:
uk_pg4.columns.to_list()

['constituency_code',
 'constituency',
 'sub-region',
 'region',
 'ge2019_implied_results:_mp_name',
 'ge2019_implied_results:_party',
 'ge2019_implied_results:_electorate',
 'ge2019_implied_results:_majority',
 'poll:_environment/climate_change_important_issues_at_next_election_(%)',
 'local_green_economy:_gva_(¬£m)',
 'local_green_economy:_%_of_total_gva',
 'local_green_economy:_fte_jobs',
 'local_green_economy:_fte_jobs_(%)',
 'energy_crisis_hotspots:_neighbourhoods_(no.)',
 'energy_crisis_hotspots:_neighbourhoods_(%)',
 'energy_crisis_hotspots:_average_energy_bill_(¬£)',
 'epcs:_homes_in_bands_a/b/c_(%)',
 'epcs:_homes_in_bands_d/e/f/g_(%)',
 'epcs:_homes_in_bands_a/b/c_(no.)',
 'epcs:_homes_in_bands_d/e/f/g_(no.)',
 'epcs:_loft_insulation_recommendations_(%)',
 'epcs:_cavity_wall_insulation_recommendations_(%)',
 'epcs:_solid_wall_insulation_recommendations_(%)',
 'epcs:_loft_insulation_recommendations_(no.)',
 'epcs:_cavity_insulation_recommendations_(no.)',
 'epcs:_solid_insulat

In [45]:
uk_pg4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575 entries, 0 to 574
Data columns (total 48 columns):
 #   Column                                                                  Non-Null Count  Dtype  
---  ------                                                                  --------------  -----  
 0   constituency_code                                                       575 non-null    object 
 1   constituency                                                            575 non-null    object 
 2   sub-region                                                              575 non-null    object 
 3   region                                                                  575 non-null    object 
 4   ge2019_implied_results:_mp_name                                         575 non-null    object 
 5   ge2019_implied_results:_party                                           575 non-null    object 
 6   ge2019_implied_results:_electorate                                      575 non-nu

In [46]:
numerical_columns = uk_pg4.select_dtypes(include=['float64', 'int64'])
categorical_columns = uk_pg4.select_dtypes(exclude=['float64', 'int64'])

In [47]:
numerical_columns

Unnamed: 0,ge2019_implied_results:_electorate,ge2019_implied_results:_majority,poll:_environment/climate_change_important_issues_at_next_election_(%),local_green_economy:_gva_(¬£m),local_green_economy:_%_of_total_gva,local_green_economy:_fte_jobs,local_green_economy:_fte_jobs_(%),energy_crisis_hotspots:_neighbourhoods_(no.),energy_crisis_hotspots:_neighbourhoods_(%),energy_crisis_hotspots:_average_energy_bill_(¬£),epcs:_homes_in_bands_a/b/c_(%),epcs:_homes_in_bands_d/e/f/g_(%),epcs:_homes_in_bands_a/b/c_(no.),epcs:_homes_in_bands_d/e/f/g_(no.),epcs:_loft_insulation_recommendations_(%),epcs:_cavity_wall_insulation_recommendations_(%),epcs:_solid_wall_insulation_recommendations_(%),epcs:_loft_insulation_recommendations_(no.),epcs:_cavity_insulation_recommendations_(no.),epcs:_solid_insulation_recommendations_(no.),respiratory_disease_prevalence_(%),air_pollution:_neighbourhoods_where_no2_above_who_guidelines_(no.),air_pollution:_neighbourhoods_where_pm2.5_above_who_guidelines_(no.),air_pollution:_neighbourhoods_where_no2_above_who_guidelines_(%),air_pollution:_neighbourhoods_where_pm2.5_above_who_guidelines_(%),extreme_heat:_neighbourhoods_at_risk_(no.),extreme_heat:_neighbourhoods_at_risk_(%),flooding:_neighbourhoods_at_risk_(no.),flooding:_neighbourhoods_at_risk_(%),accessible_green_space:_m2_per_person,tree_canopy_cover:_hectares,tree_canopy_cover:_proportion_of_constituency_(%),tree_canopy_cover:_neighbourhoods_with_less_than_10%_tree_canopy_(no.),tree_canopy_cover:_neighbourhoods_with_less_than_10%_tree_canopy_(%),households_without_access_to_private_vehicles_(no.),households_without_access_to_private_vehicles_(%),bus_services:_frequency_per_hour_(2010),bus_services:_frequency_per_hour_(2023),bus_services:_change_in_frequency_of_service_2010-2023_(%),sewage_overflows:_total_number_of_spills_within_constituency,sewage_overflows:_total_duration_of_spills_within_constituency_(hrs)
0,77531,18174,0.71,152.4,0.0223,1675.4,0.0339,12,0.173913,2490.0,0.505071,0.494929,24264,23777,0.173847,0.201805,0.117448,8350,9690,5640,0.076794,61,69,0.884058,1.000000,69,1.000000,69,1.000000,40.4,1787.406293,0.313080,7.0,0.101449,7080,0.152219,33.55,9.09,-0.7291,64,366.75
1,72512,22282,0.61,54.3,0.0314,789.7,0.0251,25,0.403226,2560.0,0.378564,0.621359,15372,25232,0.227811,0.232959,0.129371,9250,9460,5250,0.093871,60,60,1.000000,1.000000,2,0.033333,60,1.000000,37.0,709.509779,0.133292,28.0,0.466667,6663,0.163533,22.72,14.66,-0.3547,96,299.62
2,73107,6138,0.64,150.1,0.0376,1871.1,0.0309,5,0.083333,2540.0,0.368482,0.631518,15534,26624,0.190543,0.273677,0.175645,8030,11540,7400,0.087378,58,59,0.983051,1.000000,0,0.000000,53,0.898305,13.9,688.806604,0.135244,22.0,0.372881,6211,0.149217,15.66,9.55,-0.3901,1053,5740.32
3,70045,16897,0.66,53.1,0.0265,772.2,0.0249,36,0.620690,2480.0,0.381681,0.618319,16103,26087,0.166121,0.116682,0.289384,7010,4920,12210,0.098224,12,57,0.210526,1.000000,4,0.070175,57,1.000000,61.5,1759.004203,0.140897,18.0,0.315789,7478,0.181898,25.50,9.97,-0.6090,2322,15596.86
4,76681,22986,0.77,58.6,0.0252,825.1,0.0249,7,0.116667,2520.0,0.368691,0.631309,16822,28803,0.248449,0.236407,0.157492,11340,10790,7190,0.084989,0,59,0.000000,1.000000,47,0.810345,58,1.000000,467.5,23545.789447,0.281761,6.0,0.101695,4071,0.093275,4.76,2.56,-0.4641,2282,34098.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,73780,11508,0.68,79.4,0.0313,916.6,0.0230,12,0.179104,2370.0,0.391266,0.608734,19663,30592,0.226403,0.262919,0.299667,11380,13210,15060,,1,67,0.014925,1.000000,0,0.000000,61,0.910448,,,,,,14476,0.312346,32.89,21.40,-0.3493,998,4209.50
571,70218,3392,0.62,72.3,0.0422,754.1,0.0239,25,0.416667,2390.0,0.495972,0.504028,21098,21440,0.186824,0.201707,0.150708,7950,8580,6410,,0,60,0.000000,1.000000,0,0.000000,57,0.950000,,,,,,7897,0.196248,22.95,10.39,-0.5469,3182,19765.50
572,70148,2666,0.72,37.6,0.0258,539.4,0.0227,8,0.142857,2350.0,0.467042,0.532958,20225,23079,0.175515,0.188311,0.214508,7600,8150,9290,,2,56,0.035714,1.000000,0,0.000000,48,0.857143,,,,,,7028,0.171089,17.32,8.17,-0.5284,2049,25562.50
573,70870,4091,0.58,143.0,0.0569,1388.6,0.0351,18,0.285714,2430.0,0.443197,0.556803,19541,24551,0.141807,0.116747,0.196037,6250,5150,8640,,0,61,0.000000,0.968254,0,0.000000,60,0.952381,,,,,,8035,0.190016,19.72,11.34,-0.4250,1665,10547.75


In [48]:
numerical_columns.apply([max, min], axis=0).T

Unnamed: 0,max,min
ge2019_implied_results:_electorate,92578.0,51927.0
ge2019_implied_results:_majority,34557.0,27.0
poll:_environment/climate_change_important_issues_at_next_election_(%),0.84,0.52
local_green_economy:_gva_(¬£m),3095.9,23.1
local_green_economy:_%_of_total_gva,0.1586,0.0124
local_green_economy:_fte_jobs,29257.8,404.0
local_green_economy:_fte_jobs_(%),0.0599,0.0162
energy_crisis_hotspots:_neighbourhoods_(no.),69.0,0.0
energy_crisis_hotspots:_neighbourhoods_(%),0.884615,0.0
energy_crisis_hotspots:_average_energy_bill_(¬£),2920.0,2270.0


In [49]:
numerical_columns.apply(np.std, axis=0)

ge2019_implied_results:_electorate                                         3075.604101
ge2019_implied_results:_majority                                           7987.052680
poll:_environment/climate_change_important_issues_at_next_election_(%)        0.052248
local_green_economy:_gva_(¬£m)                                               138.540602
local_green_economy:_%_of_total_gva                                           0.016613
local_green_economy:_fte_jobs                                              1282.375263
local_green_economy:_fte_jobs_(%)                                             0.005648
energy_crisis_hotspots:_neighbourhoods_(no.)                                 12.559046
energy_crisis_hotspots:_neighbourhoods_(%)                                    0.180597
energy_crisis_hotspots:_average_energy_bill_(¬£)                             109.306441
epcs:_homes_in_bands_a/b/c_(%)                                                0.077742
epcs:_homes_in_bands_d/e/f/g_(%)         

In [50]:
column_stats = categorical_columns.apply(lambda x: x.nunique(), axis=0)
column_stats

constituency_code                                                      575
constituency                                                           575
sub-region                                                              40
region                                                                  10
ge2019_implied_results:_mp_name                                        541
ge2019_implied_results:_party                                            9
accessible_green_space:_rating_(compared_across_all_constituencies)      3
dtype: int64

In [51]:
def coefficient_of_variation(col):
    result = col.std() / col.mean()
    return col.std(), col.mean(), result

In [52]:
numerical_columns.apply(coefficient_of_variation, axis=0).T.head()

Unnamed: 0,0,1,2
ge2019_implied_results:_electorate,3078.282033,73386.533913,0.041946
ge2019_implied_results:_majority,7994.007015,13133.156522,0.608689
poll:_environment/climate_change_important_issues_at_next_election_(%),0.052293,0.699,0.074812
local_green_economy:_gva_(¬£m),138.66123,100.933565,1.373787
local_green_economy:_%_of_total_gva,0.016627,0.034859,0.476991


In [53]:
numerical_columns


Unnamed: 0,ge2019_implied_results:_electorate,ge2019_implied_results:_majority,poll:_environment/climate_change_important_issues_at_next_election_(%),local_green_economy:_gva_(¬£m),local_green_economy:_%_of_total_gva,local_green_economy:_fte_jobs,local_green_economy:_fte_jobs_(%),energy_crisis_hotspots:_neighbourhoods_(no.),energy_crisis_hotspots:_neighbourhoods_(%),energy_crisis_hotspots:_average_energy_bill_(¬£),epcs:_homes_in_bands_a/b/c_(%),epcs:_homes_in_bands_d/e/f/g_(%),epcs:_homes_in_bands_a/b/c_(no.),epcs:_homes_in_bands_d/e/f/g_(no.),epcs:_loft_insulation_recommendations_(%),epcs:_cavity_wall_insulation_recommendations_(%),epcs:_solid_wall_insulation_recommendations_(%),epcs:_loft_insulation_recommendations_(no.),epcs:_cavity_insulation_recommendations_(no.),epcs:_solid_insulation_recommendations_(no.),respiratory_disease_prevalence_(%),air_pollution:_neighbourhoods_where_no2_above_who_guidelines_(no.),air_pollution:_neighbourhoods_where_pm2.5_above_who_guidelines_(no.),air_pollution:_neighbourhoods_where_no2_above_who_guidelines_(%),air_pollution:_neighbourhoods_where_pm2.5_above_who_guidelines_(%),extreme_heat:_neighbourhoods_at_risk_(no.),extreme_heat:_neighbourhoods_at_risk_(%),flooding:_neighbourhoods_at_risk_(no.),flooding:_neighbourhoods_at_risk_(%),accessible_green_space:_m2_per_person,tree_canopy_cover:_hectares,tree_canopy_cover:_proportion_of_constituency_(%),tree_canopy_cover:_neighbourhoods_with_less_than_10%_tree_canopy_(no.),tree_canopy_cover:_neighbourhoods_with_less_than_10%_tree_canopy_(%),households_without_access_to_private_vehicles_(no.),households_without_access_to_private_vehicles_(%),bus_services:_frequency_per_hour_(2010),bus_services:_frequency_per_hour_(2023),bus_services:_change_in_frequency_of_service_2010-2023_(%),sewage_overflows:_total_number_of_spills_within_constituency,sewage_overflows:_total_duration_of_spills_within_constituency_(hrs)
0,77531,18174,0.71,152.4,0.0223,1675.4,0.0339,12,0.173913,2490.0,0.505071,0.494929,24264,23777,0.173847,0.201805,0.117448,8350,9690,5640,0.076794,61,69,0.884058,1.000000,69,1.000000,69,1.000000,40.4,1787.406293,0.313080,7.0,0.101449,7080,0.152219,33.55,9.09,-0.7291,64,366.75
1,72512,22282,0.61,54.3,0.0314,789.7,0.0251,25,0.403226,2560.0,0.378564,0.621359,15372,25232,0.227811,0.232959,0.129371,9250,9460,5250,0.093871,60,60,1.000000,1.000000,2,0.033333,60,1.000000,37.0,709.509779,0.133292,28.0,0.466667,6663,0.163533,22.72,14.66,-0.3547,96,299.62
2,73107,6138,0.64,150.1,0.0376,1871.1,0.0309,5,0.083333,2540.0,0.368482,0.631518,15534,26624,0.190543,0.273677,0.175645,8030,11540,7400,0.087378,58,59,0.983051,1.000000,0,0.000000,53,0.898305,13.9,688.806604,0.135244,22.0,0.372881,6211,0.149217,15.66,9.55,-0.3901,1053,5740.32
3,70045,16897,0.66,53.1,0.0265,772.2,0.0249,36,0.620690,2480.0,0.381681,0.618319,16103,26087,0.166121,0.116682,0.289384,7010,4920,12210,0.098224,12,57,0.210526,1.000000,4,0.070175,57,1.000000,61.5,1759.004203,0.140897,18.0,0.315789,7478,0.181898,25.50,9.97,-0.6090,2322,15596.86
4,76681,22986,0.77,58.6,0.0252,825.1,0.0249,7,0.116667,2520.0,0.368691,0.631309,16822,28803,0.248449,0.236407,0.157492,11340,10790,7190,0.084989,0,59,0.000000,1.000000,47,0.810345,58,1.000000,467.5,23545.789447,0.281761,6.0,0.101695,4071,0.093275,4.76,2.56,-0.4641,2282,34098.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,73780,11508,0.68,79.4,0.0313,916.6,0.0230,12,0.179104,2370.0,0.391266,0.608734,19663,30592,0.226403,0.262919,0.299667,11380,13210,15060,,1,67,0.014925,1.000000,0,0.000000,61,0.910448,,,,,,14476,0.312346,32.89,21.40,-0.3493,998,4209.50
571,70218,3392,0.62,72.3,0.0422,754.1,0.0239,25,0.416667,2390.0,0.495972,0.504028,21098,21440,0.186824,0.201707,0.150708,7950,8580,6410,,0,60,0.000000,1.000000,0,0.000000,57,0.950000,,,,,,7897,0.196248,22.95,10.39,-0.5469,3182,19765.50
572,70148,2666,0.72,37.6,0.0258,539.4,0.0227,8,0.142857,2350.0,0.467042,0.532958,20225,23079,0.175515,0.188311,0.214508,7600,8150,9290,,2,56,0.035714,1.000000,0,0.000000,48,0.857143,,,,,,7028,0.171089,17.32,8.17,-0.5284,2049,25562.50
573,70870,4091,0.58,143.0,0.0569,1388.6,0.0351,18,0.285714,2430.0,0.443197,0.556803,19541,24551,0.141807,0.116747,0.196037,6250,5150,8640,,0,61,0.000000,0.968254,0,0.000000,60,0.952381,,,,,,8035,0.190016,19.72,11.34,-0.4250,1665,10547.75


In [54]:
def change_columns(col):
    a = col.nlargest(5)
    b = col.nsmallest(6)
    
    return a, b 

In [55]:
ratio  = numerical_columns.apply(change_columns, axis=0)
ratio.T


Unnamed: 0,0,1
ge2019_implied_results:_electorate,296 92578 335 82314 36 81511 196 ...,574 51927 241 55555 42 56696 240 ...
ge2019_implied_results:_majority,50 34557 275 34511 276 33052 254 ...,502 27 523 64 545 151 118 207 12...
poll:_environment/climate_change_important_issues_at_next_election_(%),233 0.84 88 0.81 346 0.81 544 0.8...,192 0.52 511 0.53 79 0.54 399 0.5...
local_green_economy:_gva_(¬£m),109 3095.9 372 553.9 503 464.7 227 ...,269 23.1 268 23.5 91 27.5 189 31....
local_green_economy:_%_of_total_gva,95 0.1586 214 0.1586 454 0.1586 225 ...,493 0.0124 382 0.0130 167 0.0137 249 ...
local_green_economy:_fte_jobs,109 29257.8 372 6030.9 247 4326.2 2...,269 404.0 268 410.0 91 431.6 189 ...
local_green_economy:_fte_jobs_(%),95 0.0599 214 0.0599 454 0.0599 212 ...,42 0.0162 268 0.0190 269 0.0190 314 ...
energy_crisis_hotspots:_neighbourhoods_(no.),30 69 35 69 32 65 37 62 569 ...,427 0 471 0 22 1 68 1 157 1 5...
energy_crisis_hotspots:_neighbourhoods_(%),30 0.884615 35 0.884615 37 0.86111...,427 0.000000 471 0.000000 542 0.01587...
energy_crisis_hotspots:_average_energy_bill_(¬£),426 2920.0 434 2920.0 152 2910.0 342 ...,22 2270.0 519 2270.0 169 2290.0 297 ...
