In [1]:
import pandas as pd

In [2]:

# Location of the source workbook, given relative to the notebook's working directory.
file_path = 'data/Male_Commuting_data.xlsx'
# Open the workbook once; later cells read the sheet names and the individual sheets from this handle.
all_sheets = pd.ExcelFile(file_path)  

In [3]:
# Sheet names "Flags 1" through "Flags 125"; they are added to the exclusion set in a later cell.
flag_sheets = set(map(lambda number: f"Flags {number}", range(1, 126)))

In [7]:
# Every sheet name the cleaning loop must skip: three fixed sheet names plus all "Flags N" sheets.
exclude = {"summary", "structure", "Tabellenblatt2"} | flag_sheets

In [8]:
# Maps each kept sheet name to its cleaned DataFrame.
cleaned_sheets = {}

# Lower-cased exclusion names, built once up front instead of once per sheet.
excluded_lower = {s.lower() for s in exclude}

# Number of leading rows of the parsed sheet that sit above the row holding the
# real column labels (the row with "DURATION (Labels)", "Zero minutes", ...).
HEADER_ROW_OFFSET = 11

for sheet_name in all_sheets.sheet_names:
    # Compare case-insensitively and ignore whitespace around the sheet name.
    if sheet_name.strip().lower() in excluded_lower:
        continue

    df = pd.read_excel(all_sheets, sheet_name=sheet_name)
    # Discard the rows above the table, promote the first remaining row to
    # column labels, then remove that label row from the data.
    df = df.iloc[HEADER_ROW_OFFSET:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)

    cleaned_sheets[sheet_name] = df

In [9]:
# Preview the first cleaned sheet (in insertion order) to check the header promotion worked.
sheet = list(cleaned_sheets)[0]
print(f"Sheet: {sheet}", cleaned_sheets[sheet].head(), sep="\n")

Sheet: Sheet 1
0                          DURATION (Labels) Zero minutes From 1 to 9 minutes  \
0                               GEO (Labels)          NaN                 NaN   
1  European Union - 27 countries (from 2020)        895.3              3714.5   
2  European Union - 28 countries (2013-2020)       1090.6              4328.2   
3      Euro area - 19 countries  (2015-2022)        624.2              3057.2   
4                                    Belgium         48.4                71.9   

0 From 1 to 14 minutes 1 minute or over From 10 to 14 minutes  \
0                  NaN              NaN                   NaN   
1               7822.7          28722.4                4108.2   
2               9170.1          34294.9                  4842   
3               6256.7          21383.1                3199.5   
4                161.5            744.3                  89.6   

0 From 15 to 19 minutes From 15 to 29 minutes From 20 to 29 minutes  \
0                   NaN             

In [10]:
# Sheet numbers per education level. The numbers 1-125 are taken in groups of
# 25; inside every group, each education level owns one run of 5 consecutive
# sheet numbers, in the order the labels are listed below.
education_levels = {
    label: [
        sheet_no
        for group_start in range(1, 126, 25)
        for sheet_no in range(group_start + 5 * slot, group_start + 5 * slot + 5)
    ]
    for slot, label in enumerate(
        (
            "All ISCED 2011 levels",
            "ISCED 0–2",
            "ISCED 3–4",
            "ISCED 5–8",
            "No response",
        )
    )
}

# Sheet numbers per degree of urbanisation: each label owns one contiguous
# run of 25 sheet numbers, starting at 1, 26, 51, 76 and 101 respectively.
degree_of_urbanisation = {
    label: list(range(first_sheet, first_sheet + 25))
    for first_sheet, label in zip(
        range(1, 126, 25),
        ("Total", "Cities", "Towns and suburbs", "Rural areas", "No response"),
    )
}

# Sheet numbers per age class: the classes cycle every 5 sheets, so the k-th
# label (counting from 1) owns sheet numbers k, k + 5, k + 10, ... up to 125.
age_class = {
    label: list(range(first_sheet, 126, 5))
    for first_sheet, label in enumerate(
        (
            "From 15 to 34 years",
            "From 15 to 74 years",
            "From 20 to 64 years",
            "From 35 to 49 years",
            "From 50 to 74 years",
        ),
        start=1,
    )
}


In [11]:
def lookup_category(index, mapping):
    """Return the label whose collection of indices contains ``index``.

    Parameters
    ----------
    index
        Value to look for (a sheet number in this notebook).
    mapping
        Dict mapping each label to a collection of indices.

    Returns
    -------
    The first label, in the dict's iteration order, whose collection contains
    ``index``; ``None`` when no collection contains it.
    """
    return next(
        (label for label, members in mapping.items() if index in members),
        None,
    )

In [12]:
import re

def extract_sheet_number(name):
    """Return the first run of digits in ``name`` as an int.

    Returns ``None`` when ``name`` contains no digit at all.
    """
    digits = re.search(r'\d+', name)
    if digits is None:
        return None
    return int(digits.group())

# Order the kept sheets numerically by the number in their name (so "Sheet 10" follows "Sheet 9").
sorted_sheet_names = sorted(cleaned_sheets, key=extract_sheet_number)

In [13]:
# One annotated copy of every cleaned sheet, in numeric sheet order.
final_sheets = []

for sheet_name in sorted_sheet_names:
    # The number in the sheet name determines all three category labels.
    idx = extract_sheet_number(sheet_name)

    # assign() returns a new frame, so the entry in cleaned_sheets stays untouched.
    df = cleaned_sheets[sheet_name].assign(
        sheet_name=sheet_name,
        education_level=lookup_category(idx, education_levels),
        urbanisation_degree=lookup_category(idx, degree_of_urbanisation),
        age_class=lookup_category(idx, age_class),
        sex="Male",
    )

    final_sheets.append(df)


In [14]:
# Show the first rows of the first annotated sheet as a quick visual check.
print(final_sheets[0].head())

0                          DURATION (Labels) Zero minutes From 1 to 9 minutes  \
0                               GEO (Labels)          NaN                 NaN   
1  European Union - 27 countries (from 2020)        895.3              3714.5   
2  European Union - 28 countries (2013-2020)       1090.6              4328.2   
3      Euro area - 19 countries  (2015-2022)        624.2              3057.2   
4                                    Belgium         48.4                71.9   

0 From 1 to 14 minutes 1 minute or over From 10 to 14 minutes  \
0                  NaN              NaN                   NaN   
1               7822.7          28722.4                4108.2   
2               9170.1          34294.9                  4842   
3               6256.7          21383.1                3199.5   
4                161.5            744.3                  89.6   

0 From 15 to 19 minutes From 15 to 29 minutes From 20 to 29 minutes  \
0                   NaN                   NaN      

In [15]:
# Stack all annotated sheets into one frame; ignore_index=True renumbers the rows 0..n-1.
combined_df = pd.concat(final_sheets, ignore_index=True)

In [16]:
# Drop rows whose first column holds a non-data marker: the "GEO (Labels)"
# label row that follows each header, plus "Special value" and ":" rows
# (presumably the legend at the bottom of each sheet; verify against the workbook).
combined_df = combined_df.loc[
    lambda frame: ~frame.iloc[:, 0].isin(["GEO (Labels)", "Special value", ":"])
].reset_index(drop=True)

In [17]:
# The first column carries the header text "DURATION (Labels)" but holds the geography names
# (see the preview output), so give it a descriptive name.
combined_df = combined_df.rename(columns={"DURATION (Labels)": "Region"})

In [18]:
# List the final column names to confirm the rename and the added annotation columns.
print(combined_df.columns.tolist())

['Region', 'Zero minutes', 'From 1 to 9 minutes', 'From 1 to 14 minutes', '1 minute or over', 'From 10 to 14 minutes', 'From 15 to 19 minutes', 'From 15 to 29 minutes', 'From 20 to 29 minutes', 'From 30 to 44 minutes', 'From 30 to 59 minutes', '30 minutes or over', 'From 45 to 59 minutes', '60 minutes or over', 'No response', 'sheet_name', 'education_level', 'urbanisation_degree', 'age_class', 'sex']


In [19]:
# Write the cleaned male dataset into the working directory; index=False leaves out the row index.
combined_df.to_excel("cleaned_commuting_data_male.xlsx", index=False)

In [None]:
# now let's join male and female data

In [24]:
# Reload both cleaned per-sex files from disk.
# NOTE(review): the female file is not written by any cell visible here; presumably it comes from
# a separate run of this pipeline on the female workbook. Confirm it exists before running.
female_df = pd.read_excel("cleaned_commuting_data_female.xlsx")
male_df = pd.read_excel("cleaned_commuting_data_male.xlsx")

In [25]:
# Sanity check: print each frame's shape and the distinct values in its "sex" column.
print("Female:", female_df.shape, female_df["sex"].unique())
print("Male:", male_df.shape, male_df["sex"].unique())

Female: (4500, 20) ['Female']
Male: (4500, 20) ['Male']


In [26]:
# Stack female rows on top of male rows; ignore_index=True renumbers the combined rows.
combined_sex_df = pd.concat([female_df, male_df], ignore_index=True)


In [27]:
# Sanity check: row count per value of "sex" in the combined frame.
print(combined_sex_df["sex"].value_counts())


sex
Female    4500
Male      4500
Name: count, dtype: int64


In [28]:
# Write the combined female + male dataset into the working directory, without the row index.
combined_sex_df.to_excel("cleaned_commuting_data_all.xlsx", index=False)