In [1]:
import pandas as pd
import numpy as np

# Data cleaning

The big dataset has some issues:

- Possible white space in strings
- Column names have spaces instead of _
- Column names and certain dimensions (age) have long names
- Dates and floats are stored as strings
- Null values are actually stored as `:`

In [None]:
# Strip leading/trailing white space from every field.
# Each column is cast to str so the .str accessor is available everywhere, but
# cells that were missing are restored as real nulls afterwards: a plain
# astype(str) would turn them into the literal text 'nan', which the later
# dropna()/isna() cells could no longer detect.

df = df.apply(lambda col: col.astype(str).str.strip().where(col.notna()))

In [None]:
# Rename the commuting-time metric columns to short labels.
# The last entry used to be the single string 'No response: No_response',
# which is a syntax error inside a dict literal; it is now a proper key/value
# pair (assumes the source column is named 'No response' - confirm against the data).

df.rename(
    columns={
        'Zero minutes': '0',
        'From 1 to 9 minutes': '1-9',
        'From 1 to 14 minutes': '1-14',
        '1 minute or over': '1+',
        'From 10 to 14 minutes': '10-14',
        'From 15 to 19 minutes': '15-19',
        'From 15 to 29 minutes': '15-29',
        'From 20 to 29 minutes': '20-29',
        'From 30 to 44 minutes': '30-44',
        'From 30 to 59 minutes': '30-59',
        '30 minutes or over': '30+',
        'From 45 to 59 minutes': '45-59',
        '60 minutes or over': '60+',
        'No response': 'No_response',
    },
    inplace=True,
)


In [None]:
# Give the remaining dimension columns short names

df.rename(
    columns={
        'Time': 'Year',
        'International Standard Classification of Education (ISCED 2011)': 'Education',
        'Degree of urbanisation': 'Urbanization_degree',
    },
    inplace=True,
)


In [None]:
# Shorten the age-class labels by removing the filler words:
# 'From ' is dropped, ' to ' becomes '-', and ' years' is dropped.

df['Age class'] = (
    df['Age class']
    .str.replace('From ', '')
    .str.replace(' to ', '-')
    .str.replace(' years', '')
)

In [None]:
# Parse the Year column into datetime values using a year-only format

year_format = '%Y'
df['Year'] = pd.to_datetime(df['Year'], format=year_format)

In [None]:
# Convert the metric columns from text to numbers:
# remove the '.' characters, swap the ',' for a '.', then parse.
# Anything that still cannot be parsed becomes NaN (errors='coerce').

columns_to_convert = [
    '0', '1-9', '1-14', '1+', '10-14', '15-19', '15-29',
    '20-29', '30-44', '30-59', '30+', '45-59', '60+', 'No_response',
]

for col in columns_to_convert:
    without_dots = df[col].str.replace('.', '', regex=False)
    dot_decimal = without_dots.str.replace(',', '.', regex=False)
    df[col] = pd.to_numeric(dot_decimal, errors='coerce')




In [None]:
# create null values out of : if the previous code does not work

# df = df.replace(to_replace=':.*', value=np.nan, regex=True)

In [None]:
# Drop rows that have no Region (such rows are supposed to be empty).
# The white-space stripping cell casts every field to str, so a missing Region
# can appear as the text 'nan'/'None' or as an empty string instead of a real
# null. Turn those placeholders back into NaN first so dropna() catches them.

df['Region'] = df['Region'].mask(df['Region'].isin(['', 'nan', 'None']))
df.dropna(subset=['Region'], inplace=True)

In [None]:
# Number of missing values in each column

df.isnull().sum()

# Overlapping dimensions

Commuting dimensions are overlapping, e.g. "From 1 to 9 minutes" and "From 1 to 14 minutes".

To make it manageable, I suggest storing the data in separate DataFrames, one per level of granularity.

In [None]:
#granular_labels = ['0','1-9', '10-14', '15-19', '20-29', '30-44', '45-59', '60+', 'No_response']

#medium_labels = ['0', '1-14', '15-29', '30-59', '60+', 'No_response']

#broad_labels = ['0', '1+', '30+', '60+', 'No_response']



In [None]:
# The metric columns fall into three overlapping groups of time bins.
fine_bins = ['1-9', '10-14', '15-19', '20-29', '30-44', '45-59']
medium_bins = ['1-14', '15-29', '30-59']
broad_bins = ['1+', '30+']

# Each granularity level drops the bins that belong to the other two groups.
granular_labels_drop = medium_bins + broad_bins

medium_labels_drop = fine_bins + broad_bins

broad_labels_drop = fine_bins + medium_bins



In [None]:
# Granular view: df without the columns listed in granular_labels_drop.
# The original cell called dr_granular.drop(...), a typo that raised NameError.
# DataFrame.drop returns a new frame, so df itself is left untouched.
df_granular = df.drop(columns=granular_labels_drop)

In [None]:
# Medium view: df without the columns listed in medium_labels_drop.
# The original cell called dr_medium.drop(...), a typo that raised NameError.
# DataFrame.drop returns a new frame, so df itself is left untouched.
df_medium = df.drop(columns=medium_labels_drop)

In [None]:
# Broad view: df without the columns listed in broad_labels_drop.
# The original cell called dr_broad.drop(...), a typo that raised NameError.
# DataFrame.drop returns a new frame, so df itself is left untouched.
df_broad = df.drop(columns=broad_labels_drop)