In [20]:
import pandas as pd 
import numpy as np

from googletrans import Translator
translator = Translator()

def to_float(x):
    if type(x) == str and "," in x:
        x = x.replace(",", ".")
        return float(x)
    elif type(x) == str and x.isnumeric():
        return eval(x)
    return x

In [21]:
def clean_data(clean_guide):
    filename = clean_guide["filename"]
    genesis_table = clean_guide["Table"],
    column_names = clean_guide["Columns"]
    column_names = column_names.split(", ")
    skiprows = clean_guide['Read Lines'][0] - 1
    nrows = clean_guide['Read Lines'][1] - skiprows
    na_values = clean_guide['NA']
    to_translate = clean_guide["To Translate"]

    df = pd.read_csv(filename, sep=";", encoding="latin-1", header=None, names=column_names, skiprows=skiprows, nrows=nrows, na_values=na_values)

    # Convert to numeric
    for col in column_names[clean_guide['to_numeric']:]:
        df[col] = df[col].apply(lambda x: to_float(x))

    if to_translate:
        for k, v in to_translate.items():
            if v: # can be optimized
                unique_vals = list(df[k].unique())
                translated_unique_vals = [translator.translate(x).text for x in unique_vals]
                mappings = dict(zip(unique_vals, translated_unique_vals))
                df[k] = df[k].apply(lambda x: mappings[x])
                df[k] = df[k].str.title()
            else:
                df[k] = df[k].apply(lambda x: translator.translate(x).text)
    
    return df

In [22]:
d1 = {
    "filename": "Waste Disposal-volume of waste-federal states-years.csv",
    "Table": "32121-0002",
    "Columns": "Year, State, Amount Of Household Waste (1000t), Amount Of Household Waste Per Inhabitant (kg), Household Waste Disposed Of By The First Recipient (1000t), Household Waste Recycled By The First Recipient (1000t)",
    "Read Lines": [6, 311],
    "To Translate": {'State':1},
    "NA": ["-","x", "."],
    "to_numeric": 2
}
d2 = {
    "filename": "Waste Disposal-volume of waste-states-years-types of waste.csv",
    "Table": "32121-0003",
    "Columns": "Year, State, Waste Type, Amount Of Household Waste (1000t), Amount Of Household Waste Per Inhabitant (kg)",
    "Read Lines": [7, 1536],
    "To Translate": {'Waste Type': 1, 'State':1},
    "NA": ["-","x", "."],
    "to_numeric": 3
}
d3 = {
    "filename": "Waste Disposal-volume of waste-years-type of waste.csv",
    "Table": "32121-0001",
    "Columns": "Year, Waste Type, Amount Of Household Waste (1000t), Amount Of Household Waste Per Inhabitant (kg), Household Waste Disposed By The First Recipient (1000t), Household Waste Recycled By The First Recipient (1000t)",
    "Read Lines": [8, 349],
    "To Translate": {'Waste Type': 0},
    "NA": ["-","x", "."],
    "to_numeric": 2
}

In [23]:
waste_df1 = clean_data(d1)
waste_df1

Unnamed: 0,Year,State,Amount Of Household Waste (1000t),Amount Of Household Waste Per Inhabitant (kg),Household Waste Disposed Of By The First Recipient (1000t),Household Waste Recycled By The First Recipient (1000t)
0,2004,Baden-Württemberg,4533.9,423,1563.3,2970.6
1,2004,Bayern,5678.6,456,2044.0,3634.6
2,2004,Berlin,1475.0,435,924.2,550.8
3,2004,Brandenburg,991.2,386,537.8,453.4
4,2004,Bremen,321.0,484,167.1,153.9
...,...,...,...,...,...,...
301,2021,Saxony,1749.9,433,52.6,1697.3
302,2021,Saxony-Anhalt,1035.5,477,0.5,1035.0
303,2021,Schleswig-Holstein,1488.4,509,6.2,1482.1
304,2021,Thuringia,968.4,459,386.4,582.0


In [24]:
waste_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 6 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Year                                                        306 non-null    int64  
 1   State                                                       306 non-null    object 
 2   Amount Of Household Waste (1000t)                           306 non-null    float64
 3   Amount Of Household Waste Per Inhabitant (kg)               306 non-null    int64  
 4   Household Waste Disposed Of By The First Recipient (1000t)  306 non-null    float64
 5   Household Waste Recycled By The First Recipient (1000t)     306 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 14.5+ KB


In [25]:
waste_df2 = clean_data(d2)
waste_df2

Unnamed: 0,Year,State,Waste Type,Amount Of Household Waste (1000t),Amount Of Household Waste Per Inhabitant (kg)
0,2004,Baden-Württemberg,Household And Bulky Waste,1605.6,150.0
1,2004,Baden-Württemberg,Organic Waste Collected Separately,1220.5,114.0
2,2004,Baden-Württemberg,Recyclable Materials Collected Separately,1645.3,154.0
3,2004,Baden-Württemberg,Waste Electrical Appliances,,
4,2004,Baden-Württemberg,Other Waste,9.4,1.0
...,...,...,...,...,...
1525,2021,In Total,Household And Bulky Waste,16343.4,196.0
1526,2021,In Total,Organic Waste Collected Separately,11170.4,134.0
1527,2021,In Total,Recyclable Materials Collected Separately,12467.4,150.0
1528,2021,In Total,Waste Electrical Appliances,,


In [26]:
waste_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1530 entries, 0 to 1529
Data columns (total 5 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Year                                           1530 non-null   int64  
 1   State                                          1530 non-null   object 
 2   Waste Type                                     1530 non-null   object 
 3   Amount Of Household Waste (1000t)              1226 non-null   float64
 4   Amount Of Household Waste Per Inhabitant (kg)  1226 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 59.9+ KB


In [27]:
waste_df3 = clean_data(d3)
waste_df3

Unnamed: 0,Year,Waste Type,Amount Of Household Waste (1000t),Amount Of Household Waste Per Inhabitant (kg),Household Waste Disposed By The First Recipient (1000t),Household Waste Recycled By The First Recipient (1000t)
0,2004,Household and bulky waste,17045.8,207.0,16049.6,996.2
1,2004,"Household waste, commercial waste similar to h...",14452.4,175.0,14034.0,418.4
2,2004,Bulky waste,2593.4,31.0,2015.6,577.8
3,2004,Organic waste collected separately,8411.4,102.0,43.8,8367.6
4,2004,Waste from the bio bin,4121.1,50.0,13.4,4107.8
...,...,...,...,...,...,...
337,2021,Waste electrical appliances,,,,
338,2021,Other waste,274.8,3.0,101.1,173.7
339,2021,Other hazardous waste,84.0,1.0,23.7,60.3
340,2021,Other non-hazardous waste,190.9,2.0,77.4,113.4


In [28]:
waste_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 6 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   Year                                                     342 non-null    int64  
 1   Waste Type                                               342 non-null    object 
 2   Amount Of Household Waste (1000t)                        326 non-null    float64
 3   Amount Of Household Waste Per Inhabitant (kg)            326 non-null    float64
 4   Household Waste Disposed By The First Recipient (1000t)  317 non-null    float64
 5   Household Waste Recycled By The First Recipient (1000t)  326 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 16.2+ KB


In [None]:
waste_df1.to_csv(f"cleaned_{d1['Table']}_Waste-Disposal-Public-{d1['filename']}", index=False)
waste_df2.to_csv(f"cleaned_{d2['Table']}_Waste-Disposal-Public-{d2['filename']}", index=False)
waste_df3.to_csv(f"cleaned_{d3['Table']}_Waste-Disposal-Public-{d3['filename']}", index=False)