In [38]:
import pandas as pd 
import numpy as np
from googletrans import Translator

translator = Translator()

def to_float(x):
    if type(x) == str and "," in x:
        x = x.replace(",", ".")
        return float(x)
    elif type(x) == str and x.isnumeric():
        return eval(x)
    return x

In [44]:
d1 = {
    "filename": "Waste Disposal-State-Year-Facility Type.csv",
    "Table": "32111-0011",
    "Columns": "Year, State, Facility Type, Waste disposal facilities, Input from waste disposal facilities (1000t), Waste produced in-house (1000t), \
        Waste supplied from within Germany (1000t), Waste supplied from abroad (1000t), \
            Waste disposal facilities with output, Output from waste disposal facilities (1000t), Waste for disposal (1000t), \
                Waste for recycling (1000t), Waste All procedures to be prepared (1000t), Handover to other (1000t)",
    "Read Lines": [6, 3845],
    "To Translate": {"State": 1, "Facility Type": 0},
    "NA": ["-","x", "."],
    "to_numeric": 4
}
d2 = {
    "filename": "Waste Disposal-States-Years.csv",
    "Table": "32111-0010",
    "Columns": "Year, State, Waste disposal facilities, Input from waste disposal facilities, \
        Waste generated in-house, Waste delivered from within the country, Waste delivered from abroad",
    "Read Lines": [6, 69],
    "To Translate": {'State': 1},
    "NA": ["-","x", "."],
    "to_numeric": 2
}
d3 = {
    "filename": "Waste Disposal-years-type of facility-types of waste.csv",
    "Table": "32111-0004",
    "Columns": "Year, Facility Type, EAV Code, Waste Type, Waste disposal plants, Input from waste disposal plants (1000t), Waste generated in-house (1000t), \
        Waste delivered from within the country (1000t), Waste delivered from abroad (1000t), Waste disposal plants with output, \
            Output from waste disposal plants (1000t), Waste for disposal (1000t), Waste for recycling (1000t), Waste for preparatory processes (1000t), Delivery to others (1000t)",
    "Read Lines": [8, 218167],
    "To Translate": {'Facility Type': 1, 'Waste Type':0},
    "NA": ['-'],
    "to_numeric": 4
}
d4 = {
    "filename": "Waste Disposal-years-type of facility.csv",
    "Table": "32111-0003",
    "Columns": "Year, Facility Type, Waste disposal facilities, Input from waste disposal facilities (1000t), Waste produced on-site (1000t), \
        Waste supplied from home (1000t), Waste supplied from abroad (1000t), Waste disposal facilities with output, \
            Output from waste disposal facilities (1000t), Waste for disposal (1000t), Waste for recycling (1000t), \
                Waste all procedures to be prepared (1000t), Handover to others (1000t)",
    "Read Lines": [7, 726],
    "To Translate": {'Facility Type': 0},
    "NA": ["-", "x", "."],
    "to_numeric": 2
}
d5 = {
    "filename": "Waste Disposal-years-types of waste.csv",
    "Table": "32111-0002",
    "Columns": "Year, EAV Code, Waste Type, Waste disposal facilities, Input from waste disposal facilities (1000t), Waste generated in-house (1000t), \
        Waste delivered from within the country (1000t), Waste delivered from abroad (1000t)",
    "Read Lines": [7, 4466],
    "To Translate": {'Waste Type': 0},
    "NA": ["-", "x", "."],
    "to_numeric": 3
}
d6 = {
    "filename": "Waste Disposal-years.csv",
    "Table": "32111-0001",
    "Columns": "Year, Waste disposal facilities, Input from waste disposal facilities (1000t), Waste generated in-house, \
        Waste delivered from within the country, Waste delivered from abroad (1000t)",
    "To Translate": None,
    "NA": ["-", "x", "."],
    "Read Lines": [7, 11],
    "to_numeric": 0
}

In [40]:
def clean_data(clean_guide):
    filename = clean_guide["filename"]
    genesis_table = clean_guide["Table"],
    column_names = clean_guide["Columns"]
    column_names = column_names.split(", ")
    skiprows = clean_guide['Read Lines'][0] - 1
    nrows = clean_guide['Read Lines'][1] - skiprows
    na_values = clean_guide['NA']
    to_translate = clean_guide["To Translate"]

    df = pd.read_csv(filename, sep=";", encoding="latin-1", header=None, names=column_names, skiprows=skiprows, nrows=nrows, na_values=na_values)

    # Convert to numeric
    for col in column_names[clean_guide['to_numeric']:]:
        df[col] = df[col].apply(lambda x: to_float(x))

    if to_translate:
        for k, v in to_translate.items():
            if v: # can be optimized
                unique_vals = list(df[k].unique())
                translated_unique_vals = [translator.translate(x).text for x in unique_vals]
                mappings = dict(zip(unique_vals, translated_unique_vals))
                df[k] = df[k].apply(lambda x: mappings[x])
                df[k] = df[k].str.title()
            else:
                df[k] = df[k].apply(lambda x: translator.translate(x).text)
    
    return df

In [13]:
waste_df1 = clean_data(d1)
waste_df1.head()

Unnamed: 0,Year,State,Facility Type,Waste disposal facilities,Input from waste disposal facilities (1000t),Waste produced in-house (1000t),Waste supplied from within Germany (1000t),Waste supplied from abroad (1000t),Waste disposal facilities with output,Output from waste disposal facilities (1000t),Waste for disposal (1000t),Waste for recycling (1000t),Waste All procedures to be prepared (1000t),Handover to other (1000t)
0,2006,Baden-Württemberg,landfills,416.0,5391.9,35.2,5255.9,100.7,,245.1,64.2,180.9,,
1,2006,Baden-Württemberg,Thermal waste treatment plants,10.0,1612.2,93.9,1516.2,2.1,,437.5,11.2,413.5,,12.8
2,2006,Baden-Württemberg,combustion plants,24.0,1201.3,107.4,1044.0,49.9,,65.8,12.2,53.5,,0.0
3,2006,Baden-Württemberg,Chemical-physical treatment plants,31.0,267.7,3.2,250.5,14.1,,123.4,10.4,109.9,,3.1
4,2006,Baden-Württemberg,Soil treatment plants,3.0,96.3,0.0,92.5,3.8,,95.9,37.6,58.2,,0.0


In [14]:
waste_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840 entries, 0 to 3839
Data columns (total 14 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Year                                                3840 non-null   int64  
 1   State                                               3840 non-null   object 
 2   Facility Type                                       3840 non-null   object 
 3   Waste disposal facilities                           2585 non-null   float64
 4   Input from waste disposal facilities (1000t)        2282 non-null   float64
 5   Waste produced in-house (1000t)                     1517 non-null   float64
 6           Waste supplied from within Germany (1000t)  2248 non-null   float64
 7   Waste supplied from abroad (1000t)                  1652 non-null   float64
 8               Waste disposal facilities with output   15 non-null     float64
 9

In [22]:
waste_df1.to_csv(f"cleaned_{d1['Table']}_Waste-Disposal-{d1['filename']}", index=False)

In [16]:
waste_df2 = clean_data(d2)
waste_df2.head()

Unnamed: 0,Year,State,Waste disposal facilities,Input from waste disposal facilities,Waste generated in-house,Waste delivered from within the country,Waste delivered from abroad
0,2018,Baden-Württemberg,1649,55746.4,1440.5,53144.3,1161.7
1,2018,Bayern,3777,71053.4,4554.3,66175.3,323.8
2,2018,Berlin,130,5164.9,206.1,4877.5,81.3
3,2018,Brandenburg,532,18289.7,789.5,17093.0,407.2
4,2018,Bremen,56,2791.0,130.7,2517.0,143.4


In [17]:
waste_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 7 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Year                                     64 non-null     int64  
 1   State                                    64 non-null     object 
 2   Waste disposal facilities                64 non-null     int64  
 3   Input from waste disposal facilities     64 non-null     float64
 4           Waste generated in-house         64 non-null     float64
 5   Waste delivered from within the country  64 non-null     float64
 6   Waste delivered from abroad              64 non-null     float64
dtypes: float64(4), int64(2), object(1)
memory usage: 3.6+ KB


In [20]:
waste_df1.columns = [x.strip() for x in waste_df1.columns]

In [25]:
waste_df2.columns = [x.strip() for x in waste_df2.columns]

In [29]:
waste_df2.to_csv(f"cleaned_{d2['Table']}_Waste-Disposal-{d2['filename']}", index=False)

In [37]:
waste_df3 = clean_data(d3)
waste_df3.head()

  df = pd.read_csv(filename, sep=";", encoding="latin-1", header=None, names=column_names, skiprows=skiprows, nrows=nrows, na_values=na_values)


ConnectTimeout: timed out

In [None]:
waste_df3.to_csv(f"cleaned_{d3['Table']}_Waste-Disposal-{d3['filename']}", index=False)

In [45]:
waste_df4 = clean_data(d4)
waste_df4.head()

Unnamed: 0,Year,Facility Type,Waste disposal facilities,Input from waste disposal facilities (1000t),Waste produced on-site (1000t),Waste supplied from home (1000t),Waste supplied from abroad (1000t),Waste disposal facilities with output,Output from waste disposal facilities (1000t),Waste for disposal (1000t),Waste for recycling (1000t),Waste all procedures to be prepared (1000t),Handover to others (1000t)
0,2006,landfills,1740.0,38727.1,9636.7,28635.8,454.6,,3752.6,2784.2,683.8,,284.5
1,2006,Class 0 landfills,1222.0,13337.2,1664.5,11672.7,,,394.0,146.5,183.7,,63.8
2,2006,Class I landfills,272.0,15811.9,6612.5,9149.1,50.3,,1258.8,995.7,133.2,,129.9
3,2006,Class II landfills,196.0,6691.2,394.1,6097.3,199.8,,1608.4,1240.9,362.3,,5.2
4,2006,Class III landfills,40.0,2715.2,965.2,1618.6,131.4,,481.4,401.1,0.0,,80.2


In [47]:
waste_df4.columns = [x.strip() for x in waste_df4.columns]

In [49]:
waste_df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 13 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Year                                           720 non-null    int64  
 1   Facility Type                                  720 non-null    object 
 2   Waste disposal facilities                      643 non-null    float64
 3   Input from waste disposal facilities (1000t)   640 non-null    float64
 4   Waste produced on-site (1000t)                 599 non-null    float64
 5   Waste supplied from home (1000t)               624 non-null    float64
 6   Waste supplied from abroad (1000t)             596 non-null    float64
 7   Waste disposal facilities with output          7 non-null      float64
 8   Output from waste disposal facilities (1000t)  569 non-null    float64
 9   Waste for disposal (1000t)                     565 non

In [50]:
waste_df4.to_csv(f"cleaned_{d4['Table']}_Waste-Disposal-{d4['filename']}", index=False)

In [46]:
waste_df4['Waste disposal facilities with output'].unique()

array([ nan, 314., 657., 697., 102.,  14.,   0.])

In [51]:
waste_df5 = clean_data(d5)
waste_df5.head()

Unnamed: 0,Year,EAV Code,Waste Type,Waste disposal facilities,Input from waste disposal facilities (1000t),Waste generated in-house (1000t),Waste delivered from within the country (1000t),Waste delivered from abroad (1000t)
0,2017,EAV-010101-U,Waste from mining v. metal-containing mineral ...,,,,,
1,2017,EAV-010102-U,Waste from mining v. non-metal Mineral resources,15.0,26619.5,26619.5,0.0,
2,2017,EAV-010304-G,Acid-forming processing residues a. sul. ore,,,,,
3,2017,EAV-010305-G,Other processing residues containing hazardous...,,,,,
4,2017,EAV-010306-U,Processing residues (without 010304 and 010305),10.0,6.3,3.6,2.7,


In [53]:
waste_df5.columns =[x.strip() for x in waste_df5.columns]

In [54]:
waste_df5.to_csv(f"cleaned_{d5['Table']}_Waste-Disposal-{d5['filename']}", index=False)

In [56]:
waste_df6 = clean_data(d6)
waste_df6.head()

Unnamed: 0,Year,Waste disposal facilities,Input from waste disposal facilities (1000t),Waste generated in-house,Waste delivered from within the country,Waste delivered from abroad (1000t)
0,2017,14625,417337.7,56730.8,351899.4,8707.5
1,2018,14196,421391.0,54037.8,359142.8,8210.4
2,2019,13876,420258.0,50261.1,362098.7,7898.3
3,2020,13744,417093.4,49541.7,360038.3,7513.4
4,2021,13660,414740.6,50852.9,356110.4,7777.2


In [58]:
waste_df6.columns = [x.strip() for x in waste_df6.columns]

In [59]:
waste_df6.to_csv(f"cleaned_{d6['Table']}_Waste-Disposal-{d6['filename']}", index=False)