In [1]:
import pandas as pd
import numpy as np

from googletrans import Translator
# Shared googletrans client; waste_processing_clean calls translator.translate(...) on it.
translator = Translator()

def to_float(x):
    """Convert a numeric string (optionally using a decimal comma) to ``float``.

    Parameters
    ----------
    x : object
        A single cell value. Only ``str`` instances are inspected.

    Returns
    -------
    object
        * ``float`` when ``x`` is a string that parses as a number, either
          directly (``"386"``) or after replacing ``","`` with ``"."``
          (``"6107,5"``).
        * ``x`` unchanged when it is not a string, or when it is a comma-free
          string that is not numeric (e.g. a leftover marker such as ``"x"``).

    Raises
    ------
    ValueError
        If ``x`` contains a comma but is still not a valid number after the
        comma is replaced (same as before this change).
    """
    # isinstance also accepts str subclasses, which ``type(x) == str`` rejected.
    if not isinstance(x, str):
        return x
    if "," in x:
        return float(x.replace(",", "."))
    # A comma-free numeric string used to be returned as ``str``, which left
    # mixed str/float columns behind. Parse it, but keep non-numeric text as is.
    try:
        return float(x)
    except ValueError:
        return x

In [5]:
# Clean guides consumed by waste_processing_clean, one per raw CSV export.
#   "filename"     - CSV file handed to pd.read_csv.
#   "Table"        - table identifier, also used in the cleaned output file name.
#   "Columns"      - column names as one ", "-separated string.
#   "Read Lines"   - [first, last] 1-based line numbers of the data rows to read.
#   "To Translate" - {column: flag}; the column is translated, and a truthy flag
#                    additionally title-cases the translated text.
#   "NA"           - value(s) passed to pd.read_csv as na_values.
#   "to_numeric"   - index of the first column that is run through to_float.
d1 = {
    "filename": "Asphalt mixing plants.csv",
    "Table": "32141-0003",
    "Columns": ", ".join([
        "Year",
        "EAV Code",
        "Waste Source",
        "Number Of Asphalt Mixing Plants",
        "Input Of Asphalt Mixing Plants (1000t)",
    ]),
    "Read Lines": [7, 166],
    "To Translate": {"Waste Source": 0},
    "NA": "-",
    "to_numeric": 3,
}

d2 = {
    "filename": "Building rubble processing plants.csv",
    "Table": "32141-0002",
    "Columns": ", ".join([
        "Year",
        "EAV Code",
        "Waste Source",
        "Number Of Construction Rubble Processing Plants",
        "Input Of Construction Rubble Processing Plants (1000t)",
        "Number Of Construction Rubble Processing Plants With Output",
        "Output Of Construction Rubble Processing Plants (1000t)",
    ]),
    "Read Lines": [8, 1047],
    "To Translate": {"Waste Source": 0},
    "NA": "-",
    "to_numeric": 3,
}

d3 = {
    "filename": "Rubble and asphalt mixing plants by federal states.csv",
    "Table": "32141-0004",
    "Columns": ", ".join([
        "Year",
        "State",
        "Number Of Rubbish Processing Plants",
        "Input Of Rubble Processing Plants (1000t)",
        "Capacity Of Rubble Processing Plants (1000t)",
        "Number Of Rubbish Processing Plants With Output",
        "Output Of Rubble Processing Plants (1000t)",
        "Number Of Asphalt Mixing Plants",
        "Input Of Asphalt Mixing Plants (1000t)",
    ]),
    "Read Lines": [7, 142],
    "To Translate": {"State": 1},
    "NA": [".", "x"],
    "to_numeric": 2,
}

d4 = {
    "filename": "Rubble processing plants, asphalt mixing plants.csv",
    "Table": "32141-0001",
    "Columns": ", ".join([
        "Year",
        "Plant System",
        "Number Of Rubbish Processing Plants",
        "Input Of Rubble Processing Plants (1000t)",
        "Capacity Of Rubble Processing Plants (1000t)",
        "Number Of Rubbish Processing Plants With Output",
        "Output Of Rubble Processing Plants (1000t)",
        "Number Of Asphalt Mixing Plants",
        "Input Of Asphalt Mixing Plants (1000t)",
    ]),
    "Read Lines": [8, 31],
    "To Translate": {"Plant System": 1},
    "NA": [".", "x"],
    "to_numeric": 2,
}

In [1]:
# Scratch check: dict.items() yields (key, value) pairs in insertion order.
a = dict([(1, 2), (3, 4)])
for k, v in a.items():
    print(f"{k} {v}")

1 2
3 4


In [3]:
def waste_processing_clean(clean_guide):
    """Load one raw ``;``-separated CSV export and return it as a cleaned DataFrame.

    Parameters
    ----------
    clean_guide : dict
        Description of the file to clean. Keys that are read:

        * ``"filename"`` - path passed to ``pd.read_csv``.
        * ``"Columns"`` - column names as a single ``", "``-separated string.
        * ``"Read Lines"`` - ``[first, last]`` 1-based, inclusive line numbers
          of the data rows; everything before ``first`` is skipped.
        * ``"NA"`` - value(s) passed to ``pd.read_csv`` as ``na_values``.
        * ``"to_numeric"`` - index of the first column that is run through
          ``to_float``; all later columns are converted as well.
        * ``"To Translate"`` (optional) - ``{column: flag}``. Each listed column
          is translated with the module-level ``translator``; a truthy flag
          additionally title-cases the translated text.

    Returns
    -------
    pandas.DataFrame
        The requested rows with the configured column names, numeric columns
        converted and the requested columns translated.

    Notes
    -----
    Translation goes through ``translator.translate`` (one call per distinct
    non-null value of each translated column). Missing values are left
    untouched instead of being sent to the translator.
    """
    column_names = clean_guide["Columns"].split(", ")
    read_lines = clean_guide["Read Lines"]
    skiprows = read_lines[0] - 1
    # last - (first - 1) == number of lines in the inclusive range.
    nrows = read_lines[1] - skiprows

    df = pd.read_csv(
        clean_guide["filename"],
        sep=";",
        encoding="latin-1",
        header=None,
        names=column_names,
        skiprows=skiprows,
        nrows=nrows,
        na_values=clean_guide["NA"],
    )

    # Convert decimal-comma strings in the numeric columns.
    for col in column_names[clean_guide["to_numeric"]:]:
        df[col] = df[col].apply(to_float)

    for col, title_case in (clean_guide.get("To Translate") or {}).items():
        # Translate each distinct value once; the previous per-row variant
        # issued one request per row for columns whose flag was falsy.
        mappings = {}
        for value in df[col].dropna().unique():
            text = translator.translate(value).text
            mappings[value] = text.title() if title_case else text
        # Series.map leaves NaN as NaN, so missing cells pass through.
        df[col] = df[col].map(mappings)

    return df


In [4]:
# Clean the file described by d1 (table 32141-0003, "Asphalt mixing plants.csv").
waste_df1 = waste_processing_clean(d1)
# Bare name as the last expression so the notebook renders the frame.
waste_df1

Unnamed: 0,Year,EAV Code,Waste Source,Number Of Asphalt Mixing Plants,Input Of Asphalt Mixing Plants (1000t)
0,2006,EAV-05,"Waste from petroleum refining, natural gas pur...",,
1,2006,EAV-050117-U,Petroleum refining: bitumen,,
2,2006,EAV-10,Waste from thermal processes,,
3,2006,EAV-100908-U,Casting molds and sand after casting (without ...,,
4,2006,EAV-17,Construction and demolition waste,386.0,6107.5
...,...,...,...,...,...
155,2020,EAV-19120904-U,products for use in asphalt mixing plants,331.0,9951.5
156,2020,EAV-19120905-U,products for other uses,,
157,2020,EAV-19120906-U,Hot mix for road and path construction,,
158,2020,EAV-99,Hazardous waste,,


In [7]:
# Write the cleaned d1 frame to a relative path whose name embeds the table id
# and the original file name; the index is not written.
cleaned_name_1 = "cleaned_{}_Waste-Processing-Facilities-{}".format(d1["Table"], d1["filename"])
waste_df1.to_csv(cleaned_name_1, index=False)

In [10]:
# Clean the file described by d2 (table 32141-0002, "Building rubble processing plants.csv").
waste_df2 = waste_processing_clean(d2)
# Bare name as the last expression so the notebook renders the frame.
waste_df2

Unnamed: 0,Year,EAV Code,Waste Source,Number Of Construction Rubble Processing Plants,Input Of Construction Rubble Processing Plants (1000t),Number Of Construction Rubble Processing Plants With Output,Output Of Construction Rubble Processing Plants (1000t)
0,2006,EAV-01,Waste from the extraction of mineral resources,,,,
1,2006,EAV-010413-U,Waste from stonemasonry and sawing work (exclu...,,,,
2,2006,EAV-02,"Waste from agriculture, forestry, etc.",,,,
3,2006,EAV-020401-U,Sugar production: beet soil,,,,
4,2006,EAV-03,"Waste from wood processing, etc.",,,,
...,...,...,...,...,...,...,...
1035,2020,EAV-200306-U,Waste from sewer cleaning,7.0,4.9,,
1036,2020,EAV-200307-U,Bulky waste,3.0,4.8,,
1037,2020,EAV-200399-U,Municipal waste n.e.c.,,,,
1038,2020,EAV-99,Hazardous waste,165.0,1059.1,104.0,553.1


In [11]:
# Sanity check: column dtypes and non-null counts after cleaning.
waste_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 7 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Year                                                         1040 non-null   int64  
 1   EAV Code                                                     1040 non-null   object 
 2   Waste Source                                                 1040 non-null   object 
 3   Number Of Construction Rubble Processing Plants              396 non-null    float64
 4   Input Of Construction Rubble Processing Plants (1000t)       396 non-null    float64
 5   Number Of Construction Rubble Processing Plants With Output  325 non-null    float64
 6   Output Of Construction Rubble Processing Plants (1000t)      325 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 57.0+ KB


In [12]:
# Write the cleaned d2 frame to a relative path whose name embeds the table id
# and the original file name; the index is not written.
cleaned_name_2 = "cleaned_{}_Waste-Processing-Facilities-{}".format(d2["Table"], d2["filename"])
waste_df2.to_csv(cleaned_name_2, index=False)

In [7]:
# Clean the file described by d3 (table 32141-0004, per-federal-state figures).
waste_df3 = waste_processing_clean(d3)
# Bare name as the last expression so the notebook renders the frame.
waste_df3

Unnamed: 0,Year,State,Number Of Rubbish Processing Plants,Input Of Rubble Processing Plants (1000t),Capacity Of Rubble Processing Plants (1000t),Number Of Rubbish Processing Plants With Output,Output Of Rubble Processing Plants (1000t),Number Of Asphalt Mixing Plants,Input Of Asphalt Mixing Plants (1000t)
0,2006,Baden-Württemberg,239.0,6711.3,9028.2,,6808.6,60.0,1234.6
1,2006,Bayern,589.0,8146.7,13356.4,,8123.5,133.0,2668.8
2,2006,Berlin,24.0,2268.5,3013.9,,2390.2,5.0,89.8
3,2006,Brandenburg,120.0,5167.6,11382.7,,5731.2,39.0,447.9
4,2006,Bremen,4.0,148.4,216.7,,147.7,3.0,98.1
...,...,...,...,...,...,...,...,...,...
131,2020,Saxony,207.0,4759.4,,,4726.0,36.0,562.5
132,2020,Saxony-Anhalt,94.0,2583.2,,,2558.5,22.0,415.3
133,2020,Schleswig-Holstein,151.0,2548.7,,,2532.4,10.0,317.9
134,2020,Thuringia,74.0,1004.2,,,1007.3,16.0,273.4


In [8]:
# Sanity check: column dtypes and non-null counts after cleaning.
waste_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 9 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Year                                             136 non-null    int64  
 1   State                                            136 non-null    object 
 2   Number Of Rubbish Processing Plants              104 non-null    float64
 3   Input Of Rubble Processing Plants (1000t)        104 non-null    float64
 4   Capacity Of Rubble Processing Plants (1000t)     56 non-null     float64
 5   Number Of Rubbish Processing Plants With Output  0 non-null      float64
 6   Output Of Rubble Processing Plants (1000t)       104 non-null    float64
 7   Number Of Asphalt Mixing Plants                  120 non-null    float64
 8   Input Of Asphalt Mixing Plants (1000t)           116 non-null    float64
dtypes: float64(7), int64(1), object(1)

In [9]:
# Write the cleaned d3 frame to a relative path whose name embeds the table id
# and the original file name; the index is not written.
cleaned_name_3 = "cleaned_{}_Waste-Processing-Facilities-{}".format(d3["Table"], d3["filename"])
waste_df3.to_csv(cleaned_name_3, index=False)

In [16]:
# Clean the file described by d4 (table 32141-0001, figures per plant system).
waste_df4 = waste_processing_clean(d4)
# Bare name as the last expression so the notebook renders the frame.
waste_df4

Unnamed: 0,Year,Plant System,Number Of Rubbish Processing Plants,Input Of Rubble Processing Plants (1000t),Capacity Of Rubble Processing Plants (1000t),Number Of Rubbish Processing Plants With Output,Output Of Rubble Processing Plants (1000t),Number Of Asphalt Mixing Plants,Input Of Asphalt Mixing Plants (1000t)
0,2006,Stationary And Semi-Mobile Systems,683,27311.9,64840.7,,27796.8,522,9382.4
1,2006,Mobile Systems,1353,33076.4,33076.4,,32527.2,27,405.2
2,2006,In Total,2036,60388.3,97917.1,,60324.0,549,9787.6
3,2008,Stationary And Semi-Mobile Systems,716,29093.3,62476.5,,29127.4,532,10390.6
4,2008,Mobile Systems,1339,34935.8,34935.8,,34261.1,20,180.9
5,2008,In Total,2055,64029.2,97412.3,,63388.5,552,10571.5
6,2010,Stationary And Semi-Mobile Systems,746,29667.7,73830.4,,28028.1,534,10033.1
7,2010,Mobile Systems,1327,32860.5,32860.4,,32410.5,18,416.9
8,2010,In Total,2073,62528.2,106690.7,,60438.6,552,10450.0
9,2012,Stationary And Semi-Mobile Systems,779,29459.3,73240.1,,28646.8,514,11709.4


In [17]:
# Sanity check: column dtypes and non-null counts after cleaning.
waste_df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 9 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Year                                             24 non-null     int64  
 1   Plant System                                     24 non-null     object 
 2   Number Of Rubbish Processing Plants              24 non-null     int64  
 3   Input Of Rubble Processing Plants (1000t)        24 non-null     float64
 4   Capacity Of Rubble Processing Plants (1000t)     24 non-null     float64
 5   Number Of Rubbish Processing Plants With Output  0 non-null      float64
 6   Output Of Rubble Processing Plants (1000t)       24 non-null     float64
 7   Number Of Asphalt Mixing Plants                  24 non-null     int64  
 8   Input Of Asphalt Mixing Plants (1000t)           24 non-null     float64
dtypes: float64(5), int64(3), object(1)


In [None]:
# Write the cleaned d4 frame to a relative path whose name embeds the table id
# and the original file name; the index is not written.
cleaned_name_4 = "cleaned_{}_Waste-Processing-Facilities-{}".format(d4["Table"], d4["filename"])
waste_df4.to_csv(cleaned_name_4, index=False)