In [1]:
import pandas as pd 
import numpy as np

from googletrans import Translator
# Shared googletrans client; clean_data() calls translator.translate(...) on column values.
translator = Translator()

def to_float(x):
    """Convert a German-formatted decimal string to ``float``.

    Strings that contain a decimal comma (``"4544,0"``) are parsed as numbers.
    Dots in such a string are treated as thousands separators and removed
    first, so ``"1.234,5"`` becomes ``1234.5`` instead of raising.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    float or object
        The parsed number when ``x`` is a string containing a comma;
        otherwise ``x`` unchanged (numbers, missing values and comma-free
        strings pass straight through).

    Raises
    ------
    ValueError
        If ``x`` contains a comma but is still not a valid number after
        normalisation.
    """
    if isinstance(x, str) and "," in x:
        # Drop thousands separators before turning the decimal comma into a
        # point; otherwise "1.234,5" would become the unparsable "1.234.5".
        return float(x.replace(".", "").replace(",", "."))
    return x

In [2]:
# Cleaning guide consumed by clean_data(): packaging disposal broken down by state.
d1 = {
    "filename": "Collection-transport-packing by states.csv",
    "Table": "32131-0003",
    # Column names are stored as one ", "-separated string; clean_data() splits it.
    "Columns": ", ".join(
        [
            "Year",
            "State",
            "Disposal Of Packaging From Sorting systems (in-house and third-party)",
            "Disposal Of Packaging From Recycling companies (including waste materials trading)",
            "Total Disposal Of Packaging",
        ]
    ),
    # 1-based first and last line of the data rows in the raw file.
    "Read Lines": [8, 327],
    # Column -> flag; a truthy flag also title-cases the translated text.
    "To Translate": {"State": 1},
    # Marker that is read as a missing value.
    "NA": "-",
    # Index of the first column that goes through to_float().
    "to_numeric": 2,
}
# Cleaning guide consumed by clean_data(): packaging disposal broken down by packaging type.
d2 = {
    "filename": "waste collection - whereabouts of the packaging.csv",
    "Table": "32131-0002",
    # Column names are stored as one ", "-separated string; clean_data() splits it.
    "Columns": ", ".join(
        [
            "Year",
            "Packaging Description",
            "Disposal Of Packaging From Sorting systems (in-house and third-party)",
            "Disposal Of Packaging From Recycling Companies (including waste materials trading)",
            "Total Disposal Of Packaging",
        ]
    ),
    # 1-based first and last line of the data rows in the raw file.
    "Read Lines": [9, 188],
    # Column -> flag; a truthy flag also title-cases the translated text.
    "To Translate": {"Packaging Description": 1},
    # Marker that is read as a missing value.
    "NA": "-",
    # Index of the first column that goes through to_float().
    "to_numeric": 2,
}
# Cleaning guide consumed by clean_data(): yearly totals, nothing to translate.
d3 = {
    "filename": "waste collection-transport-packing.csv",
    "Table": "32131-0001",
    # Column names are stored as one ", "-separated string; clean_data() splits it.
    "Columns": ", ".join(
        [
            "Year",
            "Collected Transport And Secondary Packaging",
            "Number Of Collectors",
        ]
    ),
    # 1-based first and last line of the data rows in the raw file.
    "Read Lines": [8, 27],
    # None disables the translation step.
    "To Translate": None,
    # Marker that is read as a missing value.
    "NA": ".",
    # Index of the first column that goes through to_float().
    "to_numeric": 1,
}

In [3]:
def clean_data(clean_guide):
    """Read one raw semicolon-separated CSV and return a cleaned DataFrame.

    Parameters
    ----------
    clean_guide : dict
        Cleaning instructions with these keys:

        - ``"filename"``: path of the CSV file to read.
        - ``"Columns"``: column names as a single ``", "``-separated string.
        - ``"Read Lines"``: ``[first, last]`` 1-based line numbers of the data
          rows; everything before ``first`` is skipped and nothing after
          ``last`` is read.
        - ``"NA"``: extra marker passed to ``read_csv`` as ``na_values``.
        - ``"to_numeric"``: index of the first column whose values are passed
          through ``to_float``; all later columns are converted as well.
        - ``"To Translate"``: ``None`` or a mapping ``column -> flag``. Each
          listed column is translated with the module-level ``translator``;
          a truthy flag additionally title-cases the translated text.

        Other keys (for example ``"Table"``) are not used here.

    Returns
    -------
    pandas.DataFrame
        The selected rows with numeric columns converted and the requested
        columns translated. Missing values stay missing.
    """
    filename = clean_guide["filename"]
    column_names = clean_guide["Columns"].split(", ")
    read_lines = clean_guide["Read Lines"]
    skiprows = read_lines[0] - 1
    # Both bounds are inclusive, so the row count is last - (first - 1).
    nrows = read_lines[1] - skiprows
    na_values = clean_guide["NA"]
    to_translate = clean_guide["To Translate"]

    df = pd.read_csv(
        filename,
        sep=";",
        encoding="latin-1",
        header=None,
        names=column_names,
        skiprows=skiprows,
        nrows=nrows,
        na_values=na_values,
    )

    # Convert decimal-comma strings to floats from the configured column onwards.
    for col in column_names[clean_guide["to_numeric"]:]:
        df[col] = df[col].map(to_float)

    if to_translate:
        for col, title_case in to_translate.items():
            # Translate every distinct value once instead of once per row, and
            # skip missing values: the translator only accepts text.
            unique_vals = df[col].dropna().unique()
            mappings = {val: translator.translate(val).text for val in unique_vals}
            # Values without a mapping (only the missing ones) remain NaN.
            df[col] = df[col].map(mappings)
            if title_case:
                df[col] = df[col].str.title()

    return df

In [4]:
# Clean the table described by d1 and display the resulting frame.
waste_df1 = clean_data(d1)
waste_df1

Unnamed: 0,Year,State,Disposal Of Packaging From Sorting systems (in-house and third-party),Disposal Of Packaging From Recycling companies (including waste materials trading),Total Disposal Of Packaging
0,2001,Baden-Württemberg,265.4,310.5,575.9
1,2001,Bayern,154.2,549.2,703.3
2,2001,Berlin,102.1,36.2,138.3
3,2001,Brandenburg,90.8,35.9,126.7
4,2001,Bremen,7.5,19.5,27.0
...,...,...,...,...,...
315,2020,Saarland,43.9,22.1,66.0
316,2020,Saxony,79.5,120.1,199.6
317,2020,Saxony-Anhalt,45.5,49.9,95.4
318,2020,Schleswig-Holstein,21.6,56.9,78.5


In [5]:
# Check dtypes and non-null counts of the cleaned frame.
waste_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 5 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   Year                                                                                320 non-null    int64  
 1   State                                                                               320 non-null    object 
 2   Disposal Of Packaging From Sorting systems (in-house and third-party)               320 non-null    float64
 3   Disposal Of Packaging From Recycling companies (including waste materials trading)  320 non-null    float64
 4   Total Disposal Of Packaging                                                         320 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 12.6+ KB


In [6]:
# Save the cleaned frame without the row index; the output name embeds the
# table id and the original filename from d1 (relative path).
waste_df1.to_csv(f"cleaned_{d1['Table']}_Waste-Collection-Transport-Packing-{d1['filename']}", index=False)

In [7]:
# Clean the table described by d2 and display the resulting frame.
waste_df2 = clean_data(d2)
waste_df2

Unnamed: 0,Year,Packaging Description,Disposal Of Packaging From Sorting systems (in-house and third-party),Disposal Of Packaging From Recycling Companies (including waste materials trading),Total Disposal Of Packaging
0,2001,Packaging For Non-Harmful Filling Goods Made O...,22.8,55.7,78.5
1,2001,"Pack.F.N.Harmful.Filling Goods A. Paper, Cardb...",1327.7,1790.1,3117.8
2,2001,Packaging For Non-Hazardous Metal Fillings,39.4,75.3,114.7
3,2001,Pack.F.N.Harmful.Filling Goods Made Of Plastic,108.4,127.8,236.2
4,2001,Packaging For Non-Polluting Goods Made From Wood,162.0,259.5,421.6
...,...,...,...,...,...
175,2020,Packaging For Non-Polluting Goods Made From Wood,208.1,318.0,526.1
176,2020,Packaging For Non-Toxic Filling Goods From Com...,39.7,18.4,58.1
177,2020,Packaging For Harmful Substances And Filling G...,418.1,190.7,608.8
178,2020,Packaging For Filling Goods Containing Pollutants,5.8,7.4,13.2


In [8]:
# Check dtypes and non-null counts of the cleaned frame.
waste_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 5 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   Year                                                                                180 non-null    int64  
 1   Packaging Description                                                               180 non-null    object 
 2   Disposal Of Packaging From Sorting systems (in-house and third-party)               180 non-null    float64
 3   Disposal Of Packaging From Recycling Companies (including waste materials trading)  180 non-null    float64
 4   Total Disposal Of Packaging                                                         180 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 7.2+ KB


In [9]:
# Save the cleaned frame without the row index; the output name embeds the
# table id and the original filename from d2 (relative path).
waste_df2.to_csv(f"cleaned_{d2['Table']}_Waste-Collection-Transport-Packing-{d2['filename']}", index=False)

In [14]:
# Clean the table described by d3 (no translation step) and display the resulting frame.
waste_df3 = clean_data(d3)
waste_df3

Unnamed: 0,Year,Collected Transport And Secondary Packaging,Number Of Collectors
0,2001,4544.0,
1,2002,4554.9,1629.0
2,2003,4480.1,1616.0
3,2004,4646.9,1600.0
4,2005,4685.1,1526.0
5,2006,4571.7,1494.0
6,2007,4542.3,1470.0
7,2008,4245.9,1419.0
8,2009,4138.8,1337.0
9,2010,4263.8,1321.0


In [15]:
# Save the cleaned frame without the row index; the output name embeds the
# table id and the original filename from d3 (relative path).
waste_df3.to_csv(f"cleaned_{d3['Table']}_Waste-Collection-Transport-Packing-{d3['filename']}", index=False)