In [1]:
import pandas as pd 
import numpy as np

from googletrans import Translator
# Shared translation client; clean_data calls translator.translate(...).text on column values.
translator = Translator()

def to_float(x):
    """Convert a numeric string, optionally written with a decimal comma, to ``float``.

    Parameters
    ----------
    x : object
        A single cell value. Only ``str`` instances are converted.

    Returns
    -------
    object
        A ``float`` when ``x`` is a string that parses as a number
        (``"746,9"`` -> ``746.9``, ``"69"`` -> ``69.0``); otherwise ``x``
        unchanged (non-strings and non-numeric strings without a comma).

    Raises
    ------
    ValueError
        If a string containing a comma still does not parse after the comma
        is replaced by a dot (e.g. ``"1.234,5"``).
    """
    if not isinstance(x, str):
        return x
    if "," in x:
        # Decimal comma: malformed values keep failing loudly instead of
        # slipping through as text.
        return float(x.replace(",", "."))
    try:
        # Plain numeric strings used to stay ``str``, which left columns with
        # a mix of str and float values.
        return float(x)
    except ValueError:
        return x

In [2]:
def clean_data(clean_guide):
    """Read one semicolon-separated CSV export and clean it as described by ``clean_guide``.

    Parameters
    ----------
    clean_guide : dict
        Cleaning recipe with the keys:

        * ``"filename"`` - path of the CSV file (read as latin-1, ``;``-separated,
          without a header row).
        * ``"Columns"`` - column names joined by ``", "``, in file order.
        * ``"Read Lines"`` - ``[first, last]``; ``first - 1`` lines are skipped
          and ``last - first + 1`` rows are read.
        * ``"NA"`` - additional strings to treat as missing values.
        * ``"To Translate"`` - mapping ``column -> flag`` of columns to
          translate; a truthy flag additionally title-cases the translated
          text. May be empty or ``None``.
        * ``"to_numeric"`` - position of the first column passed through
          ``to_float``; every column from there to the end is converted.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the numeric conversion and translations applied.
    """
    column_names = clean_guide["Columns"].split(", ")
    read_lines = clean_guide["Read Lines"]
    skiprows = read_lines[0] - 1
    nrows = read_lines[1] - skiprows

    df = pd.read_csv(
        clean_guide["filename"],
        sep=";",
        encoding="latin-1",
        header=None,
        names=column_names,
        skiprows=skiprows,
        nrows=nrows,
        na_values=clean_guide["NA"],
    )

    # Convert to numeric
    for col in column_names[clean_guide["to_numeric"]:]:
        df[col] = df[col].apply(to_float)

    for col, title_case in (clean_guide["To Translate"] or {}).items():
        # Translate every distinct value once (one remote call per value, not
        # per row). Missing values are never sent to the translator and stay
        # missing after the mapping.
        mappings = {}
        for value in df[col].dropna().unique():
            translated = translator.translate(value).text
            mappings[value] = translated.title() if title_case else translated
        df[col] = df[col].map(mappings)

    return df

In [3]:
# Cleaning guides consumed by clean_data, one dict per source table.
# Keys are listed in the same order in every guide. The "pivot" flag is not
# read by clean_data in this part of the notebook.

# Table 32136-0010
d1 = {
    "filename": "Sales Packaging-companies-end consumers-federal states-years.csv",
    "Table": "32136-0010",
    "Columns": "Year, State, Company, Sales packaging. v. private end users (1000t), Sales packaging. v. private end use per inhabitant (kg)",
    "Read Lines": [7, 182],
    "NA": [".", "-", "x"],
    "To Translate": {"State": 1},
    "to_numeric": 2,
}

# Table 32136-0002
d2 = {
    "filename": "Sales Packaging-companies-end consumers-years-return types.csv",
    "Table": "32136-0002",
    "Columns": "Year, Return Method, Unit, Sales Packaging After Sorting (Industrial Solutions), Sales Packaging After Sorting (System Operators), Overall Sales Packaging After Sorting",
    "Read Lines": [9, 63],
    "NA": [".", "x", "-"],
    "To Translate": {"Return Method": 0, "Unit": 1},
    "to_numeric": 3,
}

# Table 32136-0001
d3 = {
    "filename": "Sales Packaging-companies-end consumers-years.csv",
    "Table": "32136-0001",
    "Columns": "KPI, Unit, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020",
    "Read Lines": [8, 12],
    "NA": [".", "-", "x"],
    "To Translate": {"KPI": 0},
    "to_numeric": 2,
    "pivot": True,
}

# Table 32136-0011
d4 = {
    "filename": "Sales Packaging-end consumers-federal states-years-type of sales.csv",
    "Table": "32136-0011",
    "Columns": "Year, State, Mixed packaging, Packaging made of paper & cardboard & carton, Mixed glass (colored & mixed glass), Glass collected separately in color, Plastics collected separately, Metals collected separately, Composites collected separately, Total",
    "Read Lines": [8, 183],
    "NA": [".", "-"],
    "To Translate": {"State": 1},
    "to_numeric": 2,
    "pivot": True,
}

# Table 32136-0003
d5 = {
    "filename": "Sales Packaging-end consumers-years-return types-type of sales packaging.csv",
    "Table": "32136-0003",
    "Columns": "Year, Sales packaging. v. private end consumers, Industry solutions (1000 t), System operators (1000 t), Overall (1000 t)",
    "Read Lines": [10, 97],
    "NA": [".", "-"],
    "To Translate": {"Sales packaging. v. private end consumers": 0},
    "to_numeric": 2,
    "pivot": True,
}

# Table 32136-0006
d6 = {
    "filename": "Sales Packaging-sorted-years-return types-recycling types.csv",
    "Table": "32136-0006",
    "Columns": "Year, Recycling Type, Sales packaging after sorting - Industry solutions (1000t), Sales packaging after sorting - System operators (1000t), Overall sales packaging after sorting (1000t)",
    "Read Lines": [9, 74],
    "NA": [".", "-"],
    "To Translate": {"Recycling Type": 0},
    "to_numeric": 2,
    "pivot": True,
}
# Cleaning guide for table 32136-0005.
d7 = {
    "filename": "Sales Packaging-years-types of material-types of recycling.csv",
    "Table": "32136-0005",
    "Columns": "Year, Recycling Material, Sales packaging after sorting (1000 t) - Material recycling, Sales packaging after sorting (1000 t) - Other forms of material recycling, Sales packaging after sorting (1000 t) - Energy recycling, Sales packaging after sorting (1000 t) - Other forms of recycling, Sales packaging after sorting (1000 t) - Other disposal, Total Sales packaging after sorting (1000 t)",
    "Read Lines": [9, 107],
    "NA": [".", "-"],
    # The key must name a column listed in "Columns"; this table's text column
    # is "Recycling Material" ("Recycling Type" made clean_data raise KeyError).
    "To Translate": {"Recycling Material": 0},
    "to_numeric": 2,
    "pivot": True
}
# Cleaning guide for table 32136-0004 (no "pivot" flag).
d8 = {
    "filename": "Sales Packaging-years-types of material.csv",
    "Table": "32136-0004",
    "Columns": "Year, Material Type, Sales packaging after sorting (1000t), Handing over of sales packaging. n.d. Sort of abroad (1000t)",
    "Read Lines": [8, 106],
    "NA": [".", "-"],
    "To Translate": {"Material Type": 0},
    "to_numeric": 2,
}

In [4]:
# Clean the first table using guide d1, then preview its first five rows.
waste_df1 = clean_data(clean_guide=d1)
waste_df1.head(n=5)

Unnamed: 0,Year,State,Company,Sales packaging. v. private end users (1000t),Sales packaging. v. private end use per inhabitant (kg)
0,2010,Baden-Württemberg,,746.9,69
1,2010,Bayern,,800.1,64
2,2010,Berlin,,208.1,60
3,2010,Brandenburg,,205.6,82
4,2010,Bremen,,55.1,83


In [8]:
# Drop the Company column. Reassigning with errors="ignore" (rather than an
# in-place drop) keeps this cell safe to re-run after the column is gone.
waste_df1 = waste_df1.drop(columns=["Company"], errors="ignore")

In [9]:
# Inspect dtypes and non-null counts of the remaining columns.
waste_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 4 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   Year                                                     176 non-null    int64  
 1   State                                                    176 non-null    object 
 2   Sales packaging. v. private end users (1000t)            176 non-null    float64
 3   Sales packaging. v. private end use per inhabitant (kg)  176 non-null    int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 5.6+ KB


In [None]:
# Write the cleaned table to a CSV in the working directory; the row index is not saved.
# NOTE(review): d1['filename'] already starts with "Sales Packaging", so the
# "Sales-Packaging-" prefix repeats in the output name - confirm this is intended.
cleaned_csv_name = f"cleaned_{d1['Table']}_Sales-Packaging-{d1['filename']}"
waste_df1.to_csv(cleaned_csv_name, index=False)