In [1]:
import pandas as pd

In [2]:
# Read the semicolon-separated, ISO-8859-1 encoded CSV file into a DataFrame
cars = pd.read_csv(
    'REM2030_v2015_car_info.csv',
    sep=';',
    encoding='ISO-8859-1',
)

In [3]:
# Preview the freshly loaded table: a headline followed by its first five rows
preview_rows = cars.head(5)
print("Erste 5 Zeilen von REM2030_v2015_car_info:")
display(preview_rows)

Erste 5 Zeilen von REM2030_v2015_car_info:


Unnamed: 0,id,vehicle_size,economic_sector,nace_section,economic_segment,nace_division,description_of_the_economic_sector_according_to_company,city_size,company_size,comment,vehicle_utilization,number_of_users,parking_spot,federal_state,company_id
0,1106000161,small,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,Public administration and defence. compulsory ...,84,social insurance,up to 20000,51 to 250,,,,,BW,110615
1,1106000171,large,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,Public administration and defence. compulsory ...,84,social insurance,up to 20000,51 to 250,,,,,BW,110615
2,1106000181,small,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,Public administration and defence. compulsory ...,84,social insurance,up to 20000,51 to 250,,,,,BW,110615
3,1106000191,medium,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,Public administration and defence. compulsory ...,84,social insurance,up to 20000,51 to 250,,,,,BW,110615
4,1106000201,small,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,Public administration and defence. compulsory ...,84,social insurance,up to 20000,51 to 250,,,,,BW,110615


In [4]:
# Remove the listed columns (trial setup for the next model)
columns_to_drop = [
    'economic_segment',
    'nace_division',
    'description_of_the_economic_sector_according_to_company',
    'city_size',
    'company_size',
    'comment',
    'vehicle_utilization',
    'number_of_users',
    'parking_spot',
    'federal_state',
]
# errors='ignore' skips any listed column that is not present in the frame
cars = cars.drop(columns=columns_to_drop, errors='ignore')


In [5]:
# Show the first five rows of the reduced table
display(cars.head(n=5))

Unnamed: 0,id,vehicle_size,economic_sector,nace_section,company_id
0,1106000161,small,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,110615
1,1106000171,large,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,110615
2,1106000181,small,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,110615
3,1106000191,medium,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,110615
4,1106000201,small,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,O,110615


In [6]:
def _encode_column(frame, column, mapping):
    """Return ``frame[column]`` translated through ``mapping``.

    ``Series.map`` turns every value that has no entry in ``mapping`` into
    NaN without any warning. To avoid losing data silently (for example when
    this cell is executed a second time and the column already holds the
    numeric codes), such values are detected and reported.

    Parameters:
        frame: DataFrame holding the column to encode.
        column: name of the column to encode.
        mapping: dict from original label to numeric code.

    Returns:
        The encoded Series. Values that were already missing stay missing.

    Raises:
        ValueError: if a non-missing value has no entry in ``mapping``.
    """
    original = frame[column]
    encoded = original.map(mapping)
    # Present before the mapping, missing afterwards -> label unknown to the mapping
    unmapped = original[original.notna() & encoded.isna()]
    if not unmapped.empty:
        labels = sorted(unmapped.astype(str).unique().tolist())
        raise ValueError(
            f"Column '{column}' contains values without a numeric code: {labels}. "
            "Was this cell already executed on the same DataFrame?"
        )
    return encoded


# Convert 'vehicle_size' to numeric codes
# small = 1, medium = 2, large = 3, transporter = 4, special vehicle = 5
size_mapping = {'small': 1, 'medium': 2, 'large': 3, 'transporter': 4, 'special vehicle': 5}
# Convert 'nace_section' to numeric codes   ### flagged by the author as UNNECESSARY, STEP TO BE DELETED
# Every capital letter gets a unique number (A=1, B=2, ..., Z=26)
# NOTE(review): this imposes an order on what looks like a nominal category - confirm downstream use
nace_mapping = {chr(code): code - ord('A') + 1 for code in range(ord('A'), ord('Z') + 1)}

# Encode both columns before touching `cars`, so a failure leaves the frame unchanged
encoded_vehicle_size = _encode_column(cars, 'vehicle_size', size_mapping)
encoded_nace_section = _encode_column(cars, 'nace_section', nace_mapping)

cars['vehicle_size'] = encoded_vehicle_size
print("'vehicle_size' erfolgreich in numerische Werte umgewandelt.")

cars['nace_section'] = encoded_nace_section
print("'nace_section' erfolgreich in numerische Werte umgewandelt.")

'vehicle_size' erfolgreich in numerische Werte umgewandelt.
'nace_section' erfolgreich in numerische Werte umgewandelt.


In [7]:
# Show the first five rows after the numeric encoding
display(cars.head(n=5))

Unnamed: 0,id,vehicle_size,economic_sector,nace_section,company_id
0,1106000161,1,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,15,110615
1,1106000171,3,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,15,110615
2,1106000181,1,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,15,110615
3,1106000191,2,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,15,110615
4,1106000201,1,PUBLIC ADMINISTRATION AND DEFENCE. COMPULSORY ...,15,110615


In [8]:
def _is_number(value):
    """Tell whether ``value`` is an ``int`` or ``float`` instance."""
    return isinstance(value, (int, float))


# Check that every value of the encoded columns is numeric.
# Note: NaN is a float, so missing values pass this check; they are
# reported by the missing-value check further down.
# Per column: (message if all values are numeric, message otherwise)
numeric_reports = {
    'vehicle_size': (
        "Alle Werte in 'vehicle_size' sind numerisch.",
        "Nicht-numerische Werte in 'vehicle_size' gefunden.",
    ),
    'nace_section': (
        "Alle Werte in 'nace_section' sind numerisch.",
        "Nicht-numerische Werte in 'nace_section' gefunden.",
    ),
}
for column_name, (ok_message, problem_message) in numeric_reports.items():
    all_numeric = cars[column_name].apply(_is_number).all()
    print(ok_message if all_numeric else problem_message)

# Check that every cell of the table holds a value (per-column count of missing entries)
missing_values = cars.isnull().sum()
if not missing_values.any():
    print("\nKeine fehlenden Werte gefunden.")
else:
    print("\nFehlende Werte gefunden:")
    print(missing_values)


Alle Werte in 'vehicle_size' sind numerisch.
Alle Werte in 'nace_section' sind numerisch.

Keine fehlenden Werte gefunden.


In [9]:
# Write the modified data to a new file, leaving out the DataFrame index
cleaned_csv_path = 'REM2030_car_cleaned.csv'
cars.to_csv(cleaned_csv_path, index=False)