## Data Cleaning

### Approach

- Importing and inspecting the data sets.
- After that -> cleaning the data accordingly.
- Creating Meta-Data for all the data sets.

In [1]:
# Importing python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import seaborn as sns
import requests

#### Reading Excel-file

In [2]:
#read the sheet
df_car_price_bonus_list = pd.read_excel('cars_bonus_prices.xlsx', sheet_name = 'Table 1')
df_car_price_bonus_list = df_car_price_bonus_list.fillna(method = 'ffill')

In [3]:
#Checking null values
df_car_price_bonus_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1907 entries, 0 to 1906
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Nr.                           1907 non-null   int64  
 1   Hersteller                    1907 non-null   object 
 2   Modell                        1907 non-null   object 
 3   BAFA-
Nettolistenpreis (EUR)  1907 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 59.7+ KB


In [4]:
df_car_price_bonus_list.shape

(1907, 4)

In [5]:
# calling the dataframe to check on it's content

df_car_price_bonus_list.head(3)

Unnamed: 0,Nr.,Hersteller,Modell,BAFA-\nNettolistenpreis (EUR)
0,1,Addax Motors,Addax MT8 (MJ 23),32650.0
1,2,Addax Motors,Addax MT15n (MJ 23),36400.0
2,3,Addax Motors,Addax MTX (MJ 23),37650.0


In [6]:
# Checking for the columns 

df_car_price_bonus_list.columns

Index(['Nr.', 'Hersteller', 'Modell', 'BAFA-\nNettolistenpreis (EUR)'], dtype='object')

In [7]:
# Translating column names from German to English
# Changing the names in a pythonic way. lower case and replacing ' ' with '_'

df_car_price_bonus_list.rename(columns={'Nr.': 'no.',
                                     'Hersteller': 'manufacturer',
                                     'Modell': 'brand',
                                     'BAFA-\nNettolistenpreis (EUR)': 'price_netto'}, inplace=True)

In [8]:
# Checking the new column names
df_car_price_bonus_list.columns

Index(['no.', 'manufacturer', 'brand', 'price_netto'], dtype='object')

In [9]:
# Getting rid of leading or trailing whitespaces

df_car_price_bonus_list = df_car_price_bonus_list.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [10]:
# Checking for duplicates
duplicates = df_car_price_bonus_list.duplicated()
print('Number of duplicate entries: ', duplicates.sum())
print(duplicates)

Number of duplicate entries:  0
0       False
1       False
2       False
3       False
4       False
        ...  
1902    False
1903    False
1904    False
1905    False
1906    False
Length: 1907, dtype: bool


In [11]:
# checking duplicates in brands

duplicates = df_car_price_bonus_list.duplicated('brand')
print(duplicates)

0       False
1       False
2       False
3       False
4       False
        ...  
1902    False
1903    False
1904    False
1905    False
1906     True
Length: 1907, dtype: bool


In [12]:
df_car_price_bonus_list.tail()

Unnamed: 0,no.,manufacturer,brand,price_netto
1902,1903,Zhidou,Elaris PIO 2022,16722.69
1903,1904,Zhidou,Elaris PIO A,18403.36
1904,1905,Zhidou,KWB (M1 17kWh),19700.0
1905,1906,Zhidou,KWB (M1 27kWh),20700.0
1906,1907,Addax Motors,Addax MT8 (MJ 23),32650.0


In [13]:
df_car_price_bonus_list[df_car_price_bonus_list['brand'] == 'Addax MT8 (MJ 23)']

Unnamed: 0,no.,manufacturer,brand,price_netto
0,1,Addax Motors,Addax MT8 (MJ 23),32650.0
1906,1907,Addax Motors,Addax MT8 (MJ 23),32650.0


In [14]:
#delete duplicate row

df_car_price_bonus_list.drop(1906, axis=0, inplace= True)




In [15]:
df_car_price_bonus_list.shape

(1906, 4)

In [16]:
df_car_price_bonus_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1906 entries, 0 to 1905
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   no.           1906 non-null   int64  
 1   manufacturer  1906 non-null   object 
 2   brand         1906 non-null   object 
 3   price_netto   1906 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 59.7+ KB


In [17]:
# Counting all unique zip codes from df_zip
df_car_price_bonus_list.brand.nunique()

1906

In [18]:
#Adding new column year + price_brutto

df_car_price_bonus_list['price_brutto'] = df_car_price_bonus_list['price_netto']*1.19
df_car_price_bonus_list['year'] = 2023

In [19]:
df_car_price_bonus_list.head(2)

Unnamed: 0,no.,manufacturer,brand,price_netto,price_brutto,year
0,1,Addax Motors,Addax MT8 (MJ 23),32650.0,38853.5,2023
1,2,Addax Motors,Addax MT15n (MJ 23),36400.0,43316.0,2023
