# Cleaning Enriched Bad Data

### Import statements

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the enriched bad-data workbook.
# Cells containing the text 'NaN' are read as missing values (NaN); pandas already treats
# 'NaN' as a missing-value marker by default, so na_values only makes that explicit.
# NOTE: this is an absolute, machine-specific path - adjust it when running elsewhere.
INPUT_PATH = "/Users/rasimbaghirli/Desktop/RBaghirli_DataAnalyst_Assessment/Inputs/leads_enriched_data.xlsx"
enriched_bad_data = pd.read_excel(INPUT_PATH, na_values='NaN')

# Seeded so the preview shows the same six rows on every Restart & Run All
# (same seed as the 'new_address' sample further down).
enriched_bad_data.sample(6, random_state=43)

Unnamed: 0,firma,street,plz,city,telefon,country,country code,cleaned phone number,flag,salutation,first name,surname,digit length,firma length,unique_id,new_address,new_phone_number,enrichment
41,Fahrschule Schneider GbR,Hausacherstr. 10,78054,Villingen-Schwenningen,+49/152013 5893,DE,49,1520135893.0,bad data,No data,No data,No data,10.0,24.0,bad_42,,,not enriched
98,Praxis für Ergotherapie Kathrin Kenzler,Am Margaretenhof 26,19057,Schwerin,0049176-3343742,DE,49,1763343742.0,bad data,No data,No data,No data,10.0,39.0,bad_99,"Am Margaretenhof 26, 19057 Schwerin, Deutschland",0385 3035990,enriched
356,Bauteam VilmasSamstagSonntag,Gluckstraße 21 76185 Karlsruhe,0,"Karlsruhe, Baden-Württemberg",160922232711,DE,49,160922232711.0,bad data,No data,No data,No data,12.0,28.0,bad_357,"Gluckstraße 21, 76185 Karlsruhe, Deutschland",0160 92223272,enriched
140,Bäckerei - Konditorei Dietmar Stümper,Bergstraße 33,53819,Neunkirchen Seelscheid,00491762979442,DE,49,1762979442.0,bad data,No data,No data,No data,10.0,37.0,bad_141,"Bergstraße 33, 53819 Neunkirchen-Seelscheid, D...",02247 6139,enriched
336,montagarchitekten GmbH,Kellershaustraße 12,52078,Aachen,,DE,49,,bad data,No data,No data,No data,,22.0,bad_337,"Kellershaustraße 12, 52078 Aachen, Deutschland",0241 60846480,enriched
92,"Residenz Klingberg, Inh. Andrea Wohlgemuth",Gärtnerstraße 85,23684,Scharbeutz-Klingberg,Telefon: 00 075 CALL 9473 / 173,DE,49,759473173.0,bad data,No data,No data,No data,9.0,42.0,bad_93,,,not enriched


In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes
enriched_bad_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   firma                 383 non-null    object 
 1   street                376 non-null    object 
 2   plz                   379 non-null    object 
 3   city                  378 non-null    object 
 4   telefon               334 non-null    object 
 5   country               384 non-null    object 
 6   country code          384 non-null    int64  
 7   cleaned phone number  334 non-null    object 
 8   flag                  384 non-null    object 
 9   salutation            384 non-null    object 
 10  first name            384 non-null    object 
 11  surname               384 non-null    object 
 12  digit length          334 non-null    float64
 13  firma length          383 non-null    float64
 14  unique_id             384 non-null    object 
 15  new_address           3

In [4]:
# Per-column count of missing entries before any filling is done
missing_mask = enriched_bad_data.isna()
missing_mask.sum()

firma                     1
street                    8
plz                       5
city                      6
telefon                  50
country                   0
country code              0
cleaned phone number     50
flag                      0
salutation                0
first name                0
surname                   0
digit length             50
firma length              1
unique_id                 0
new_address              80
new_phone_number        106
enrichment                0
dtype: int64

### Getting an insight over the enriched new_address column

In [5]:
# Reproducible peek at seven values of the enriched address column (seed fixed at 43)
address_column = enriched_bad_data['new_address']
address_column.sample(n=7, random_state=43)

288                 Erlenweg 5, 8604 Volketswil, Schweiz
368                                                  NaN
27     Kleiststraße, 45472 Mülheim an der Ruhr, Deuts...
364                                                  NaN
177                                                  NaN
348                 Am Thie 2, 44869 Bochum, Deutschland
59               Biberweg 20, 56566 Neuwied, Deutschland
Name: new_address, dtype: object

**[Note] As shown, each new address is a merge of street, postal code with city, and country (some values are NaN), so these parts have to be separated**

In [6]:
# Step 1: break 'new_address' apart at its first two commas (n=2 -> at most three pieces):
# piece 0 = street, piece 1 = "plz city", piece 2 = country.
# Rows without an address stay missing in every resulting column.
split_columns = enriched_bad_data['new_address'].str.split(pat=',', n=2, expand=True)

**[Note] First, I split street, plz + city, and country. Then I will split plz and city**

In [7]:
# Preview the first five rows of the split result
split_columns.head(n=5)

Unnamed: 0,0,1,2
0,Vahrenwalder Str. 141,30165 Hannover,Deutschland
1,Anton-Erhardt-Straße 5,09117 Chemnitz,Deutschland
2,Gildestraße 5,91154 Roth,Deutschland
3,Ottostraße 6,76227 Karlsruhe,Deutschland
4,,,


In [8]:
# Store the first two split pieces as working columns on the frame (missing rows carry
# over unchanged); the third piece (country) is not stored.
for new_column, piece_index in {'street_new': 0, 'city_plz_new': 1}.items():
    enriched_bad_data[new_column] = split_columns[piece_index]

***[Note] Since the country code and country columns contain no missing values, we will not create a new country column***

In [9]:
# First ten distinct "plz city" values, to inspect their raw formatting
pd.unique(enriched_bad_data['city_plz_new'])[:10]

array([' 30165 Hannover', ' 09117 Chemnitz', ' 91154 Roth',
       ' 76227 Karlsruhe', nan, ' 31629 Estorf', ' 59063 Hamm',
       ' 39418 Staßfurt', ' 17489 Greifswald', ' 70567 Stuttgart'],
      dtype=object)

**[Note] The city_plz_new values contain leading whitespace, which will be removed**

In [10]:
# Trim leading/trailing whitespace from the "plz city" values
city_plz_trimmed = enriched_bad_data['city_plz_new'].str.strip()
enriched_bad_data['city_plz_new'] = city_plz_trimmed

In [11]:
# Step 2: separate 'city_plz_new' at its first space; non-string entries (NaN) yield None.
# 'plz_new'  -> the text before the first space
# 'city_new' -> everything after the first space, or None when the value has no space
enriched_bad_data['plz_new'] = enriched_bad_data['city_plz_new'].apply(
    lambda value: value.partition(' ')[0] if isinstance(value, str) else None
)
enriched_bad_data['city_new'] = enriched_bad_data['city_plz_new'].apply(
    lambda value: value.partition(' ')[2] if isinstance(value, str) and ' ' in value else None
)
 

[Note]
* The first line of code creates a new column plz_new by applying a lambda function to the city_plz_new column. The lambda function checks if the value is a string and then splits the string by the first space, extracting the first part (presumed to be the postal code or PLZ). If the value is not a string (such as NaN), it assigns None.
  
* The second line creates a new column city_new in a similar way. It checks if the value is a string and whether the split operation results in more than one part. If so, it extracts the second part (presumed to be the city name). If the value is not a string or there is no city name, it assigns None.

In [12]:
# Backfill missing 'street' entries with the street parsed from the enriched address;
# existing 'street' values are left as they are
street_fallback = enriched_bad_data['street_new']
enriched_bad_data['street'] = enriched_bad_data['street'].fillna(value=street_fallback)

In [13]:
# Backfill missing 'plz' entries with the postal code parsed from the enriched address;
# existing 'plz' values are left as they are
plz_fallback = enriched_bad_data['plz_new']
enriched_bad_data['plz'] = enriched_bad_data['plz'].fillna(value=plz_fallback)

In [14]:
# Backfill missing 'city' entries with the city parsed from the enriched address;
# existing 'city' values are left as they are
city_fallback = enriched_bad_data['city_new']
enriched_bad_data['city'] = enriched_bad_data['city'].fillna(value=city_fallback)

In [15]:
 # Backfill missing 'cleaned phone number' entries with the scraped 'new_phone_number'.
 # NOTE(review): filled values keep the scraped formatting (e.g. '0611 609078'), and
 # 'digit length' is not recomputed for the filled rows.
phone_fallback = enriched_bad_data['new_phone_number']
enriched_bad_data['cleaned phone number'] = enriched_bad_data['cleaned phone number'].fillna(value=phone_fallback)

In [16]:
# Per-column count of missing entries after the backfill steps
missing_after_fill = enriched_bad_data.isna()
missing_after_fill.sum()

firma                     1
street                    1
plz                       0
city                      0
telefon                  50
country                   0
country code              0
cleaned phone number      9
flag                      0
salutation                0
first name                0
surname                   0
digit length             50
firma length              1
unique_id                 0
new_address              80
new_phone_number        106
enrichment                0
street_new               80
city_plz_new             80
plz_new                  80
city_new                 83
dtype: int64

***As a result of the enrichment, 41 missing values in 'cleaned phone number', 7 in 'street', 5 in 'plz', and 6 in 'city' have been filled***

### Removing unnecessary columns

In [17]:
# Drop the enriched address and the helper columns derived from it; their useful content
# has already been copied into 'street', 'plz' and 'city'
columns_to_remove = [
    'new_address',
    'street_new',
    'city_plz_new',
    'plz_new',
    'city_new',
]
enriched_bad_data = enriched_bad_data.drop(labels=columns_to_remove, axis='columns')

***[Note] Since there are a lot of unmatched values between the 'cleaned phone number' column (the initially cleaned phone numbers that did not count as valid) and the 'new_phone_number' column (the result of scraping), I will keep the 'new_phone_number' column for further investigation***

In [None]:
# Element-wise equality between the cleaned numbers and the scraped numbers
cleaned_numbers = enriched_bad_data['cleaned phone number']
scraped_numbers = enriched_bad_data['new_phone_number']
phone_numbers_match = cleaned_numbers.eq(scraped_numbers)

# Tally of True/False results. Expectation noted by the author: only 41 matches,
# i.e. the 41 rows that were filled from 'new_phone_number' above.
phone_numbers_match.value_counts()

In [19]:
# Display the frame after filling and column removal (pandas shows a truncated view)
enriched_bad_data

Unnamed: 0,firma,street,plz,city,telefon,country,country code,cleaned phone number,flag,salutation,first name,surname,digit length,firma length,unique_id,new_phone_number,enrichment
0,Autohaus Hentschel GmbH,Vahrenwalder Str. 141,30165,Hannover,+49_x001D_17_x0011_86221169,DE,49,117001186221169,bad data,No data,No data,No data,15.0,23.0,bad_1,0511 35250,enriched
1,Autohaus Siegmar GmbH,Anton-Erhardt-Straße 5,9117,Chemnitz,+_x001D__x0004_49179_x0008_9703167,DE,49,100044917900089703167,bad data,No data,No data,No data,21.0,21.0,bad_2,0371 850588,enriched
2,Autohaus Zückner GmbH & Co. KG,Gildestraße 5,91154,Roth,0049176-9142078,DE,49,1769142078,bad data,No data,No data,No data,10.0,30.0,bad_3,09171 97940,enriched
3,Auto Böhler,Ottostraße 6,76227,Karlsruhe,/179/00182_x000C__x0007_38,DE,49,17900182000000738,bad data,No data,No data,No data,17.0,11.0,bad_4,0721 409090,enriched
4,EJP Frank Bach & Katarzyna Bach GbR,Elisabethstrasse 24,2826,Görlitz,Hotline: 176-0699874 (+49),DE,49,1760699874,bad data,No data,No data,No data,10.0,35.0,bad_5,,not enriched
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,Kunz Metallbau GmbH,Adolf-Todt-Str. 28,65203,Wiesbaden,,DE,49,0611 609078,bad data,Herr,Richard,Kunz,,19.0,bad_380,0611 609078,enriched
380,Gebrüder Grüske GmbH,Meisenweg 17,82110,Germering,,DE,49,,bad data,Herr,Werner,Grüske,,20.0,bad_381,,not enriched
381,Ofenbau Unterseher GmbH,Kufsteiner Str. 49,83126,Flintsbach am Inn,,DE,49,08034 7067177,bad data,Herr,Georg,Unterseher,,23.0,bad_382,08034 7067177,enriched
382,Fastr GmbH,Kurfürstendamm 217,10719,Berlin,,DE,49,030 2363158920,bad data,Herr,Achim,Gasper,,10.0,bad_383,030 2363158920,enriched
