In [13]:
import pandas as pd
import numpy as np

In [14]:
# load data
df = pd.read_csv('data/job-64bd250cb0899621267166.csv' , encoding='ISO-8859-1')
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."


## Checking for Null and Duplicates
---

### Nulls
---

In [15]:
# check if the Facilities column has some values that are Null but just filled with commas
len(df[df['Facilities'] == ',,,,'])

542

In [16]:
# check for null values
df.isnull().sum()

Company                         1
Job Title                       1
Location                        1
Job Type                        1
Experience level              236
Salary                        189
Requirment of the company       0
Facilities                      0
dtype: int64

In [17]:
# where the company name is null
df[df['Company'].isnull()]

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
797,,,,,,,",,,,,",",,,,"


In [18]:
# drop the null row where the company name is null
df.dropna(subset=['Company'], inplace=True)
df.isnull().sum()

Company                         0
Job Title                       0
Location                        0
Job Type                        0
Experience level              235
Salary                        188
Requirment of the company       0
Facilities                      0
dtype: int64

In [19]:
# replace the null values in the Experience level column with 'Not Specified'
df['Experience level'].fillna('Not-Specified', inplace=True)
# replace the null values in the Salary column with 0
df['Salary'].fillna('Negociable', inplace=True)
df.isnull().sum()

Company                       0
Job Title                     0
Location                      0
Job Type                      0
Experience level              0
Salary                        0
Requirment of the company     0
Facilities                    0
dtype: int64

In [20]:
# replace the values with ,,,, in the Facilities column with 'Not Specified'
df['Facilities'].replace(',,,,', 'Not-Specified,', inplace=True)
# remove , if it is succeeded by a , in the Facilities column
df['Facilities'] = df['Facilities'].str.replace(r',,', ',')
df['Facilities'] = df['Facilities'].str.replace(r',,,', ',')
df['Facilities'] = df['Facilities'].str.replace(r',,,,', ',')
df['Facilities'] = df['Facilities'].str.replace(r',,,', ',')
df['Facilities'] = df['Facilities'].str.replace(r',,', ',')
# remove , if it is in the end of the string in the Facilities column
df['Facilities'] = df['Facilities'].str.replace(r',$', '')
df['Facilities']

0                                          Not-Specified,
1                                          Not-Specified,
2                                     Career development,
3                                          Not-Specified,
4       Flex hours,Flex vacation,Parental leave,Unlimi...
                              ...                        
3193                                  Career development,
3194                   Equity,Medical leave,Salary bonus,
3195                                       Not-Specified,
3196              Career development,Startup environment,
3197                                       Not-Specified,
Name: Facilities, Length: 3197, dtype: object

### Duplicates
---

In [21]:
# print duplicate rows based on all columns
df[df.duplicated()].sort_values(by=['Job Title'])

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
1345,Netcentric,(Senior) Digital Analytics Engineer,"Barcelona, Spain",Full Time,Senior-level,110K+ *,"A/B testing,Angular,APIs,Computer Science,Engi...","Career development,Fitness / gym,Flex hours,Fl..."
698,Netcentric,(Senior) Digital Analytics Engineer,"Barcelona, Spain",Full Time,Senior-level,110K+ *,"A/B testing,Angular,APIs,Computer Science,Engi...","Career development,Fitness / gym,Flex hours,Fl..."
1936,Netcentric,(Senior) Digital Analytics Engineer,"Barcelona, Spain",Full Time,Senior-level,110K+ *,"A/B testing,Angular,APIs,Computer Science,Engi...","Career development,Fitness / gym,Flex hours,Fl..."
2683,Netcentric,(Senior) Digital Analytics Engineer,"Barcelona, Spain",Full Time,Senior-level,110K+ *,"A/B testing,Angular,APIs,Computer Science,Engi...","Career development,Fitness / gym,Flex hours,Fl..."
3151,Standard Bank Group,"80389998 - Engineer, Data","Johannesburg, South Africa",Full Time,Entry-level,30K+ *,"Agile,Architecture,Big Data,Data management,Da...","Startup environment,"
...,...,...,...,...,...,...,...,...
2358,Deutsche Telekom IT Solutions,Team Leader - Data Engineering (REF462H),"Budapest, Hungary",Full Time,Senior-level,115K+ *,"Agile,Consulting,Engineering,Security,,","Team events,"
1402,Roblox,"Technical Director, Machine Learning (Individu...","San Mateo, CA, United States",Full Time,Executive-level,Negociable,"Architecture,Deep Learning,Distributed Systems...","Career development,Equity,Flex hours,Flex vaca..."
1744,Charger Logistics Inc,Transportation Data Analyst Coordinator,"Santiago de Querétaro, Querétaro, Mexico",Full Time,Mid-level,65K+ *,"Computer Science,Economics,Excel,Finance,Mathe...","Competitive pay,Insurance,"
1101,Charger Logistics Inc,Transportation Data Analyst Coordinator,"Santiago de Querétaro, Querétaro, Mexico",Full Time,Mid-level,65K+ *,"Computer Science,Economics,Excel,Finance,Mathe...","Competitive pay,Insurance,"


In [22]:
# drop duplicates
df.drop_duplicates(inplace=True)
df[df.duplicated()].sort_values(by=['Job Title']).head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities


In [23]:
# export the cleaned data to a csv file
df.to_csv('data/cleaned_data.csv', index=False)