# 1. Data Cleaning

**Importing libraries**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import math
import matplotlib.pyplot as plt
%matplotlib inline

**Loading data**

In [2]:
# Loading datasets
data_analyst = pd.read_csv('../datasets/raw/Data Analyst.csv')
data_engineer = pd.read_csv('../datasets/raw/Data Engineer.csv')
data_scientist = pd.read_csv('../datasets/raw/Data Scientist.csv')

# Combining datasets
df = pd.concat([data_analyst, data_engineer, data_scientist], ignore_index=True)

df

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Data Analyst,Data Analyst\nSingaporeOperationsExperienced\n...,3.9,TikTok\n3.9,Singapore,"Los Angeles, CA",1001 to 5000 employees,2016,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,-1
1,1,Data Analyst,Facebook's mission is to give people the power...,4.3,Facebook\n4.3,Singapore,"Menlo Park, CA",10000+ employees,2004,Company - Public,Internet,Information Technology,$10+ billion (SGD),"Google, Microsoft, Apple"
2,2,"Lead, Market Analyst (Data)",Job Description:\nGet to Know Our Team:\nThe t...,3.7,Grab\n3.7,Singapore,"Singapore, Singapore",5001 to 10000 employees,2012,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Uber, Google, Facebook"
3,3,Data Analyst,About Our Business\n\nWant to join the global ...,3.9,Cargill\n3.9,Singapore,"Wayzata, MN",10000+ employees,1865,Company - Private,Food Production,Agriculture & Forestry,$10+ billion (SGD),-1
4,4,Data Analyst,This is Adyen\n\nWe took an unobvious approach...,4.5,Adyen\n4.5,Singapore,"Amsterdam, Netherlands",501 to 1000 employees,2006,Company - Public,Financial Transaction Processing,Finance,$2 to $5 billion (SGD),"Worldpay, Stripe, Ingenico Group"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,795,#SGUNITEDJOBS ANALYTICS CONSULTANT – OFFICE OF...,Attain Analytics Group is looking for highly a...,-1.0,ATTAIN ANALYTICS GROUP PTE. LTD.,Singapore,-1,-1,-1,-1,-1,-1,-1,-1
1996,796,Data Analyst â€“ New Cost Allocation,Job Description :\nAn exciting opportunity for...,-1.0,Sciente Consulting Pte. Ltd,Singapore,-1,-1,-1,-1,-1,-1,-1,-1
1997,797,Data Analyst cum Engineer,Job Description :\n20 April 2020\nGreat Opport...,-1.0,Sciente Consulting Pte. Ltd,Singapore,-1,-1,-1,-1,-1,-1,-1,-1
1998,798,Senior Business Data Analyst #SgUnitedJobs,Job Summary\nOur client within the Financial S...,-1.0,Sciente Consulting Pte. Ltd,Singapore,-1,-1,-1,-1,-1,-1,-1,-1


### 1.1 Removing duplicates and missing values

In [3]:
# Drop duplicate rows
df = df.drop_duplicates()

# Replace missing value representations of "-1" and "-1.0" to NaN
df.replace({'-1': pd.NA, '-1.0': pd.NA}, inplace=True)
# Drop rows only if all the values in the row is "NaN" or the 'Job Description' column in the row has value "NaN"
df = df.dropna(how='all')
df = df.dropna(subset=['Job Description']) 
# Replace missing value representations from NaN to "-"
df = df.fillna('Unknown')
df[['Rating', 'Founded']] = df[['Rating', 'Founded']].astype(str).replace(['-1', '-1.0'], 'Unknown') # Convert columns with float and int data type to string for replacement

# Check and ensure that no missing values are represented as NaN
missing_values = df.isnull().sum()
print(missing_values)

df

Unnamed: 0           0
Job Title            0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
dtype: int64


Unnamed: 0.1,Unnamed: 0,Job Title,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Data Analyst,Data Analyst\nSingaporeOperationsExperienced\n...,3.9,TikTok\n3.9,Singapore,"Los Angeles, CA",1001 to 5000 employees,2016,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,Unknown
1,1,Data Analyst,Facebook's mission is to give people the power...,4.3,Facebook\n4.3,Singapore,"Menlo Park, CA",10000+ employees,2004,Company - Public,Internet,Information Technology,$10+ billion (SGD),"Google, Microsoft, Apple"
2,2,"Lead, Market Analyst (Data)",Job Description:\nGet to Know Our Team:\nThe t...,3.7,Grab\n3.7,Singapore,"Singapore, Singapore",5001 to 10000 employees,2012,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Uber, Google, Facebook"
3,3,Data Analyst,About Our Business\n\nWant to join the global ...,3.9,Cargill\n3.9,Singapore,"Wayzata, MN",10000+ employees,1865,Company - Private,Food Production,Agriculture & Forestry,$10+ billion (SGD),Unknown
4,4,Data Analyst,This is Adyen\n\nWe took an unobvious approach...,4.5,Adyen\n4.5,Singapore,"Amsterdam, Netherlands",501 to 1000 employees,2006,Company - Public,Financial Transaction Processing,Finance,$2 to $5 billion (SGD),"Worldpay, Stripe, Ingenico Group"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,795,#SGUNITEDJOBS ANALYTICS CONSULTANT – OFFICE OF...,Attain Analytics Group is looking for highly a...,Unknown,ATTAIN ANALYTICS GROUP PTE. LTD.,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1996,796,Data Analyst â€“ New Cost Allocation,Job Description :\nAn exciting opportunity for...,Unknown,Sciente Consulting Pte. Ltd,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1997,797,Data Analyst cum Engineer,Job Description :\n20 April 2020\nGreat Opport...,Unknown,Sciente Consulting Pte. Ltd,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1998,798,Senior Business Data Analyst #SgUnitedJobs,Job Summary\nOur client within the Financial S...,Unknown,Sciente Consulting Pte. Ltd,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


### 1.2 Correcting errors and inconsistencies

In [4]:
# Drop the "Unnamed: 0" column and reset the index
df.drop(columns=['Unnamed: 0'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Replace all the miscellaneous expressions in the dataframe with spaces
df = df.apply(lambda x: x.str.replace('â€¢', ' '))
df = df.apply(lambda x: x.str.replace('â€', ' '))

# Remove the ratings and websites from the back of company names
import re

def clean_company_name(company_name):
    name = company_name
    # Remove ratings
    company_name = re.sub(r'\s*\d+(\.\d+)?$', '', company_name)  # Checks for either decimal numbers or integers behind space
    # Remove websites
    company_name = re.sub(r'\s*\w+(\.com|\.co|\.sg)$', '', company_name) # Checks for the following word character configurations behind space
    # If the company name is empty after cleaning, we will revert to the original company name
    return company_name.strip() if company_name.strip() else name

df['Company Name'] = df['Company Name'].apply(clean_company_name)

df

Unnamed: 0,Job Title,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Data Analyst,Data Analyst\nSingaporeOperationsExperienced\n...,3.9,TikTok,Singapore,"Los Angeles, CA",1001 to 5000 employees,2016,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,Unknown
1,Data Analyst,Facebook's mission is to give people the power...,4.3,Facebook,Singapore,"Menlo Park, CA",10000+ employees,2004,Company - Public,Internet,Information Technology,$10+ billion (SGD),"Google, Microsoft, Apple"
2,"Lead, Market Analyst (Data)",Job Description:\nGet to Know Our Team:\nThe t...,3.7,Grab,Singapore,"Singapore, Singapore",5001 to 10000 employees,2012,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Uber, Google, Facebook"
3,Data Analyst,About Our Business\n\nWant to join the global ...,3.9,Cargill,Singapore,"Wayzata, MN",10000+ employees,1865,Company - Private,Food Production,Agriculture & Forestry,$10+ billion (SGD),Unknown
4,Data Analyst,This is Adyen\n\nWe took an unobvious approach...,4.5,Adyen,Singapore,"Amsterdam, Netherlands",501 to 1000 employees,2006,Company - Public,Financial Transaction Processing,Finance,$2 to $5 billion (SGD),"Worldpay, Stripe, Ingenico Group"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,#SGUNITEDJOBS ANALYTICS CONSULTANT – OFFICE OF...,Attain Analytics Group is looking for highly a...,Unknown,ATTAIN ANALYTICS GROUP PTE. LTD.,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1996,Data Analyst “ New Cost Allocation,Job Description :\nAn exciting opportunity for...,Unknown,Sciente Consulting Pte. Ltd,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1997,Data Analyst cum Engineer,Job Description :\n20 April 2020\nGreat Opport...,Unknown,Sciente Consulting Pte. Ltd,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1998,Senior Business Data Analyst #SgUnitedJobs,Job Summary\nOur client within the Financial S...,Unknown,Sciente Consulting Pte. Ltd,Singapore,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


**Export cleaned data**

In [5]:
df.to_csv('../datasets/processed/Cleaned_data.csv', index=False)