In [18]:
# Importing Dependencies
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# **Data Cleaning Exploration**
### **Data Ingestion**


In [19]:
# Load the data
job_postings = pd.read_csv('../../../../data/job_postings.csv')

In [20]:
job_postings


Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12212,https://uk.linkedin.com/jobs/view/data-reporti...,2024-01-21 07:11:22.099082+00,Finished NER,t,t,f,"Data Reporting Manager, FOOTBALL ASSOCIATION",Guardian Jobs,"Wembley, England, United Kingdom",2024-01-16,High Wycombe,United Kingdom,Manager Forms Analysis,Mid senior,Onsite
12213,https://www.linkedin.com/jobs/view/corporate-a...,2024-01-19 15:10:41.177008+00,Finished NER,t,t,f,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.","Kalispell, MT",2024-01-14,Montana,United States,Teller,Mid senior,Onsite
12214,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:20:19.036168+00,Finished NER,t,t,f,Senior Data Scientist,Highnote,"San Francisco, CA",2024-01-16,San Rafael,United States,Mathematician,Mid senior,Onsite
12215,https://www.linkedin.com/jobs/view/senior-data...,2024-01-19 23:25:28.107523+00,Finished NER,t,t,f,Senior Data Engineer,CompSource Mutual Insurance Company,"Oklahoma City, OK",2024-01-16,Arcadia,United States,Protection Engineer,Mid senior,Onsite


### **Removing Null or Missing Rows**

In [21]:
# Eliminate rows with null or missing values
cleaned_job_postings = job_postings.dropna()


### **Remove Non US data**

In [22]:
# Define a list of Valid US State Abbreviations
valid_states = {
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
}

# define empty lists to store the city and state
cities = []
states = []

In [23]:
# iterate through the job_location column
for location in cleaned_job_postings['job_location']:
    city_state = location.split(', ')
    
    if len(city_state) == 2 and city_state[1] in valid_states:
        cities.append(city_state[0])
        states.append(city_state[1])
    else:
        cities.append(None)
        states.append(None)
        
# add the cities and states columns to the dataframe
cleaned_job_postings['city'] = cities
cleaned_job_postings['state'] = states

# drop the job_location column
cleaned_job_postings = cleaned_job_postings.drop(columns=['job_location'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_job_postings['city'] = cities
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_job_postings['state'] = states


In [24]:
cleaned_job_postings

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,first_seen,search_city,search_country,search_position,job_level,job_type,city,state
0,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-21 08:08:48.031964+00,Finished NER,t,t,f,Senior Machine Learning Engineer,Jobs for Humanity,2024-01-14,East Haven,United States,Agricultural-Research Engineer,Mid senior,Onsite,New Haven,CT
1,https://www.linkedin.com/jobs/view/principal-s...,2024-01-20 04:02:12.331406+00,Finished NER,t,t,f,"Principal Software Engineer, ML Accelerators",Aurora,2024-01-14,El Cerrito,United States,Set-Key Driver,Mid senior,Onsite,San Francisco,CA
2,https://www.linkedin.com/jobs/view/senior-etl-...,2024-01-21 08:08:31.941595+00,Finished NER,t,t,f,Senior ETL Data Warehouse Specialist,Adame Services LLC,2024-01-14,Middletown,United States,Technical Support Specialist,Associate,Onsite,New York,NY
3,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:30:55.796572+00,Finished NER,t,t,f,Senior Data Warehouse Developer / Architect,Morph Enterprise,2024-01-12,Lebanon,United States,Architect,Mid senior,Onsite,Harrisburg,PA
4,https://www.linkedin.com/jobs/view/lead-data-e...,2024-01-21 08:08:58.312124+00,Finished NER,t,t,f,Lead Data Engineer,Dice,2024-01-14,McKinney,United States,Maintenance Data Analyst,Mid senior,Onsite,Plano,TX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12212,https://uk.linkedin.com/jobs/view/data-reporti...,2024-01-21 07:11:22.099082+00,Finished NER,t,t,f,"Data Reporting Manager, FOOTBALL ASSOCIATION",Guardian Jobs,2024-01-16,High Wycombe,United Kingdom,Manager Forms Analysis,Mid senior,Onsite,,
12213,https://www.linkedin.com/jobs/view/corporate-a...,2024-01-19 15:10:41.177008+00,Finished NER,t,t,f,Corporate AML Alert Investigation Specialist,"Glacier Bancorp, Inc.",2024-01-14,Montana,United States,Teller,Mid senior,Onsite,Kalispell,MT
12214,https://www.linkedin.com/jobs/view/senior-data...,2024-01-20 15:20:19.036168+00,Finished NER,t,t,f,Senior Data Scientist,Highnote,2024-01-16,San Rafael,United States,Mathematician,Mid senior,Onsite,San Francisco,CA
12215,https://www.linkedin.com/jobs/view/senior-data...,2024-01-19 23:25:28.107523+00,Finished NER,t,t,f,Senior Data Engineer,CompSource Mutual Insurance Company,2024-01-16,Arcadia,United States,Protection Engineer,Mid senior,Onsite,Oklahoma City,OK
