In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lift the column-count display limit so wide frames are shown in full
pd.options.display.max_columns = None

# Load the raw job postings from the CSV file into a DataFrame
source_csv = "gsearch_jobs.csv"
job_data = pd.read_csv(source_csv)

In [3]:
# Display the first five rows of the DataFrame.
# A bare `job_data.head()` is only rendered when it is the LAST expression of
# a cell; here more statements follow, so it must be displayed explicitly.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(job_data.head())

# Display basic dataset information: per-column dtypes, (rows, columns), names
print("Data Types:\n", job_data.dtypes)
print("Dataset Shape:", job_data.shape)
print("Column Names:", job_data.columns)

# Descriptive statistics for numerical columns
print("Descriptive Statistics:\n", job_data.describe())

# Count the missing values in every column
print("Missing Values:\n", job_data.isna().sum())

# Frequency of each distinct value in the "salary_rate" column
print("Salary Rate Value Counts:\n", job_data["salary_rate"].value_counts())

# Frequency of each distinct value in the "company_name" column
print("Company Name Value Counts:\n", job_data["company_name"].value_counts())


Data Types:
 Unnamed: 0               int64
index                    int64
title                   object
company_name            object
location                object
via                     object
description             object
extensions              object
job_id                  object
thumbnail               object
posted_at               object
schedule_type           object
work_from_home          object
salary                  object
search_term             object
date_time               object
search_location         object
commute_time           float64
salary_pay              object
salary_rate             object
salary_avg             float64
salary_min             float64
salary_max             float64
salary_hourly          float64
salary_yearly          float64
salary_standardized    float64
description_tokens      object
dtype: object
Dataset Shape: (27389, 27)
Column Names: Index(['Unnamed: 0', 'index', 'title', 'company_name', 'location', 'via',
       'description',

In [4]:
# Turn "work_from_home" into a 0/1 flag: any non-missing value becomes 1,
# a missing value (NaN) becomes 0
has_remote_flag = job_data["work_from_home"].notna()
job_data["work_from_home"] = has_remote_flag.astype(int)

In [5]:
# Strip the leading "via " prefix from the "via" column.
# The pattern is anchored with ^ so only the prefix is removed: an unanchored
# replace would also delete "via " inside a name (e.g. "via Olivia Jobs" would
# become "OliJobs"). `regex=True` is passed explicitly because the default of
# this parameter differs between pandas versions.
job_data["via"] = job_data["via"].str.replace(r"^via ", "", regex=True)

In [6]:
# Columns that are removed from the working frame
columns_to_drop = [
    "Unnamed: 0",
    "index",
    "description",
    "thumbnail",
    "posted_at",
    "commute_time",
    "salary",
    "salary_hourly",
    "salary_yearly",
]

# Rebind job_data to a frame without those columns
job_data = job_data.drop(columns=columns_to_drop)


In [7]:
# Report how many rows repeat a "job_id" that already appeared earlier
duplicates = job_data.duplicated(subset="job_id").sum()
print("Duplicate job_id Values:", duplicates)

# Target column order: "job_id" first, every other column in its current order
column_names = ["job_id"] + [name for name in job_data.columns if name != "job_id"]

# Keep only the first row per "job_id", then apply the new column order
job_data = job_data.drop_duplicates(subset="job_id")
job_data = job_data[column_names]


Duplicate job_id Values: 20


In [8]:
# Parse the "date_time" column into pandas datetime values
parsed_timestamps = pd.to_datetime(job_data["date_time"])
job_data["date_time"] = parsed_timestamps

In [9]:
# Render each timestamp as a "yyyy-mm-dd" string, then expose the column
# under the shorter name "date"
job_data = (
    job_data
    .assign(date_time=job_data["date_time"].dt.strftime("%Y-%m-%d"))
    .rename(columns={"date_time": "date"})
)

In [10]:
# Label every row by whether "salary_pay" is present:
# missing -> "Not Specified", present -> "Available"
salary_missing = job_data["salary_pay"].isna()
job_data["salary_info_status"] = salary_missing.map(
    {True: "Not Specified", False: "Available"}
)

In [11]:
# Split the "location" column into "city" and "state" columns where possible.
# A location is split only when, after trimming surrounding whitespace, it
# consists of exactly two ", "-separated parts ("<city>, <state>"). Every other
# location (missing, a single part, three or more parts) gets an empty string
# in both new columns.
# This is done with vectorized string operations instead of a Python-level
# row loop, which produces the same values without per-row `.at` writes.
location_parts = job_data["location"].str.strip().str.split(", ")

# Missing locations yield NaN lengths, which compare unequal to 2 -> False
has_city_and_state = location_parts.str.len() == 2

city = location_parts.str[0].where(has_city_and_state, "")
state = location_parts.str[1].where(has_city_and_state, "")

# Place the new columns at positions 3 and 4 of the frame
job_data.insert(3, "city", city)
job_data.insert(4, "state", state)

# Drop the original "location" column
job_data = job_data.drop(columns="location")


In [12]:
# Remove rows whose "state" value is longer than two characters
# (treated as invalid); shorter values, including empty strings, are kept
state_too_long = job_data["state"].str.len() > 2
job_data = job_data.drop(index=job_data.index[state_too_long])

In [13]:
# Classify each job title into a coarse "title_group".
# Each keyword family is matched once, case-insensitively, anywhere in the
# title. `na=False` makes a missing title count as "no match", so the masks
# stay strictly boolean (np.select rejects non-boolean conditions) and such
# rows fall through to the default group.
is_analyst = job_data['title'].str.contains(
    r'analy(?:z|s|t)(?:e|is|ic|t)s?', case=False, na=False
)
is_engineer = job_data['title'].str.contains(r'engineerg?', case=False, na=False)
is_scientist = job_data['title'].str.contains(
    r'scientists?|sciences?', case=False, na=False
)

# Conditions are evaluated in order and the first match wins, so the combined
# groups must come before the single-keyword groups. A title matching all
# three families is therefore labelled 'Analyst/Engineer'.
conditions = [
    is_analyst & is_engineer,
    is_analyst & is_scientist,
    is_engineer & is_scientist,
    is_analyst,
    is_engineer,
    is_scientist,
]

# Label for each condition, in the same order
values = [
    'Analyst/Engineer',
    'Analyst/Scientist',
    'Engineer/Scientist',
    'Analyst',
    'Engineer',
    'Scientist',
]

# Titles that match none of the conditions are labelled 'Other'
job_data['title_group'] = np.select(conditions, values, default='Other')


In [14]:
# Write the processed frame to disk as CSV, leaving out the row index
output_csv = "jobdata_processed_data.csv"
job_data.to_csv(output_csv, index=False)
