# Feature Engineering for 'data job posts' Dataset

In [None]:

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


## Load the dataset

In [None]:

# Read the raw job-postings CSV from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/mnt/data/data job posts.csv")
# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)


## Convert the `date` column to datetime format

In [None]:

# Parse the raw 'date' values into pandas datetimes. With errors='coerce',
# any entry that cannot be parsed becomes NaT instead of raising.
data['date'] = pd.to_datetime(arg=data['date'], errors='coerce')


## Extract date-related features

In [None]:

# Derive calendar features from the 'date' column (it must already hold
# datetimes, otherwise the .dt accessor raises).

# Day of the week as an integer: Monday=0 ... Sunday=6.
data['DayOfWeek'] = data['date'].dt.dayofweek

# Weekend flag: 1 for Saturday/Sunday (DayOfWeek >= 5), otherwise 0.
# The comparison is vectorised over the whole column instead of calling a
# Python lambda per row. Rows whose date is missing compare False and thus
# get 0 -- the same value the per-row version produced for them.
data['IsWeekend'] = (data['DayOfWeek'] >= 5).astype('int64')

# Calendar month of the posting, 1-12.
data['Month'] = data['date'].dt.month


## Aggregated features (job postings per month/year)

In [None]:

# Count job postings per (year, month).
#
# The year key is explicitly named 'Year' before grouping. A grouping Series
# keeps its own name as the resulting column label, so the unnamed-looking
# `data['date'].dt.year` would otherwise surface as a column called 'date'
# (not 0), and renaming column 0 afterwards would silently do nothing.
#
# Rows whose date is missing have NaN keys and are left out of the counts
# by groupby's default behaviour.
job_post_counts = (
    data.groupby([data['date'].dt.year.rename('Year'), data['Month']])
    .size()
    .reset_index(name='JobPostCount')
)
job_post_counts.head()


## Encode categorical variables (`Title`)

In [None]:

# Keep only the 20 most frequent titles; every other title is bucketed as
# 'Other' so the one-hot encoding stays small. Missing titles never appear in
# value_counts(), so they also land in 'Other' and no separate NaN fill is
# needed before encoding.
top_titles = data['Title'].value_counts().nlargest(20).index
data['Title_Simplified'] = data['Title'].where(data['Title'].isin(top_titles), 'Other')

# One-hot encode the simplified titles into a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so prefer the new keyword and fall back to the old
# one only when the installed version does not recognise it.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_titles = encoder.fit_transform(data[['Title_Simplified']])
title_encoded_df = pd.DataFrame(
    encoded_titles,
    columns=encoder.get_feature_names_out(['Title_Simplified']),
)

# Attach the indicator columns to the original rows. The encoded frame has a
# fresh 0..n-1 index, so the source index is reset to line the rows up
# positionally during the column-wise concat.
data_encoded = pd.concat([data.reset_index(drop=True), title_encoded_df], axis=1)
data_encoded.head()


## Final Transformed Dataset

In [None]:

# Show the first rows of both results together: the dataset with the one-hot
# title columns attached, and the per-month posting counts.
(
    data_encoded.head(),
    job_post_counts.head(),
)
