## Here we are creating a (.env) file to hide our API_KEY and API_ID

In [3]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import time


# Loading vairable from .env
load_dotenv()

#Getting them on this file

API_ID = os.getenv("ADZUNA_APP_ID")
API_KEY = os.getenv("ADZUNA_APP_KEY")

## Now we are calling the API key to extract data from the web database to our pandas dataframe(df)

In [None]:
i = 1
url = f"https://api.adzuna.com/v1/api/jobs/gb/search/{i}?app_id={API_ID}&app_key={API_KEY}&results_per_page=20&what=javascript%20developer&content-type=application/json"
final_df = pd.DataFrame()

for i in range (1,300,1):
    response = requests.get(url)  # here the response variable is a object. Then in the next line we are converting response variable into json type.
    temp_data = response.json()['results']
    temp_df = pd.json_normalize(temp_data)
    final_df = pd.concat([final_df,temp_df],ignore_index=True)
    time.sleep(1)  # Here we are using this time method to create a small delay so we would hit the server API request limit.


    

In [None]:
final_df['avg_salary'] = (final_df["salary_min"] + final_df["salary_max"]) / 2
final_df= final_df[['title','company.display_name','created','location.display_name','salary_max','salary_min','category.label','avg_salary']]
final_df.head()

# Now we are doing some EDA(Exploratory data analysis) to better understand the data we are working on

In [None]:
sns.set(style="whitegrid", palette="muted", font_scale=1.1)
print("Shape of dataset:", final_df.shape)
print("\nColumns:", final_df.columns.tolist())
print("\nMissing values:\n", final_df.isna().sum())

In [None]:
# --- 2. Salary Preprocessing ---
avg_salary = (final_df["salary_min"] + final_df["salary_max"]) / 2



# --- 3. Top Job Titles ---
top_titles = final_df["title"].value_counts().head(10)
print("\nTop Job Titles:\n", top_titles)

plt.figure(figsize=(10,5))
sns.barplot(y=top_titles.index, x=top_titles.values)
plt.title("Top 10 Job Titles")
plt.xlabel("Number of Postings")
plt.ylabel("Job Title")
plt.show()

In [None]:
final_df.to_csv('dataset.csv')

In [5]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,company.display_name,created,location.display_name,salary_max,salary_min,category.label,avg_salary
0,0,JavaScript Developer - Hybrid (Manchester ),Circle Group,2025-08-20T13:27:38Z,"Manchester, Greater Manchester",40000.0,40000.0,IT Jobs,40000.0
1,1,Full-Stack Javascript Developer – Central Lond...,Nexus Jobs Limited,2025-08-13T14:20:32Z,"London, UK",44198.56,44198.56,IT Jobs,44198.56
2,2,Javascript Developer,Track24,2025-09-06T00:35:08Z,"Finsbury, Central London",60000.0,60000.0,IT Jobs,60000.0
3,3,Javascript Developer,Lloyds Bank,2025-08-31T17:44:01Z,UK,58410.0,47790.0,IT Jobs,53100.0
4,4,JavaScript Developer,Sellick Partnership,2025-09-04T14:19:06Z,"Newcastle Upon Tyne, Tyne & Wear",55000.0,48000.0,IT Jobs,51500.0


In [None]:
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv('dataset.csv', index_col=0)

# ========================================
# FEATURE ENGINEERING
# ========================================

print("🔧 FEATURE ENGINEERING")
print("-" * 40)

# 1. Create target variable (average salary)
df['avg_salary'] = (df['salary_min'] + df['salary_max']) / 2
print("✅ Created 'avg_salary' - target variable for prediction")

# 2. Handle datetime and create days since posted (FIXED VERSION)
df['created'] = pd.to_datetime(df['created'])

# Fix timezone compatibility
if df['created'].dt.tz is not None:
    current_time = pd.Timestamp.now(tz=df['created'].dt.tz)
else:
    current_time = pd.Timestamp.now()

df['days_since_posted'] = (current_time - df['created']).dt.days
df['days_since_posted'] = df['days_since_posted'].clip(lower=0)  # Remove negative values
print("✅ Created 'days_since_posted' - job posting age in days")

# 3. Title-based features
df['title_length'] = df['title'].str.len()
print("✅ Created 'title_length' - length of job title")

# 4. Binary features from job titles (1 = True, 0 = False)
df['is_senior'] = df['title'].str.contains('Senior|Lead|Principal', case=False, na=False).astype(int)
df['is_fullstack'] = df['title'].str.contains('Fullstack|Full Stack|Full-Stack', case=False, na=False).astype(int)
df['is_frontend'] = df['title'].str.contains('Frontend|Front End|Front-End', case=False, na=False).astype(int)
df['is_backend'] = df['title'].str.contains('Backend|Back End|Back-End', case=False, na=False).astype(int)
df['is_react'] = df['title'].str.contains('React', case=False, na=False).astype(int)
df['is_node'] = df['title'].str.contains('Node|NodeJS|Node.js', case=False, na=False).astype(int)

print("✅ Created binary features from job titles:")
print(f"   - is_senior: {df['is_senior'].sum()} jobs")
print(f"   - is_fullstack: {df['is_fullstack'].sum()} jobs") 
print(f"   - is_frontend: {df['is_frontend'].sum()} jobs")
print(f"   - is_backend: {df['is_backend'].sum()} jobs")
print(f"   - is_react: {df['is_react'].sum()} jobs")
print(f"   - is_node: {df['is_node'].sum()} jobs")

# 5. Salary-based features
df['salary_range'] = df['salary_max'] - df['salary_min']
df['salary_category'] = pd.cut(df['avg_salary'], 
                              bins=[0, 30000, 50000, 70000, float('inf')], 
                              labels=['Entry', 'Mid', 'Senior', 'Lead'])
print("✅ Created salary features:")
print(f"   - salary_range: difference between min and max salary")
print(f"   - salary_category: Entry/Mid/Senior/Lead based on salary")

# 6. Location-based features
df['is_london'] = df['location.display_name'].str.contains('London', case=False, na=False).astype(int)
df['is_remote'] = df['location.display_name'].str.contains('Remote|Work from home', case=False, na=False).astype(int)
print("✅ Created location features:")
print(f"   - is_london: {df['is_london'].sum()} jobs in London")
print(f"   - is_remote: {df['is_remote'].sum()} remote jobs")

# 7. Company name length (proxy for company size/type)
df['company_name_length'] = df['company.display_name'].str.len()
print("✅ Created 'company_name_length' - proxy for company type")

# 8. Summary statistics
print(f"\n📊 FEATURE ENGINEERING SUMMARY:")
print(f"   Original columns: {len(['title', 'company.display_name', 'created', 'location.display_name', 'salary_max', 'salary_min', 'category.label'])}")
print(f"   New features created: 12")
print(f"   Total columns now: {len(df.columns)}")

# 9. Display new features summary
new_features = ['avg_salary', 'days_since_posted', 'title_length', 'is_senior', 
               'is_fullstack', 'is_frontend', 'is_backend', 'is_react', 'is_node',
               'salary_range', 'salary_category', 'is_london', 'is_remote', 'company_name_length']

print(f"\n📋 NEW FEATURES OVERVIEW:")
for feature in new_features:
    if feature in df.columns:
        if df[feature].dtype in ['int64', 'float64']:
            print(f"   {feature}: {df[feature].min()} to {df[feature].max()}")
        else:
            print(f"   {feature}: {df[feature].value_counts().to_dict()}")

# Save the enhanced dataset
df.to_csv('dataset_with_features.csv')
print(f"\n💾 Enhanced dataset saved as 'dataset_with_features.csv'")
print(f"✅ Ready for machine learning!")

🔧 FEATURE ENGINEERING
----------------------------------------
✅ Created 'avg_salary' - target variable for prediction


AttributeError: 'datetime.timezone' object has no attribute 'iloc'