In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Core numerics / data handling
import numpy as np
import pandas as pd 
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling and evaluation
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# The metric is spelled `mean_squared_error`; a misspelled name on this line
# raises ImportError and leaves `r2_score` unbound for every later cell.
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
import warnings
# NOTE: this silences *all* warnings for the rest of the session, including
# deprecation notices from pandas / seaborn / scikit-learn.
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/jobs-dataset-from-glassdoor/eda_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# A notebook cell only renders its last expression, so two bare expressions in
# a row would show just the second one. Combine the per-column missing-value
# count and the dtype into a single table so both are displayed.
pd.DataFrame({'missing': df.isnull().sum(), 'dtype': df.dtypes})

In [None]:
# Print the per-column missing-value counts, then draw a cell-level map of
# where the missing entries are located.
null_mask = df.isnull()
print("\nMissing Values:\n", null_mask.sum())

fig, ax = plt.subplots()
sns.heatmap(null_mask, cmap='viridis', cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# One histogram per numeric column, all on a single shared figure.
df.hist(bins=20, figsize=(15, 10))
# df.hist created the current figure, so attach the overall title to it.
plt.gcf().suptitle("Histograms of numerical features")
plt.show()

In [None]:
# Distinct labels present in the 'Sector' column.
sector_column = df['Sector']
sector_column.unique()

In [None]:
# Number of rows carrying each 'Sector' label, most frequent first.
sector_counts = df['Sector'].value_counts()
sector_counts

In [None]:
# Relabel the '-1' sector as 'other' (presumably '-1' is a missing-value
# placeholder in this dataset — TODO confirm).
# Assign the result back to the column: calling replace(..., inplace=True) on
# the selection df['Sector'] acts on an intermediate object and does not update
# `df` under pandas Copy-on-Write.
df['Sector'] = df['Sector'].replace('-1', 'other')
df['Sector'].value_counts()

In [None]:
def _plot_top10_bar(ranked, title, xlabel, ylabel, palette):
    """Draw a horizontal bar chart for a ranked Series.

    Parameters
    ----------
    ranked : pandas.Series
        Values are plotted on the x axis, index labels on the y axis.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    palette : str
        Name of the seaborn palette used to colour the bars.
    """
    plt.figure(figsize=(10,6))
    sns.barplot(x=ranked.values, y=ranked.index, palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


# Top 10 Industries by Number of Job Postings
top_industries = df['Industry'].value_counts().head(10)

# Top 10 Industries by Average Salary
top_salary_industries = df.groupby('Industry')['avg_salary'].mean().sort_values(ascending=False).head(10)

# Top 10 Sectors by Number of Job Postings
top_sectors = df['Sector'].value_counts().head(10)

# Top 10 Sectors by Average Salary
top_salary_sectors = df.groupby('Sector')['avg_salary'].mean().sort_values(ascending=False).head(10)

# Visualization 1: Top Industries by Job Postings
_plot_top10_bar(top_industries, "Top 10 Industries by Job Postings",
                "Number of Jobs", "Industry", "viridis")

# Visualization 2: Top Industries by Average Salary
_plot_top10_bar(top_salary_industries, "Top 10 Industries by Average Salary",
                "Average Salary (in $K)", "Industry", "magma")

# Visualization 3: Top Sectors by Job Postings
_plot_top10_bar(top_sectors, "Top 10 Sectors by Job Postings",
                "Number of Jobs", "Sector", "cubehelix")

# Visualization 4: Top Sectors by Average Salary
_plot_top10_bar(top_salary_sectors, "Top 10 Sectors by Average Salary",
                "Average Salary (in $K)", "Sector", "coolwarm")

In [None]:
# Take a real copy: plain assignment (`df_copy = df`) only creates a second
# name for the same DataFrame, so later edits to one would show up in the other.
df_copy = df.copy()
# Remove the target and the other salary columns so they cannot leak into the features.
# NOTE(review): every remaining column is kept as a feature; check whether any
# of them are identifiers or free-text fields before modelling — TODO confirm.
x= df_copy.drop(columns=['Salary Estimate','min_salary','max_salary','avg_salary'])
x.head()

In [None]:
# Regression target: the 'avg_salary' column of the loaded frame.
TARGET_COLUMN = 'avg_salary'
y = df[TARGET_COLUMN]
y.head()

In [None]:
# Select categorical columns for one-hot encoding
categorical_cols = x.select_dtypes(include=['object']).columns

# Apply one-hot encoding (drop_first=True drops one level per column to avoid
# perfectly collinear dummy columns)
x_encoded = pd.get_dummies(x, columns=categorical_cols, drop_first=True)

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(x_encoded, y, test_size=0.2, random_state=42)

lin_reg = LinearRegression()

# scikit-learn scorers follow a "higher is better" convention, so
# 'neg_mean_squared_error' yields the NEGATED MSE for each of the 5 folds.
# `mse` / `mean_mse` therefore hold negative numbers; flip the sign when
# reporting so the printed figure is an actual mean squared error.
mse = cross_val_score(lin_reg, xtrain, ytrain, scoring='neg_mean_squared_error', cv=5)
mean_mse = np.mean(mse)
print(f"Linear regression 5-fold CV MSE: {-mean_mse}")

# Fit on the full training split and evaluate once on the held-out split
lin_reg.fit(xtrain, ytrain)
ypred = lin_reg.predict(xtest)

r2_linear = r2_score(ytest, ypred)
print(f"Linear regression test R^2: {r2_linear}")

In [None]:
# Tune the ridge penalty strength `alpha` by exhaustive search over a fixed
# candidate list, using 10-fold cross-validation scored by negated MSE.
ridge = Ridge()
params = dict(alpha=[1e-15, 1e-10, 1e-9, 2, 3, 4, 5, 10, 50, 70, 80, 90])

ridge_regression = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=10,
)
ridge_regression.fit(xtrain, ytrain)

In [None]:
# Report the winning alpha and its cross-validated score (negated MSE, so
# values closer to zero are better), one per line.
print(ridge_regression.best_params_, ridge_regression.best_score_, sep="\n")

In [None]:
# Predict on the held-out split with the refit best estimator and print its R^2.
ypred_ridge = ridge_regression.predict(xtest)
ridge_r2 = r2_score(ytest, ypred_ridge)
print(ridge_r2)

In [None]:
# 1. Actual vs Predicted Plot (for Ridge Regression)
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
ax_scatter.scatter(ytest, ypred_ridge, color="blue", alpha=0.6)
ax_scatter.set_xlabel("Actual Salaries")
ax_scatter.set_ylabel("Predicted Salaries")
ax_scatter.set_title("Ridge Regression: Actual vs Predicted")
plt.show()

# 2. Residuals Distribution (Error Analysis)
# Positive residual = the model under-predicted; negative = over-predicted.
residuals = ytest - ypred_ridge
fig_resid, ax_resid = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, bins=30, kde=True, color="red", ax=ax_resid)
ax_resid.set_title("Residual Distribution (Ridge Regression)")
ax_resid.set_xlabel("Error (Actual - Predicted)")
plt.show()