# Model Analyser for DS Jobs Project

This notebook evaluates several regression models, keeps those with a test R² above 0.7, and checks each one for overfitting and underfitting. A model is treated as overfitting when the absolute gap between its train and test R² is 0.1 or more, and as underfitting when both its train and test R² fall below 0.7.

In [9]:
# Imports
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings('ignore')

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [12]:
# Load your data
# Reads the validated train/test CSVs written by the data-validation stage.
# Change ARTIFACT_RUN to point at a different pipeline run; the target column
# is selected further down, in the train/test split cell.
# The path is assembled with pathlib instead of a backslash-separated string so
# the notebook also runs on Linux/macOS, where '\' is not a path separator.
ARTIFACT_RUN = '2025-07-01-10-46-39'
VALID_DATA_DIR = Path('..') / 'Artifacts' / ARTIFACT_RUN / 'data_validation' / 'valid_data'

train_data = pd.read_csv(VALID_DATA_DIR / 'train.csv')
test_data = pd.read_csv(VALID_DATA_DIR / 'test.csv')

In [13]:
# Stack the two validated files row-wise into a single frame; the per-file
# indices are discarded and replaced by one fresh 0..n-1 index. The combined
# frame is re-split into train and test further down.
data = pd.concat(objs=[train_data, test_data], axis=0, ignore_index=True)

In [14]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,company,job_title,company_rating,job_description,location,salary_avg_estimate,salary_estimate_payperiod,company_size,company_founded,employment_type,industry,sector,revenue,career_opportunities_rating,comp_and_benefits_rating,culture_and_values_rating,senior_management_rating,work_life_balance_rating
0,"Medpace, Inc.",Data Coordinator - Core Laboratory,3.4,Job Summary :\nOur corporate activities are gr...,Thāne,589237.0,/yr (est.),5001 to 10000 Employees,1992,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$2 to $5 billion (USD),3.3,3.0,3.2,3.0,3.2
1,Nissan,Data Engineer I,3.9,Key Responsibilities:\n\nEnsure that the guide...,Thiruvananthapuram,443415.0,/yr (est.),10000+ Employees,1933,Company - Public,Transportation Equipment Manufacturing,Manufacturing,$2 to $5 billion (USD),3.6,3.7,3.7,3.5,3.4
2,ZF,Senior Data scientist,3.9,What's Next? Join ZF!\nZF is a global technolo...,Chennai,360000.0,/yr (est.),10000+ Employees,1915,Company - Public,Transportation Equipment Manufacturing,Manufacturing,$10+ billion (USD),3.6,3.7,3.7,3.3,3.8
3,One Sigma Technologies,Data Analyst Fraud,3.7,Bengaluru\nEngineering /\nFull-Time\n/ On-Site...,Bengaluru,777892.0,/yr (est.),501 to 1000 Employees,2015,Company - Private,Internet & Web Services,Information Technology,Unknown / Non-Applicable,3.7,3.9,3.7,3.6,3.6
4,Infosys,Data Privacy_Consulting,3.8,A day in the life of an Infoscion • As part of...,Bengaluru,720103.0,/yr (est.),10000+ Employees,1981,Company - Public,Information Technology Support Services,Information Technology,$10+ billion (USD),3.8,3.0,4.0,3.5,3.7


In [15]:
# Print the column names, non-null counts and dtypes of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558 entries, 0 to 557
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   company                      558 non-null    object 
 1   job_title                    558 non-null    object 
 2   company_rating               558 non-null    float64
 3   job_description              558 non-null    object 
 4   location                     558 non-null    object 
 5   salary_avg_estimate          558 non-null    float64
 6   salary_estimate_payperiod    558 non-null    object 
 7   company_size                 558 non-null    object 
 8   company_founded              558 non-null    int64  
 9   employment_type              558 non-null    object 
 10  industry                     558 non-null    object 
 11  sector                       558 non-null    object 
 12  revenue                      558 non-null    object 
 13  career_opportunities

In [25]:
# Split data
x_train, x_test, y_train, y_test = train_test_split(
	data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2, random_state=42
)

In [26]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical. Columns of any other dtype end up in
# neither list.
numeric_features = x_train.select_dtypes(include=['int64', 'float64'])
categorical_features = x_train.select_dtypes(include=['object'])

num_cols = list(numeric_features.columns)
cat_cols = list(categorical_features.columns)

In [27]:
# Numeric columns are standardised.
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded into a dense array; categories not
# seen during fit are ignored at transform time instead of raising.
cat_pipeline = Pipeline(
    steps=[('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))]
)

# Route each column list to its pipeline and emit a pandas DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)
preprocessor = preprocessor.set_output(transform='pandas')

In [28]:
# Define models and parameter grids
models = {
    'LinearRegression': (LinearRegression(), {}),
    'Lasso': (Lasso(), {'alpha': [0.01, 0.1, 1, 10]}),
    'Ridge': (Ridge(), {'alpha': [0.01, 0.1, 1, 10]}),
    'DecisionTreeRegressor': (DecisionTreeRegressor(), {'max_depth': [3, 5, 7, None]}),
    'KNeighborsRegressor': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7]}),
    'RandomForestRegressor': (RandomForestRegressor(random_state=42), {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=42), {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}),
    'SVR': (SVR(), {'C': [0.1, 1, 10]})
}

In [31]:
# Evaluate models with preprocessing pipeline
results = []
for name, (model, params) in models.items():
    pipe = Pipeline([('preprocessor', preprocessor), ('reg', model)])
    if params:
        # Adjust parameter grid for pipeline
        param_grid = {f'reg__{k}': v for k, v in params.items()}
        grid = GridSearchCV(pipe, param_grid, cv=5, scoring='r2', n_jobs=-1)
        grid.fit(x_train, y_train)
        best_model = grid.best_estimator_
        best_params = grid.best_params_
    else:
        best_model = pipe.fit(x_train, y_train)
        best_params = None
    y_train_pred = best_model.predict(x_train)
    y_test_pred = best_model.predict(x_test)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    overfit_gap = train_r2 - test_r2
    underfit = train_r2 < 0.7 and test_r2 < 0.7
    results.append({
        'Model': name,
        'Train R2': train_r2,
        'Test R2': test_r2,
        'Overfit Gap': overfit_gap,
        'MAE': mae,
        'MSE': mse,
        'Best Params': best_params,
        'Underfit': underfit
    })

In [32]:
# Results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Test R2', ascending=False)
results_df

Unnamed: 0,Model,Train R2,Test R2,Overfit Gap,MAE,MSE,Best Params,Underfit
0,LinearRegression,1.0,0.773055,0.226945,0.151961,0.051518,,False
2,Ridge,0.999998,0.772745,0.227253,0.152164,0.051589,{'reg__alpha': 0.01},False
1,Lasso,0.741175,0.714134,0.027041,0.173452,0.064894,{'reg__alpha': 0.01},False
6,GradientBoostingRegressor,0.915757,0.714111,0.201646,0.173995,0.064899,"{'reg__learning_rate': 0.1, 'reg__n_estimators...",False
5,RandomForestRegressor,0.969601,0.703095,0.266506,0.160161,0.0674,"{'reg__max_depth': None, 'reg__n_estimators': ...",False
7,SVR,0.95898,0.619609,0.339371,0.17907,0.086352,{'reg__C': 1},False
4,KNeighborsRegressor,0.806653,0.618988,0.187665,0.204286,0.086493,{'reg__n_neighbors': 5},False
3,DecisionTreeRegressor,0.836546,0.558593,0.277952,0.224175,0.100203,{'reg__max_depth': 5},False


In [33]:
# Filter models: Test R2 > 0.7, Overfit Gap < 0.1, not underfit
filtered = results_df[(results_df['Test R2'] > 0.7) & (results_df['Overfit Gap'].abs() < 0.1) & (~results_df['Underfit'])]
filtered

Unnamed: 0,Model,Train R2,Test R2,Overfit Gap,MAE,MSE,Best Params,Underfit
1,Lasso,0.741175,0.714134,0.027041,0.173452,0.064894,{'reg__alpha': 0.01},False


## Conclusion

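- The best model(s) are those with a high test R² (> 0.7), a small overfit gap (< 0.1 in absolute value), and no underfit flag. In the run shown above, only Lasso (`alpha = 0.01`) meets all three criteria.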
- Review the `filtered` table above for your best candidates.
- You can further analyze residuals or feature importances for the top model.