In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "notebook"

from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import warnings
warnings. filterwarnings('ignore')

In [3]:
# Load the survey export, keep the modelling columns, and convert them to
# numeric features. Rows without a reported yearly compensation are dropped
# because that column is the regression target.
SURVEY_CSV_PATH = "/content/survey_results_public.csv"
df = pd.read_csv(SURVEY_CSV_PATH)

columns_to_keep = [
    'EdLevel', 'YearsCode', 'YearsCodePro', 'DevType', 'OrgSize',
    'Country', 'CompFreq', 'LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith', 'WebframeHaveWorkedWith', 'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith', 'OpSysProfessional use', 'Age', 'Gender',
    'WorkExp', 'ICorPM', 'ConvertedCompYearly'
]

# .copy() makes the working frame independent of the raw one, so the column
# assignments below never write into a view of the original data.
df = df[columns_to_keep].dropna(subset=['ConvertedCompYearly']).copy()

# The two experience columns mix numbers with two textual buckets; map the
# buckets to numeric strings, coerce anything else unparseable to NaN, then
# fill the gaps with the column median.
years_text_to_number = {
    'Less than 1 year': '0.5',
    'More than 50 years': '50'
}
for col in ('YearsCode', 'YearsCodePro'):
    years = pd.to_numeric(df[col].replace(years_text_to_number), errors='coerce')
    df[col] = years.fillna(years.median())

# DevType is a ';'-separated multi-select; only the first listed role is kept.
df['DevType'] = df['DevType'].fillna('Other').str.split(';').str[0]

# One encoder per column, so every integer code can still be mapped back to
# its label later via label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in ('EdLevel', 'DevType', 'Country'):
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].fillna('Other'))
    label_encoders[col] = encoder

# Kept for backward compatibility: this name previously ended up holding the
# encoder that was last fitted, i.e. the one for Country.
label_encoder = label_encoders['Country']

# Pay periods per year; missing or unrecognised frequencies default to 1.
df['CompFreq'] = df['CompFreq'].map({'Yearly': 1, 'Monthly': 12, 'Weekly': 52}).fillna(1)

# One indicator column per language found in the ';'-separated list.
languages = df['LanguageHaveWorkedWith'].str.get_dummies(';')
df = pd.concat([df, languages], axis=1)

# NOTE(review): the target is min-max scaled together with the two features,
# so downstream error metrics are expressed in scaled [0, 1] units, and the
# scaler is fitted on all rows before any train/test split. Left as is because
# other cells may depend on `scaler` covering these three columns - confirm.
scaler = MinMaxScaler()
scaled_columns = ['YearsCode', 'YearsCodePro', 'ConvertedCompYearly']
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# NOTE(review): this drops every row that still has a missing value in ANY
# remaining column (including the raw multi-select text columns); the recorded
# PyCaret setup output reports only 1459 rows afterwards. Confirm that this
# amount of row loss is intended.
df = df.dropna()




In [11]:
from scipy import stats

# Remove compensation outliers: a row is kept only when the absolute z-score
# of its target value does not exceed the threshold.
threshold = 3

z_scores = stats.zscore(df['ConvertedCompYearly'])
abs_z_scores = np.abs(z_scores)
is_outlier = abs_z_scores > threshold

# Keep the rejected rows around for inspection before overwriting df.
outliers_z = df[is_outlier]

# NOTE(review): df is rebound in place, so re-running this cell recomputes the
# z-scores on the already-filtered frame and may drop additional rows.
df = df[abs_z_scores <= threshold]


In [None]:
pip install pycaret


In [12]:
# NOTE(review): the wildcard import is kept because other cells call PyCaret
# helpers (e.g. tune_model) by bare name; prefer explicit imports once the
# full list of helpers in use is known.
from pycaret.regression import *

# Initialise the PyCaret regression experiment on the cleaned frame with a
# fixed session id, then compare the candidate models and keep the result.
exp = setup(
    data=df,
    target='ConvertedCompYearly',
    session_id=123,
)
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,ConvertedCompYearly
2,Target type,Regression
3,Original data shape,"(1459, 61)"
4,Transformed data shape,"(1459, 103)"
5,Transformed train set shape,"(1021, 103)"
6,Transformed test set shape,"(438, 103)"
7,Numeric features,49
8,Categorical features,11
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,0.0018,0.0,0.0033,0.0114,0.0033,27.7485,0.232
huber,Huber Regressor,0.0014,0.0,0.0034,0.0061,0.0034,16.0956,0.533
llar,Lasso Least Angle Regression,0.0018,0.0,0.0034,-0.0131,0.0034,28.9565,0.237
dummy,Dummy Regressor,0.0018,0.0,0.0034,-0.0131,0.0034,28.9565,0.223
en,Elastic Net,0.0018,0.0,0.0034,-0.0131,0.0034,28.9565,0.239
lasso,Lasso Regression,0.0018,0.0,0.0034,-0.0131,0.0034,28.9565,0.231
knn,K Neighbors Regressor,0.0017,0.0,0.0034,-0.0225,0.0034,23.2443,0.231
ridge,Ridge Regression,0.0019,0.0,0.0035,-0.0859,0.0034,33.4827,0.232
ada,AdaBoost Regressor,0.002,0.0,0.0035,-0.1047,0.0035,34.0085,0.544
lightgbm,Light Gradient Boosting Machine,0.0018,0.0,0.0036,-0.1088,0.0035,29.3175,0.878


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [13]:
# Tune the hyper-parameters of the estimator held in best_model, using
# PyCaret's default search settings.
tuned_model = tune_model(estimator=best_model)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0015,0.0,0.0022,-0.0833,0.0022,2.0354
1,0.0013,0.0,0.0018,-0.0511,0.0018,7.2144
2,0.0016,0.0,0.003,0.1098,0.003,1.406
3,0.0022,0.0,0.0042,0.0255,0.0042,2.8557
4,0.0023,0.0,0.0048,0.0361,0.0047,103.7998
5,0.0014,0.0,0.0027,-0.0671,0.0027,2.3207
6,0.0019,0.0,0.0039,0.1235,0.0038,1.6007
7,0.0019,0.0,0.004,0.0878,0.0039,91.618
8,0.0017,0.0,0.0032,0.0541,0.0032,2.3616
9,0.0017,0.0,0.0035,0.0035,0.0034,85.1905


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
