In [141]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the modelling dataset, then list every column whose name contains
# 'unnamed' (case-insensitive) so it can be dropped in the next cell.
df = pd.read_csv('explored_data_for_model.csv')
to_drop = df.columns[df.columns.str.lower().str.contains('unnamed', regex=False)].tolist()

In [142]:
# Remove the columns collected in `to_drop`.
# errors='ignore' keeps this cell idempotent: `df` is rebound in place, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=to_drop, errors='ignore')

In [143]:
# Predictor columns fed to the model, in the order they are selected from `df`.
features = [
    'Rating',
    'Size',
    'Type of ownership',
    'Industry',
    'Sector',
    'Revenue',
    'num_comp',
    'job_state',
    'same_state_as_hq',
    'company_text',
    'company_age',
    'python_yn',
    'spark_yn',
    'cloud_yn',
    'deployments_yn',
    'viz_tools_yn',
    'api_dev_yn',
    'job_title_simplified',
    'seniority',
    'jd_length',
]

In [144]:
# Group the selected columns by dtype for the preprocessing pipeline.
_selected = df[features]
numerical_features = _selected.select_dtypes(include=np.number).columns.to_list()
categorical_features = _selected.select_dtypes(include='object').columns.to_list()

# Anything that is neither numeric nor object-typed is treated as a boolean flag.
_already_grouped = set(numerical_features) | set(categorical_features)
bool_features = [col for col in features if col not in _already_grouped]

# Cast the flags to int in `df` itself so they can be handled as numeric columns.
df[bool_features] = df[bool_features].astype(int)

# Flags are appended after the natively numeric columns.
numerical_features.extend(bool_features)

In [145]:
# Sanity check: dtypes of every column routed to the numeric pipeline.
df.loc[:, numerical_features].dtypes

Rating              float64
num_comp              int64
company_age           int64
jd_length             int64
same_state_as_hq      int64
python_yn             int64
spark_yn              int64
cloud_yn              int64
deployments_yn        int64
viz_tools_yn          int64
api_dev_yn            int64
dtype: object

In [146]:
# Sanity check: dtypes of every column treated as categorical.
df.loc[:, categorical_features].dtypes

Size                    object
Type of ownership       object
Industry                object
Sector                  object
Revenue                 object
job_state               object
company_text            object
job_title_simplified    object
seniority               object
dtype: object

In [147]:
# Split the categorical columns into the two groups used by the encoders:
# `nominal` goes to the one-hot encoder, `ordinal` to the ordinal encoder.
# NOTE(review): the names look swapped relative to their usual meaning - 'Size'
# and 'Revenue' read like ordered bands, while columns such as 'Industry' or
# 'job_state' have no natural order. Confirm which encoder each group should get.
nominal = ['Size', 'Revenue']
ordinal = list(filter(lambda col: col not in nominal, categorical_features))

In [148]:
# Separate predictors from the target, then hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split, cross_validate

X = df[features]
y = df['avg_salary']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=201,  # fixed seed so the split is reproducible
)

In [185]:
# Creating Encoding pipelines for different types of variables
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler, PowerTransformer, KBinsDiscretizer, StandardScaler
from sklearn.compose import ColumnTransformer


# One transformer per column group; the ColumnTransformer below applies each
# one to its own list of columns.

# Columns in `ordinal` -> integer codes; categories not seen during fit become -1.
# (Earlier idea, left disabled: categories=[['Yes','No']]*len(ordinal) followed by MinMaxScaler().)
ordinal_pipeline = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Columns in `nominal` -> one-hot indicators with the first level dropped.
# NOTE(review): with drop='first', an unknown category presumably encodes the
# same as the dropped baseline level - confirm against the OneHotEncoder docs.
nominal_pipeline = OneHotEncoder(handle_unknown='ignore', drop='first')

# Numeric columns -> Yeo-Johnson power transform, then rescale with MinMaxScaler.
numeric_pipeline = make_pipeline(
    PowerTransformer(method='yeo-johnson'),
    MinMaxScaler(),
)

preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ('ordinal_pipeline', ordinal_pipeline, ordinal),
        ('nominal_pipeline', nominal_pipeline, nominal),
        ('numeric_pipeline', numeric_pipeline, numerical_features),
    ],
)




In [186]:

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

# Baseline: ordinary least squares on top of the shared preprocessing step,
# fitted on the training split and scored (MAE) on the held-out split.
lm = LinearRegression()
test_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessing_pipeline),
        ('model', lm),
    ]
)

test_pipeline.fit(X_train, y_train)
preds = test_pipeline.predict(X_test)

score = mean_absolute_error(y_true=y_test, y_pred=preds)
score

24.244971238809715

In [190]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn import tree

# Compare candidate regressors with 5-fold cross-validation on the training split.
# Scoring is negated MAE, so values closer to zero are better.

# Seed the stochastic estimators (same value as the train/test split) so that
# re-running the cell reproduces the same scores.
RANDOM_STATE = 201

models = {
    'LR': LinearRegression(),
    'RF': RandomForestRegressor(random_state=RANDOM_STATE),  # try 'XGB': XGBRegressor() (regression target)
    'DTregressor': tree.DecisionTreeRegressor(random_state=RANDOM_STATE),
    'svm': svm.SVR(),
    'Lasso': Lasso(alpha=0.5),
}

result = []
for name, model in models.items():
    # Preprocessing sits inside the pipeline, so it is re-fitted on each training fold.
    final_pipeline = make_pipeline(preprocessing_pipeline, model)
    cv = cross_validate(
        final_pipeline,
        X_train,
        y_train,
        cv=5,
        return_train_score=True,
        scoring='neg_mean_absolute_error',
    )
    # Average the per-fold timings and scores into one column named after the model.
    result.append(pd.DataFrame(cv).mean().to_frame(name))

# One column per model; rows are fit_time, score_time, test_score, train_score.
scores = pd.concat(result, axis=1)



In [191]:
# Mean cross-validation results per model (negated MAE; closer to zero is better).
# NOTE(review): the saved output below has no 'DTregressor' column although the
# models dict above includes it - the output looks stale; re-run to refresh.
scores

Unnamed: 0,LR,RF,svm,Lasso
fit_time,0.027515,0.209965,0.017595,0.02227
score_time,0.008161,0.005523,0.00683,0.004828
test_score,-24.230345,-15.790293,-29.1721,-24.032356
train_score,-22.160506,-5.503463,-29.077808,-22.85194


In [182]:
# Repeat the 5-fold comparison for a subset of models, scored by R^2 (higher is better).
# NOTE: this rebinds `models`, `result` and `scores` from the MAE comparison.

# Seed the random forest (same value as the train/test split) so that
# re-running the cell reproduces the same scores.
RANDOM_STATE = 201

models = {
    'LR': LinearRegression(),
    'RF': RandomForestRegressor(random_state=RANDOM_STATE),  # try 'XGB': XGBRegressor() (regression target)
    'Lasso': Lasso(alpha=0.5),
}

result = []
for name, model in models.items():
    # Preprocessing sits inside the pipeline, so it is re-fitted on each training fold.
    final_pipeline = make_pipeline(preprocessing_pipeline, model)
    cv = cross_validate(
        final_pipeline,
        X_train,
        y_train,
        cv=5,
        return_train_score=True,
        scoring='r2',
    )
    # Average the per-fold timings and scores into one column named after the model.
    result.append(pd.DataFrame(cv).mean().to_frame(name))

# One column per model; rows are fit_time, score_time, test_score, train_score.
scores = pd.concat(result, axis=1)



In [183]:
# Mean cross-validation results per model, scored by R^2 (higher is better).
# NOTE(review): this cell ran at execution count 183, before the preprocessing
# cell at count 185 - the saved output may not match the current pipeline; re-run.
scores

Unnamed: 0,LR,RF,Lasso
fit_time,0.060883,0.200435,0.019739
score_time,0.007884,0.005584,0.005147
test_score,0.295401,0.592671,0.280461
train_score,0.400181,0.952158,0.34995


In [119]:

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

# Fit a linear-regression pipeline on the training split and report MAE on the
# held-out split.
# NOTE(review): the traceback saved under this cell (unknown category
# 'Other Retail Stores') carries execution count 119, earlier than the encoder
# cell at count 185 that sets handle_unknown - presumably stale; re-run to confirm.
lm = LinearRegression()

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessing_pipeline),
        ('model', lm),
    ]
)
pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
score = mean_absolute_error(y_true=y_test, y_pred=preds)
print('MAE:', score)


# final_pipeline = make_pipeline(preprocessing_pipeline,lm)
# cv = cross_validate(final_pipeline,X_train,y_train, cv = 5, return_train_score=True, scoring='neg_mean_absolute_error')



# lm.fit(X_train,y_train)
# lm_scores = np.mean(cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv = 5))


# result.append(pd.DataFrame(cv).mean().to_frame().set_axis([name],axis = 1))
# lm_score = pd.DataFrame(cv).mean().to_frame().set_axis(['LR'],axis = 1)

# print('Linear Regression',lm_score)

# #lasso regression------------------------------
# alpha = []
# mse_lasso = []

# for i in range(1,100):
#     alpha.append(i/100)
#     lm_l = Lasso(alpha=i/100)
#     mse_lasso.append(np.mean(cross_val_score(lm_l, X_train, y_train, scoring='neg_mean_absolute_error', cv = 3)))


# lm_lasso_errors = tuple(zip(alpha,mse_lasso))
# df_lm_lasso_errors = pd.DataFrame(lm_lasso_errors,columns = ['alpha','error'])

# opt_alpha = df_lm_lasso_errors[df_lm_lasso_errors.error == max(df_lm_lasso_errors.error )]
# opt_alpha.reset_index(drop=True, inplace = True)

# lm_lasso = Lasso(opt_alpha['alpha'][0]) 
# lm_lasso.fit(X_train,y_train)
# lm_lasso_scores = np.mean(cross_val_score(lm_lasso, X_train, y_train, scoring='neg_mean_absolute_error', cv = 3))

# print('Lasso Regression',lm_lasso_scores)


# #randomforest------------------------------
# from sklearn.ensemble import RandomForestRegressor

# rf_model = RandomForestRegressor()
# rf_model_score = np.mean(cross_val_score(rf_model, X_train, y_train, scoring='neg_mean_absolute_error', cv = 3))

# print('Random Forest', rf_model_score)


# #tune hyperparameters with gridSearchCV------------------------------
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': range(10,300,10),
#     'criterion': ('squared_error','absolute_error'),
#     'max_features': ('sqrt','log2')
# }


# # Create GridSearchCV object
# grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')

# # Fit the GridSearchCV object to the training data
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_


# #test ensemble ------------------------------
# y_pred_lm = lm.predict(X_test)
# y_pred_lm_lasso = lm_lasso.predict(X_test)
# y_pred_rf = grid_search.best_estimator_.predict(X_test)


# from sklearn.metrics import mean_absolute_error
# print('Linear Regression mse = ',mean_absolute_error(y_test,y_pred_lm))
# print('Lasso Regression mse = ',mean_absolute_error(y_test,y_pred_lm_lasso))
# print('Random Forest mse = ',mean_absolute_error(y_test,y_pred_rf))
# # print(grid_search.best_estimator_)

# print('Combined Random Forest with lasso mse = ',mean_absolute_error(y_test,(y_pred_rf+y_pred_lm_lasso)/2))

# # import pickle

# # with open('./flaskAPI/models/trained_ml_model.pkl', 'wb') as f:
# #     pickle.dump(grid_search.best_estimator_,f)

# # print("successfully exported model. \n Model details:", grid_search.best_estimator_)

ValueError: Found unknown categories ['Other Retail Stores'] in column 1 during transform