In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('explored_data_for_model.csv')
to_drop = [col for col in df.columns if 'unnamed' in col.lower()]

In [22]:
df = df.drop(columns=to_drop)

In [23]:
features = ['Rating','Size','Type of ownership','Industry', 'Sector', 'Revenue','num_comp','job_state','same_state_as_hq','company_text',
            'company_age','python_yn', 'spark_yn', 'cloud_yn','deployments_yn','viz_tools_yn', 'api_dev_yn','job_title_simplified', 'seniority','jd_length']

In [24]:
#building preprocessing pipeline
numerical_features = df[features].select_dtypes(include=np.number).columns.to_list()
categorical_features = df[features].select_dtypes(include='object').columns.to_list()
bool_features = [x for x in features if x not in numerical_features and x not in categorical_features]
df[bool_features] = df[bool_features].astype(int) #Converting bool features to int

numerical_features = numerical_features + bool_features

In [25]:
nominal = ['Size', 'Revenue']
ordinal = [x for x in categorical_features if x not in nominal]

In [26]:
# Splitting dataset
y = df['avg_salary']
X = df[features]

In [27]:
# Creating Encoding pipelines for different types of variables
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler, PowerTransformer, KBinsDiscretizer, StandardScaler
from sklearn.compose import ColumnTransformer


ordinal_pipeline = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)                  #categories=[['Yes','No']]*len(ordinal)), MinMaxScaler()
nominal_pipeline = OneHotEncoder(drop='first', handle_unknown = 'ignore')
numeric_pipeline = make_pipeline(PowerTransformer('yeo-johnson'), MinMaxScaler())


preprocessing_pipeline = ColumnTransformer(transformers=[
    ('ordinal_pipeline',ordinal_pipeline,ordinal),
    ('nominal_pipeline',nominal_pipeline,nominal),
    ('numeric_pipeline',numeric_pipeline,numerical_features)
])




In [28]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn import tree

models= {
    'LR':LinearRegression(),
    'RF':RandomForestRegressor(),           #try 'XGB': XGBClassifier(),
    'DTregressor': tree.DecisionTreeRegressor(),
    'svm':svm.SVR(),
    'Lasso':Lasso(alpha=0.5)
    }

result = []
for name,model in models.items():
    final_pipeline = make_pipeline(preprocessing_pipeline,model)
    cv = cross_validate(final_pipeline,X,y, cv = 5, return_train_score=True, scoring='neg_mean_absolute_error')
    result.append(pd.DataFrame(cv).mean().to_frame().set_axis([name],axis = 1))

scores = pd.concat(result, axis = 1)



In [29]:
scores

Unnamed: 0,LR,RF,DTregressor,svm,Lasso
fit_time,0.02461,0.229471,0.014875,0.021979,0.024604
score_time,0.008144,0.005887,0.003868,0.009151,0.006695
test_score,-24.609336,-13.966046,-15.389806,-29.312936,-24.359042
train_score,-22.441533,-4.853178,0.0,-29.088016,-23.194374


In [30]:
# hyperparameter tuning for the best model - RF

from sklearn.model_selection import GridSearchCV

#selecting best model:
model = RandomForestRegressor()
final_pipeline = make_pipeline(preprocessing_pipeline,model)
param_grid = {
    'randomforestregressor__n_estimators': range(10,100,10),#    'criterion': ('squared_error','absolute_error'),
    'randomforestregressor__max_features': ('sqrt','log2'),
    'randomforestregressor__min_samples_split': range(5,10,1),
    'randomforestregressor__ccp_alpha': np.arange(0, 1.1, 0.1, dtype=float)
}

grid_search = GridSearchCV(final_pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X,y)



In [41]:
model = grid_search.best_estimator_

import pickle

model_file_path =  '/Users/sudhanshuranjan/Documents/dataScience_salaries/flaskAPI/models/RF_model_pipelined.pkl'

with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)



In [54]:
df_tojson = X[:1]

json_data = df_tojson.to_json()

In [55]:
json_data

'{"Rating":{"0":3.8},"Size":{"0":"501 to 1000 employees"},"Type of ownership":{"0":"Company - Private"},"Industry":{"0":"Aerospace & Defense"},"Sector":{"0":"Aerospace & Defense"},"Revenue":{"0":"$50 to $100 million (USD)"},"num_comp":{"0":0},"job_state":{"0":"NM"},"same_state_as_hq":{"0":0},"company_text":{"0":"Tecolote Research"},"company_age":{"0":51},"python_yn":{"0":1},"spark_yn":{"0":0},"cloud_yn":{"0":0},"deployments_yn":{"0":0},"viz_tools_yn":{"0":1},"api_dev_yn":{"0":0},"job_title_simplified":{"0":"data scientist"},"seniority":{"0":"na"},"jd_length":{"0":2555}}'

In [58]:
json_Df = pd.read_json(json_data)

In [59]:
json_Df

Unnamed: 0,Rating,Size,Type of ownership,Industry,Sector,Revenue,num_comp,job_state,same_state_as_hq,company_text,company_age,python_yn,spark_yn,cloud_yn,deployments_yn,viz_tools_yn,api_dev_yn,job_title_simplified,seniority,jd_length
0,3.8,501 to 1000 employees,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),0,NM,0,Tecolote Research,51,1,0,0,0,1,0,data scientist,na,2555


In [45]:
import json
prediction = model.predict(X[:1])[0]
Json_val = json.dumps({'response': prediction})

In [48]:
result = Json_val.json()
int(result['response'])

AttributeError: 'str' object has no attribute 'json'

In [None]:
grid_search.best_params_

{'randomforestregressor__ccp_alpha': 0.1,
 'randomforestregressor__max_features': 'sqrt',
 'randomforestregressor__min_samples_split': 5,
 'randomforestregressor__n_estimators': 40}