In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
# maybe more imports
from dotenv import load_dotenv
from pymongo import MongoClient
import pandas as pd
import os
import pickle

In [2]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Read the MongoDB connection settings from the environment.
mongodb_uri = os.getenv('MONGODB_DB_URL')
db_name = os.getenv('MONGO_DB')
collection_name = os.getenv('MONGO_DB_COLLECTION')

# Fail fast with a readable message: os.getenv returns None for unset variables,
# and indexing the client/database with a None or empty name otherwise raises an
# opaque pymongo error further down.
missing_settings = [
    env_name
    for env_name, env_value in (
        ('MONGO_DB', db_name),
        ('MONGO_DB_COLLECTION', collection_name),
    )
    if not env_value
]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_settings)
    )

# Connect using the URI taken from the environment.
# NOTE: if MONGODB_DB_URL is unset, mongodb_uri is None and pymongo falls back to
# its default host (a local instance), so the URI is deliberately not required.
client = MongoClient(mongodb_uri)

# select db (MongoDB creates it lazily on first write if it does not exist)
db = client[db_name]

# select collection (likewise created lazily if it does not exist)
collection = db[collection_name]

In [3]:
# Query every document, keeping only the fields used for training and dropping
# Mongo's internal _id.
data = collection.find({}, projection={'jobRole': True, 'location': True, 'salary': True, '_id': False})

# Materialise the cursor once so the result can be validated before use.
records = list(data)

# An empty collection would produce a DataFrame with no columns, which only fails
# later (and cryptically) when the categorical columns are encoded.
if not records:
    raise ValueError(
        'No documents found in the MongoDB collection; '
        'populate the database before training the model.'
    )

# create a dataframe and preview the first rows
df = pd.DataFrame(records)
df.head()


Unnamed: 0,jobRole,location,salary
0,Applications developer,Sandrastad,35617
1,"Buyer, retail",West Shaneside,91927
2,"Therapist, occupational",Larsenfort,67077
3,Conference centre manager,Erinton,139492
4,Systems Manager,Patrickburgh,123315


In [4]:
# One-hot encode the two categorical columns; each distinct job role and
# location becomes its own boolean indicator column, while salary is untouched.
df = pd.get_dummies(
    data=df,
    columns=['jobRole', 'location'],
)
display(df)

Unnamed: 0,salary,jobRole_Applications developer,"jobRole_Buyer, retail",jobRole_Conference centre manager,jobRole_Geoscientist,jobRole_HR,"jobRole_Research officer, trade union",jobRole_Retail buyer,jobRole_Retail manager,jobRole_Speech and language therapist,...,location_Erinton,location_Katieside,location_Kevinburgh,location_Lake Cheryl,location_Lake Kathrynfurt,location_Larsenfort,location_New Faith,location_Patrickburgh,location_Sandrastad,location_West Shaneside
0,35617,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,91927,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,67077,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,139492,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,123315,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,62367,False,False,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
1996,78618,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
1997,149940,False,False,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1998,138519,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [5]:
# Target: the salary column.
y = df['salary']

# Features: every remaining column of the frame.
X = df.drop(columns='salary')

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

# Train on the training split only; the test split stays held out.
model.fit(X=X_train, y=y_train)

In [8]:
# Predict salaries for the held-out rows.
y_pred = model.predict(X=X_test)

# Score the predictions with RMSE (same units as salary) and report it.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 34184.55933872046


In [9]:
# Persist the training artifacts with pickle, in this order:
#   columns.pkl - the encoded feature column labels (X.columns)
#   model.pkl   - the fitted regression model
artifacts = {
    'columns.pkl': X.columns,
    'model.pkl': model,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)

## TODO: move this training code into its own file and run it right after fill_db