In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the survey extract; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="survey_results_public_new.csv")

In [2]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,Degree,Experience_Years,Languages,Salary
0,Bachelor's degree,21,"C++, Ruby",1562928
1,PhD,1,"Java, C++",1861572
2,PhD,28,"C++, PHP",2525904
3,Bachelor's degree,30,"JavaScript, Python",1766100
4,Less than a Bachelors,22,"Python, C++",1181244


In [3]:
# Keep only the three predictor columns plus the Salary target, in a fixed order.
model_columns = ["Degree", "Experience_Years", "Languages", "Salary"]
df = df.loc[:, model_columns]
df.head()

Unnamed: 0,Degree,Experience_Years,Languages,Salary
0,Bachelor's degree,21,"C++, Ruby",1562928
1,PhD,1,"Java, C++",1861572
2,PhD,28,"C++, PHP",2525904
3,Bachelor's degree,30,"JavaScript, Python",1766100
4,Less than a Bachelors,22,"Python, C++",1181244


In [4]:
# Rows without a salary cannot be used as training targets, so drop them.
has_salary = df["Salary"].notna()
df = df.loc[has_salary]
df.head()

Unnamed: 0,Degree,Experience_Years,Languages,Salary
0,Bachelor's degree,21,"C++, Ruby",1562928
1,PhD,1,"Java, C++",1861572
2,PhD,28,"C++, PHP",2525904
3,Bachelor's degree,30,"JavaScript, Python",1766100
4,Less than a Bachelors,22,"Python, C++",1181244


In [5]:
# Summarise column dtypes and non-null counts after the salary filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Degree            2000 non-null   object
 1   Experience_Years  2000 non-null   int64 
 2   Languages         2000 non-null   object
 3   Salary            2000 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 62.6+ KB


In [6]:
# Remove every row that still has a missing value in any column,
# then show the per-column null count as confirmation.
df = df.dropna(how="any")
df.isna().sum()

Degree              0
Experience_Years    0
Languages           0
Salary              0
dtype: int64

In [7]:
# Inspect the distinct Experience_Years values before cleaning them.
df["Experience_Years"].unique()

array([21,  1, 28, 30, 22, 32, 41, 12, 20, 42, 13,  0, 44, 45,  9,  8, 36,
        4, 17, 47, 37,  7, 33, 18, 50, 31, 34, 35, 40,  6, 16, 48,  3, 29,
       23, 25, 11,  5, 19, 49, 15, 38, 14, 24,  2, 46, 26, 27, 43, 10, 39],
      dtype=int64)

In [8]:
def clean_experience(x):
    """Convert a years-of-experience survey answer into a number.

    The two textual answers are mapped to fixed values
    ('More than 50 years' -> 50, 'Less than 1 year' -> 0.5);
    anything else is passed through ``float``.
    """
    textual_answers = (
        ('More than 50 years', 50),
        ('Less than 1 year', 0.5),
    )
    for answer, years in textual_answers:
        if x == answer:
            return years
    return float(x)

# Normalise every experience answer to a numeric value, element by element.
df['Experience_Years'] = df['Experience_Years'].map(clean_experience)

In [9]:
# Inspect the distinct Degree labels present in the data.
df["Degree"].unique()

array(["Bachelor's degree", 'PhD', 'Less than a Bachelors',
       "Master's degree"], dtype=object)

In [10]:
def clean_education(x):
    """Collapse a free-text degree label into one of four education buckets.

    Args:
        x: Degree label as a string (e.g. "Bachelor's degree", "PhD").

    Returns:
        One of 'Bachelors', 'Masters', 'Post grad' or 'Less than a Bachelors'.
    """
    # Must be tested first: "Less than a Bachelors" also contains the
    # substring "Bachelor" and would otherwise be reported as 'Bachelors'.
    if 'Less than' in x:
        return 'Less than a Bachelors'
    if 'Bachelor' in x:
        return 'Bachelors'
    if 'Master' in x:
        return 'Masters'
    # "PhD" is the doctoral label that appears in this dataset; without it
    # doctorate holders fall through to the lowest bucket.
    if 'Professional degree' in x or 'Other doctoral' in x or 'PhD' in x:
        return 'Post grad'
    return 'Less than a Bachelors'


# df['Degree'] = df['Degree'].apply(clean_education)

In [11]:
# Re-check the Degree labels; they are unchanged here because the
# clean_education apply step above is commented out.
df["Degree"].unique()

array(["Bachelor's degree", 'PhD', 'Less than a Bachelors',
       "Master's degree"], dtype=object)

In [12]:
from sklearn.preprocessing import LabelEncoder
# One encoder per feature column; each is kept so it can be saved with the model.
le_education = LabelEncoder()
Years = LabelEncoder()
lang = LabelEncoder()

# Replace each column with its integer codes, fitting the matching encoder.
for column, encoder in (
    ("Degree", le_education),
    ("Experience_Years", Years),
    ("Languages", lang),
):
    df[column] = encoder.fit_transform(df[column])

# Show the codes produced for the language combinations.
df["Languages"].unique()

array([ 2,  5,  1, 13, 20,  6,  4, 22, 11, 10,  8,  9, 12, 14, 26, 17, 16,
       27,  3, 23, 15, 24, 18, 19,  0, 25, 21,  7])

In [13]:
from sklearn.preprocessing import LabelEncoder

# NOTE(review): this rebinds le_education, discarding the encoder that was
# fitted on df["Degree"] earlier. The labels below are the clean_education
# buckets, not the raw labels the model was trained on, and this is the
# encoder that gets pickled later -- confirm that downstream code passes
# these bucket names and that the resulting codes line up with training.
le_education = LabelEncoder()
education_levels = ["Less than a Bachelors", "Bachelors", "Masters", "Post grad"]
le_education.fit(education_levels)


In [14]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns=["Salary"]).values
# Target vector, kept as a pandas Series.
y = df.loc[:, "Salary"]

In [15]:
# Confirm that every column is numeric after label encoding.
print(df.dtypes)

Degree              int32
Experience_Years    int64
Languages           int32
Salary              int64
dtype: object


In [16]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares fitted on the full dataset.
linear_reg = LinearRegression()
linear_reg.fit(X, y.to_numpy())

In [17]:
# Predict on the same X the model was fitted on, so these are in-sample predictions.
y_pred = linear_reg.predict(X)

In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# In-sample RMSE of the linear regression baseline.
error = np.sqrt(mean_squared_error(y, y_pred))
# Report it here: `error` is overwritten by the next model's evaluation, so
# without this print the baseline figure is never shown. The format matches
# the decision-tree and random-forest cells.
print("${:,.02f}".format(error))

In [19]:
from sklearn.tree import DecisionTreeRegressor
# Unconstrained decision tree; the seed is fixed so the fit is repeatable.
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y.to_numpy())

In [20]:
# Predict on the same X the tree was fitted on, so these are in-sample predictions.
y_pred = dec_tree_reg.predict(X)

In [21]:
# RMSE of the decision tree, measured on the rows it was trained on.
# NOTE(review): a value of 0 here indicates memorisation, not accuracy.
error = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
print(f"${error:,.02f}")

$0.00


In [22]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters and a fixed seed.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X, y.to_numpy())

In [23]:
# Predict on the same X the forest was fitted on, so these are in-sample predictions.
y_pred = random_forest_reg.predict(X)

In [24]:
# RMSE of the random forest, measured on the rows it was trained on.
error = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
print(f"${error:,.02f}")

$15,366.85


In [25]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: unlimited, then every even depth from 2 to 12.
max_depth = [None] + list(range(2, 13, 2))
parameters = dict(max_depth=max_depth)

# Cross-validated search over max_depth, scored by negative MSE.
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
gs.fit(X, y.values)

In [26]:
# Take the depth-tuned tree selected by the grid search.
regressor = gs.best_estimator_

# Fit it on the full dataset and report its in-sample RMSE.
regressor.fit(X, y.to_numpy())
y_pred = regressor.predict(X)
error = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
print(f"${error:,.02f}")

$38,095.97


In [27]:
# Display the encoded feature matrix (columns: Degree, Experience_Years, Languages).
X

array([[ 0, 21,  2],
       [ 3,  1,  5],
       [ 3, 28,  1],
       ...,
       [ 0, 39,  9],
       [ 1, 43, 15],
       [ 0,  2,  1]], dtype=int64)

In [28]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
# Hold out 20% of the rows so the model is scored on data it was not fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit once on the training split (the original cell repeated this call).
# NOTE(review): max_depth was selected by a grid search run on all of X,
# which includes these test rows, so the scores below may be optimistic.
regressor.fit(X_train, y_train)
y_pred_test = regressor.predict(X_test)

# Coefficient of determination on the held-out rows.
r2_test = r2_score(y_test, y_pred_test)
print(f"Test R-squared Score: {r2_test * 100:.2f}%")

# RMSE expressed relative to the mean held-out salary.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
mean_y_test = np.mean(y_test)
rmse_percentage = (rmse_test / mean_y_test) * 100
print(f"Test RMSE: {rmse_percentage:.2f}% of the average salary")

Test R-squared Score: 99.43%
Test RMSE: 2.37% of the average salary


In [29]:
import pickle

In [30]:
# Bundle the fitted model with the three encoders so they are saved together.
data = dict(
    model=regressor,
    Degree=le_education,
    Experience_Years=Years,
    Languages=lang,
)
with open('saved_steps.pkl', mode='wb') as file:
    pickle.dump(data, file)

In [31]:
# Read back the bundle written above. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open('saved_steps.pkl', mode='rb') as file:
    data = pickle.load(file)

# Unpack the model and the three encoders from the bundle.
regressor_loaded, le_education, Years, lang = (
    data[key] for key in ("model", "Degree", "Experience_Years", "Languages")
)

In [32]:
# Predict with the model restored from the pickle, not the in-memory one,
# so this cell actually checks that the saved artefact works.
y_pred = regressor_loaded.predict(X)
y_pred

array([1524070.2       , 1907270.51162791, 2532988.        , ...,
       1925278.76923077, 1668311.46666667, 1089469.5       ])