In [342]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

# Creating a dataframe from the dataset
df  = pd.read_csv('survey_results_public.csv')

In [343]:
# Creating a dataframe for specific use case, choosing all applicable columns

df = df[['EdLevel', 'Employment', 'RemoteWork','YearsCode','DevType','Country','CompTotal','AISent','WorkExp']]
df.head()

Unnamed: 0,EdLevel,Employment,RemoteWork,YearsCode,DevType,Country,CompTotal,AISent,WorkExp
0,,,,,,,,,
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time",Remote,18.0,"Senior Executive (C-Suite, VP, etc.)",United States of America,285000.0,Indifferent,10.0
2,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time","Hybrid (some remote, some in-person)",27.0,"Developer, back-end",United States of America,250000.0,,23.0
3,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time","Hybrid (some remote, some in-person)",12.0,"Developer, front-end",United States of America,156000.0,,7.0
4,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time;Independent contractor, fr...",Remote,6.0,"Developer, full-stack",Philippines,1320000.0,Very favorable,6.0


In [344]:
# Remvoing the values where the salary is NAN
df = df[df.CompTotal.notnull()]
df.head()

Unnamed: 0,EdLevel,Employment,RemoteWork,YearsCode,DevType,Country,CompTotal,AISent,WorkExp
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time",Remote,18,"Senior Executive (C-Suite, VP, etc.)",United States of America,285000.0,Indifferent,10.0
2,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time","Hybrid (some remote, some in-person)",27,"Developer, back-end",United States of America,250000.0,,23.0
3,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time","Hybrid (some remote, some in-person)",12,"Developer, front-end",United States of America,156000.0,,7.0
4,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Employed, full-time;Independent contractor, fr...",Remote,6,"Developer, full-stack",Philippines,1320000.0,Very favorable,6.0
5,Some college/university study without earning ...,"Employed, full-time",Remote,21,"Developer, back-end",United Kingdom of Great Britain and Northern I...,78000.0,Favorable,22.0


In [345]:
# Getting an information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48225 entries, 1 to 89183
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EdLevel     48225 non-null  object 
 1   Employment  48213 non-null  object 
 2   RemoteWork  48126 non-null  object 
 3   YearsCode   48155 non-null  object 
 4   DevType     48089 non-null  object 
 5   Country     48225 non-null  object 
 6   CompTotal   48225 non-null  float64
 7   AISent      33260 non-null  object 
 8   WorkExp     32723 non-null  float64
dtypes: float64(2), object(7)
memory usage: 3.7+ MB


In [346]:
# Dropping further rows where the value is NAN, since we have sufficient data points, this step can be performed
df = df.dropna()
df.isnull().sum()

EdLevel       0
Employment    0
RemoteWork    0
YearsCode     0
DevType       0
Country       0
CompTotal     0
AISent        0
WorkExp       0
dtype: int64

In [347]:
# Only selecting the values where the employment value is "full time"
df = df[df.Employment == 'Employed, full-time']
df = df.drop("Employment", axis=1)

In [348]:
# Finding the counts for each country
df.Country.value_counts().values

array([4636, 1447, 1291, 1118,  863,  571,  563,  495,  446,  440,  381,
        375,  374,  232,  215,  206,  200,  184,  183,  174,  173,  164,
        158,  153,  149,  144,  144,  137,  134,  134,  133,  129,  115,
        113,  113,  106,   98,   94,   93,   89,   83,   80,   80,   79,
         73,   70,   64,   62,   61,   61,   55,   54,   53,   51,   50,
         48,   46,   45,   40,   40,   40,   37,   35,   35,   34,   30,
         28,   28,   23,   23,   20,   19,   18,   18,   16,   16,   15,
         15,   15,   15,   13,   13,   13,   12,   12,   12,   12,   12,
         11,   11,   10,   10,   10,   10,   10,    9,    9,    9,    8,
          8,    8,    7,    7,    6,    6,    6,    5,    5,    5,    5,
          5,    4,    4,    4,    4,    4,    3,    3,    3,    3,    3,
          3,    3,    3,    3,    3,    3,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,   

In [349]:
# We determine to drop the countries where data points is 1, or below a certain threshold
def country_threshold(counts, th):
    th_map = {}  
    for i in range(len(counts)):
        if counts.values[i] >= th:
            th_map[counts.index[i]] = counts.index[i]
        else:
            th_map[counts.index[i]] = 'Other'
    return th_map

In [350]:
# The country map returns the new dict with relevant values as per the function
country_map = country_threshold(df.Country.value_counts(), 95)

In [351]:
# The values are mapped to the country column
df['Country'] = df.Country.map(country_map)
df.Country.value_counts().index

Index(['United States of America', 'Other', 'Germany',
       'United Kingdom of Great Britain and Northern Ireland', 'India',
       'Canada', 'Brazil', 'France', 'Spain', 'Australia', 'Netherlands',
       'Poland', 'Sweden', 'Italy', 'Israel', 'Switzerland', 'Portugal',
       'Norway', 'Denmark', 'Turkey', 'Austria', 'Finland', 'Mexico',
       'South Africa', 'Belgium', 'New Zealand', 'Czech Republic', 'Colombia',
       'Pakistan', 'Greece', 'Ukraine', 'Russian Federation', 'Romania',
       'Iran, Islamic Republic of...', 'Ireland', 'Hungary', 'Argentina',
       'Bangladesh'],
      dtype='object')

In [352]:
# Salary expectation is that employed and full-time should be greater than 10,000
df = df[df['CompTotal'] >= 10000]
df.head()

Unnamed: 0,EdLevel,RemoteWork,YearsCode,DevType,Country,CompTotal,AISent,WorkExp
1,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Remote,18,"Senior Executive (C-Suite, VP, etc.)",United States of America,285000.0,Indifferent,10.0
5,Some college/university study without earning ...,Remote,21,"Developer, back-end",United Kingdom of Great Britain and Northern I...,78000.0,Favorable,22.0
6,Some college/university study without earning ...,Remote,4,"Developer, full-stack",United States of America,135000.0,Unfavorable,4.0
7,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Remote,5,"Developer, full-stack",United States of America,80000.0,Favorable,5.0
13,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Hybrid (some remote, some in-person)",5,"Developer, QA or test",United States of America,150000.0,Favorable,10.0


In [353]:
# Performing cleaning for the years of experience, educational level
def exp_clean(x):
    if x == 'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

df['YearsCode'] = df['YearsCode'].apply(exp_clean)

In [354]:
df['EdLevel'].unique()

def clean_ed(x):
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    if 'Professional degree' or 'Associate degree':
        return 'Post grad'
    return 'Less than Bachelors'

df['EdLevel'] = df['EdLevel'].apply(clean_ed)

In [None]:
df = df[]

In [360]:
from sklearn.preprocessing import LabelEncoder
l_edlevel = LabelEncoder()
l_remote = LabelEncoder()
l_years = LabelEncoder()
l_DevType = LabelEncoder()
l_Country  = LabelEncoder()
l_AISent = LabelEncoder()

df['EdLevel'] = l_edlevel.fit_transform(df['EdLevel'])
df['RemoteWork'] = l_remote.fit_transform(df['RemoteWork'])
df['YearsCode'] = l_years.fit_transform(df['YearsCode'])
df['DevType'] = l_DevType.fit_transform(df['DevType'])
df['Country'] = l_Country.fit_transform(df['Country'])
df['AISent'] = l_AISent.fit_transform(df['AISent'])

In [361]:
# Preparing the training and testing datasets
X = df.drop("CompTotal", axis=1)
y = df['CompTotal']

In [362]:
X

Unnamed: 0,EdLevel,RemoteWork,YearsCode,DevType,Country,AISent,WorkExp
1,0,2,18,30,37,1,10.0
5,2,2,21,11,36,0,22.0
6,2,2,4,15,37,2,4.0
7,0,2,5,15,37,0,5.0
13,1,0,5,10,37,0,10.0
...,...,...,...,...,...,...,...
89174,0,1,5,15,25,1,2.0
89175,0,2,10,17,37,0,8.0
89177,1,2,17,15,20,1,12.0
89178,0,2,25,15,37,0,22.0


In [363]:
# Importing the libraies to prepare train and test datasets, linearregression model and mean squared error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
linear_reg = LinearRegression()

# Fitting the training data in the linear regression model
linear_reg.fit(X_train, y_train)

#predicting the values
y_pred = linear_reg.predict(X_test)

In [364]:
# calculting the model's performance
mse = mean_squared_error(y_test, y_pred)
mse

2.36232525326917e+18

In [365]:
# Decicison Tree Regressor
from sklearn.tree import DecisionTreeRegressor
d_reg = DecisionTreeRegressor()
d_reg.fit(X_train, y_train)
y_pred = d_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mse

1.7255518387002576e+17

In [366]:
ND = np.array([['Master’s degree', 'Remote', 4.0,'Project manager',  'India',  'Favorable', 8.0]])
ND

array([['Master’s degree', 'Remote', '4.0', 'Project manager', 'India',
        'Favorable', '8.0']], dtype='<U32')

In [367]:
ND[:, 0] = l_edlevel.transform(ND[:, 0])
ND[:, 1] = l_remote.transform(ND[:, 1])
ND[:, 3] = l_DevType.transform(ND[:, 3])
ND[:, 4] = l_Country.transform(ND[:, 4])
ND[:, 5] = l_AISent.transform(ND[:, 5])

# Ensure the numeric features are correctly typed
ND[:, 2] = ND[:, 2].astype(float)
ND[:, 6] = ND[:, 6].astype(float)

In [368]:
ND_pred = d_reg.predict(ND)
ND_pred



array([1800000.])

In [374]:
# we need to save the model to be able to use it again
import pickle
data = {"model": d_reg, "l_edlevel":l_edlevel,"l_remote":l_remote,"l_years":l_years,"l_DevType":l_DevType, "l_Country":l_Country,"l_AISent":l_AISent}
with open('saved_model.pkl', 'wb') as file:
    pickle.dump(data, file)

In [375]:
# Loading the model to be used again
with open('saved_model.pkl', 'rb') as file:
    data = pickle.load(file)

    
model_loaded = data['model']
l_edlevel=data['l_edlevel']
l_remote=data['l_remote']
l_years=data['l_years']
l_DevType=data['l_DevType']
l_Country=data['l_Country']
l_AISent=data['l_AISent']

In [371]:
data

{'model': LinearRegression(),
 'l_edlevel': LabelEncoder(),
 'l_remote': LabelEncoder(),
 'l_years': LabelEncoder(),
 'l_DevType': LabelEncoder(),
 'l_Country': LabelEncoder(),
 'l_AISent': LabelEncoder()}