In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn import linear_model

COMPANY = 'Cisco'
POSITION = 'Software Engineer'

salary_data = pd.read_csv('salaryData.csv')
company_data = salary_data[salary_data.company == COMPANY]
company_data = company_data[(company_data.title == POSITION) & (company_data.yearsofexperience <= 5)]

In [2]:
company_data.tail()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
56441,6/28/2021 16:04:28,Cisco,Grade 6,Software Engineer,97.0,"Durham, NC",3.0,2.0,Security,90.0,0.0,7.0,,Title: Software Engineer,9606,560.0,75766
56597,6/29/2021 19:37:22,Cisco,Grade 4,Software Engineer,87.0,"Seattle, WA",0.0,0.0,Web Development (Front-End),80.0,0.0,7.0,,"Remote, Title: Software Engineer 1",11527,819.0,75969
57000,7/2/2021 10:22:52,Cisco,Grade 10,Software Engineer,200.0,"Durham, NC",4.0,1.0,Networking,145.0,30.0,20.0,,Title: Software Engineer,9606,560.0,76543
57281,7/6/2021 6:39:55,Cisco,Grade 8,Software Engineer,30.0,"Bengaluru, KA, India",5.0,4.0,API Development (Back-End),22.0,4.0,3.0,Female,"Title: Software Engineer, Race: Asian, Academi...",42498,0.0,76927
57769,7/10/2021 17:43:25,Cisco,Grade 8,Software Engineer,189.0,"San Jose, CA",4.0,2.0,Networking,152.0,19.0,18.0,Male,"Title: Software Engineer 3, Race: Asian, Acade...",7422,807.0,77580


# Basic Least Squares Regression
Features:
- Years of experience
- Years at company

In [3]:
company_features = company_data[['yearsofexperience', 'yearsatcompany']]
company_values = company_data['totalyearlycompensation']

In [4]:
lsr = linear_model.LinearRegression()
lsr.fit(company_features, company_values)

LinearRegression()

In [5]:
lsr.coef_

array([15.81819943, -3.75096695])

In [6]:
lsr.predict([(0,0),(1, 1),(1,0)])

array([101.41523199, 113.48246447, 117.23343142])

# Advanced Least Squares Regression
Features:
- Years of experience
- Years at company
- Location transposed indicator
- Level transposed indicator
- Gender transposed indicator

In [7]:
import re

company_features = company_data[['yearsofexperience', 'yearsatcompany']]
company_values = company_data['totalyearlycompensation']

# transpose location
locations = pd.unique(company_data.location)
for loc in locations:
    new_loc = 'location_' + re.sub('[ ]', '_', re.sub('[,]','', loc))
    company_features.insert(0, new_loc, company_data.location == loc)

# transpose level
levels = pd.unique(company_data.level)
for lev in levels:
    new_level = 'level_' + re.sub('[ ]', '_', lev)
    company_features.insert(0, new_level, company_data.level == lev)

# transpose gender
genders = pd.unique(company_data.gender)
for gen in genders:
    new_gender = 'gender_' + re.sub('[ ]', '_', str(gen))
    company_features.insert(0, new_gender, company_data.gender == gen)
    
company_features.tail()

Unnamed: 0,gender_Other,gender_Female,gender_Male,gender_nan,level_G10,level_G4,level_006,level_L2,level_L10,level_New_Grad_,...,location_Raleigh_NC,location_Durham_NC,location_Dallas_TX,location_Boston_MA,location_Milpitas_CA,location_Seattle_WA,location_Vancouver_BC_Canada,location_San_Jose_CA,yearsofexperience,yearsatcompany
56441,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,3.0,2.0
56597,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,0.0,0.0
57000,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,4.0,1.0
57281,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,5.0,4.0
57769,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,4.0,2.0


In [8]:
def transform_input(city, state, country, gender, yoe, yac, level):
    location_val = f'location_{city}_{state}_{country}' if country else f'location_{city}_{state}'
    gender_val = f'gender_{gender}'
    level_val = f'level_{level}'
    
    base_row = company_features.iloc[0].copy(deep=True)
    for col in base_row.index:
        if re.match('gender_|location_|level_', col):
            base_row[col] = False
    base_row[location_val] = True
    base_row[level_val] = True
    base_row[gender_val] = True
    
    base_row['yearsofexperience'] = yoe
    base_row['yearsatcompany'] = yac
    
    return base_row

In [9]:
lsr = linear_model.LinearRegression()
lsr.fit(company_features, company_values)
lsr.coef_

array([-1.78924168e+01, -8.82362045e+00,  4.83416640e-01, -1.63442245e+11,
        1.93263689e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.93263689e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.93263689e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.02304289e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.93263689e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.93263689e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.93263689e+12,  1.93263689e+12,  1.93263689e+12,  1.93263689e+12,
        1.93263689e+12, -8.96164441e+11, -8.96164441e+11, -8.96164441e+11,
       -8.96164441e+11, -8.96164441e+11, -8.96164441e+11, -8.96164441e+11,
        1.34295613e+10, -8.96164441e+11, -8.96164441e+11, -8.96164441e+11,
       -8.96164441e+11, -8.96164441e+11, -8.96164441e+11, -8.96164441e+11,
       -8.96164441e+11, -8.96164441e+11, -8.96164441e+11, -8.96164441e+11,
       -8.96164441e+11, -

In [10]:
c1 = transform_input('San_Jose', 'CA', None, 'Male', 10, 10, 'L10')
c2 = transform_input('San_Jose', 'CA', None, 'Male', 1, 1, 'L6')
#c1.index
lsr.predict([c1, c2])

  return f(*args, **kwargs)


array([182.33532715, 128.93347168])

In [11]:
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(company_features, company_values)

Lasso(alpha=0.1)

In [12]:
lasso.predict([c1,c2])

  return f(*args, **kwargs)


array([179.71124508, 137.65975897])

In [13]:
bayes = linear_model.BayesianRidge()
bayes.fit(company_features, company_values)

BayesianRidge()

In [14]:
bayes.predict([c1,c2])

  return f(*args, **kwargs)


array([184.85947152, 131.75077806])