In [88]:
import math

import pandas as pd
from sklearn.linear_model import LinearRegression

In [89]:
# The third-party `word2number` package (w2n.word_to_num) offers similar parsing.
def _build_numwords(table):
    """Fill ``table`` in place with the default word -> (scale, increment) pairs and return it."""
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    tens = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]

    # "and" is a filler word ("one hundred and five"): multiply by 1, add 0.
    table["and"] = (1, 0)
    for idx, word in enumerate(units):
        table[word] = (1, idx)
    # "twenty" is the 2nd multiple of ten, hence start=2.
    for idx, word in enumerate(tens, start=2):
        table[word] = (1, idx * 10)
    # idx 0 ("hundred") -> 10**2; every later scale -> 10**(3 * idx).
    for idx, word in enumerate(scales):
        table[word] = (10 ** (idx * 3 or 2), 0)
    return table


# Built once at definition time instead of hiding a cache in a mutable default argument.
_DEFAULT_NUMWORDS = _build_numwords({})


def parse_int(textnum, numwords=None):
    """Convert an English number phrase (e.g. "twenty-one") to an int.

    Parameters
    ----------
    textnum : str
        Words separated by whitespace and/or hyphens. Matching is
        case-insensitive. An empty string yields 0.
    numwords : dict, optional
        Mapping of word -> (scale, increment). When omitted, the built-in
        English table is used. An empty mapping passed by the caller is
        populated in place with the built-in table (legacy behaviour).

    Returns
    -------
    int
        The parsed value.

    Raises
    ------
    ValueError
        If a word is not present in the lookup table.
    """
    if numwords is None:
        numwords = _DEFAULT_NUMWORDS
    elif not numwords:
        _build_numwords(numwords)

    current = result = 0
    for word in textnum.replace("-", " ").split():
        # Try the word verbatim first so custom tables with mixed-case keys
        # keep working, then fall back to the lower-cased form.
        key = word if word in numwords else word.lower()
        if key not in numwords:
            raise ValueError("Illegal word: " + word)

        scale, increment = numwords[key]
        current = current * scale + increment

        # A scale above "hundred" closes the current group (thousands,
        # millions, ...): bank it and start a fresh group.
        if scale > 100:
            result += current
            current = 0

    return result + current

In [90]:
# Read hr.csv (path is relative to the notebook's working directory) into a
# DataFrame and display it so the raw values can be inspected before cleaning.
df = pd.read_csv('./hr.csv')
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [91]:
# Treat a missing experience value as the word 'zero', then convert every
# number word to an integer with parse_int.
# The isinstance guard keeps this cell safe to re-run: after the first run the
# column already holds integers, which parse_int cannot process.
df['experience'] = (
    df['experience']
    .fillna('zero')
    .apply(lambda value: parse_int(value) if isinstance(value, str) else value)
)
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [92]:
# Fill missing test scores with the column mean rounded down to a whole number.
# NOTE(review): despite its name, testScoreMedian holds floor(mean), not the
# median; the name is kept so any later reference to it keeps working.
testScoreMedian = math.floor(df.test_score.mean())
df.test_score = df.test_score.fillna(testScoreMedian)
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


In [93]:
# Fit a linear model that predicts salary from the three cleaned predictor columns.
linearRegression = LinearRegression()
linearRegression.fit(
    X=df.loc[:, ['experience', 'test_score', 'interview_score']],
    y=df['salary'],
)

LinearRegression()

In [94]:
# Pass the candidates as a DataFrame carrying the same column names used in
# fit, so each value is tied to its feature explicitly and scikit-learn does
# not warn about missing feature names.
linearRegression.predict(
    pd.DataFrame(
        [[2, 9, 6], [12, 10, 10]],
        columns=['experience', 'test_score', 'interview_score'],
    )
)

array([53713.86677124, 93747.79628651])

In [95]:
# Show the fitted coefficients (one per feature column, in the order passed to
# fit) together with the intercept.
print(dict(
    coeff=linearRegression.coef_,
    intercept=linearRegression.intercept_,
))

{'coeff': array([2922.26901502, 2221.30909959, 2147.48256637]), 'intercept': 14992.65144669314}
