In [136]:
# Belajar membuat multiple prediksi menggunkan scikit learn
# fungsi word2number adalah mengubah string menjadi angka

import pandas as pd 
from sklearn import linear_model
from word2number import w2n

df = pd.read_csv("hiring.csv")
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [137]:
# mengubah nama kolom agar lebih mudah di baca

df = df.rename(columns={
    "test_score(out of 10)" : "score",
    "interview_score(out of 10)" : "interview",
    "salary($)" : "salary"
        })
df

Unnamed: 0,experience,score,interview,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [138]:
# mencari nilai yang kosong pada kolom score dengan mencari median

import math
new_score = math.floor(df["score"].median())
new_score

8

In [139]:
# lalu menambahkan nilai yang baru tersebut menggunakan metode fillna

df["score"] = df["score"].fillna(new_score)
df

Unnamed: 0,experience,score,interview,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [140]:
df["experience"]

0       NaN
1       NaN
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [141]:
# menambahkan nilai yang kosong tersebut menjadi zero menggunakan metode fillna

df = df.fillna("zero")
df

Unnamed: 0,experience,score,interview,salary
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,8.0,7,72000
7,eleven,7.0,8,80000


In [142]:
# mengubah nilai string pada kolom experience menjadi angka menggunakan metode dari library word2number

df["experience"] = df["experience"].apply(w2n.word_to_num)
df

Unnamed: 0,experience,score,interview,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


In [143]:
# mengubah nilai pada kolom score dari float menjadi int menggunakan metode astype

df["score"] = df["score"].astype(int)
df

Unnamed: 0,experience,score,interview,salary
0,0,8,9,50000
1,0,8,6,45000
2,5,6,7,60000
3,2,10,10,65000
4,7,9,6,70000
5,3,7,10,62000
6,10,8,7,72000
7,11,7,8,80000


In [144]:
# membuat model prediksi nya

model = linear_model.LinearRegression()
model.fit(df[["experience", "score", "interview"]], df["salary"])

In [145]:
# setelah membuat prediksinya kita membuat dataframe baru untuk ditambahkan ke dataframe lama kita

a = model.predict([[2, 9, 6]])
b = model.predict([[12,10,10]])

final = pd.DataFrame({
     "experience" : [2, 12],
     "score"      : [9, 10],
     "interview"  : [6,10], 
     "salary"     : [a,b]
})

df = pd.concat([df,final], ignore_index=True)
df



Unnamed: 0,experience,score,interview,salary
0,0,8,9,50000
1,0,8,6,45000
2,5,6,7,60000
3,2,10,10,65000
4,7,9,6,70000
5,3,7,10,62000
6,10,8,7,72000
7,11,7,8,80000
8,2,9,6,[53205.96797671032]
9,12,10,10,[92002.18340611355]


In [146]:
# lalu kita ubah kolom salary menjadi int karena ada salah satu data yang float

df["salary"] = df["salary"].astype(int)
df

Unnamed: 0,experience,score,interview,salary
0,0,8,9,50000
1,0,8,6,45000
2,5,6,7,60000
3,2,10,10,65000
4,7,9,6,70000
5,3,7,10,62000
6,10,8,7,72000
7,11,7,8,80000
8,2,9,6,53205
9,12,10,10,92002
