In [10]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer 
from scipy.sparse import hstack
from sklearn.linear_model import Ridge


In [2]:
# Load the training set from a CSV file in the current working directory.
df = pd.read_csv('salary-train.csv')

In [38]:
def text_transformation(text):
    """Lower-case a text Series and blank out every non-alphanumeric character.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        Lower-cased copy in which each character outside ``[a-zA-Z0-9]``
        (including whitespace and punctuation) is replaced by one space.
    """
    lowered = text.str.lower()
    # With regex=True, Series.replace substitutes inside each string element
    # instead of matching whole values.
    cleaned = lowered.replace(to_replace='[^a-zA-Z0-9]', value=' ', regex=True)
    return cleaned

In [39]:
# TF-IDF vectorizer for the job descriptions; min_df = 5 drops terms that
# occur in fewer than 5 documents.
vec_feat = TfidfVectorizer(min_df = 5)

In [40]:
# Fit the TF-IDF vocabulary on the normalised training descriptions and
# build the text feature matrix in one step.
X_train_text = vec_feat.fit_transform(text_transformation(df['FullDescription']))

In [41]:
# Per-column count of missing values in the training frame.
df.isna().sum()

FullDescription       0
LocationNormalized    0
ContractTime          0
SalaryNormalized      0
dtype: int64

In [42]:
# Replace missing categorical values with the string placeholder 'Nan' so
# they reach DictVectorizer as ordinary string categories.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: under pandas Copy-on-Write that call acts on a temporary
# object and leaves `df` unchanged (pandas 2.x emits a FutureWarning for it).
df['ContractTime'] = df['ContractTime'].fillna('Nan')
df['LocationNormalized'] = df['LocationNormalized'].fillna('Nan')

In [43]:
# Encode the two categorical columns with DictVectorizer, feeding it one
# {column: value} dict per training row.
dic_vec = DictVectorizer()
X_train_dic = dic_vec.fit_transform(
    df[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)


In [44]:
# Column-wise concatenation of the sparse text features and the sparse
# categorical features into a single design matrix.
X_train = hstack([X_train_text, X_train_dic])


In [45]:
# Regression target: the 'SalaryNormalized' column of the training frame.
y_train = df['SalaryNormalized']

In [46]:
# Ridge regression with regularisation strength alpha = 1 and a fixed
# random_state for reproducibility.
mdl = Ridge(alpha = 1, random_state = 241)
# fit() is left as the cell's last expression, so the fitted estimator is
# echoed as the cell output.
mdl.fit(X_train, y_train)

Ridge(alpha=1, random_state=241)

In [48]:
# Score the test sample with the transformers and model fitted on the
# training data.
test = pd.read_csv('salary-test-mini.csv')
# transform() (not fit_transform()) reuses the vocabulary learned on train.
X_test_text = vec_feat.transform(text_transformation(test['FullDescription']))
# Apply the same 'Nan' placeholder the training categoricals receive, so a
# missing value in the test set maps to the same category instead of being
# passed to DictVectorizer as a float NaN. fillna() returns a new frame, so
# `test` itself is left untouched.
X_test_dic = dic_vec.transform(
    test[['LocationNormalized', 'ContractTime']].fillna('Nan').to_dict('records')
)
X_test = hstack([X_test_text, X_test_dic])
# NOTE: despite the name, y_test holds the model's predictions.
y_test = mdl.predict(X_test)
# Report the first two predictions, rounded to two decimals.
print(f"{y_test[0]:.2f} {y_test[1]:.2f}")

56579.28 37135.31
