In [42]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

In [43]:
# Input files for this notebook, read from the current working directory.
TRAIN_CSV = 'salary-train.csv'
TEST_CSV = 'salary-test-mini.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# Preview the first rows of the test frame.
test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [44]:
def text_modify(data):
    """Return a cleaned copy of ``data``; the input frame is not modified.

    Steps applied to the copy:

    * ``FullDescription`` is lower-cased and every character that is not an
      ASCII letter or digit is replaced with a single space, which makes the
      later split into words easier.
    * Missing values in ``LocationNormalized`` and ``ContractTime`` are
      replaced with the literal string ``'nan'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``FullDescription``, ``LocationNormalized`` and
        ``ContractTime`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``.
    """
    out_data = data.copy()

    # The vectorised .str accessor propagates missing descriptions instead of
    # raising AttributeError the way a per-row `text.lower()` call would.
    out_data['FullDescription'] = (
        out_data['FullDescription']
        .str.lower()
        .str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)
    )

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is deprecated and does not
    # update the frame when pandas Copy-on-Write is enabled.
    for column in ('LocationNormalized', 'ContractTime'):
        out_data[column] = out_data[column].fillna('nan')

    return out_data
    

In [45]:
# Run the same preprocessing over both frames; the names are rebound to the
# cleaned copies returned by text_modify.
train, test = map(text_modify, (train, test))

In [30]:
# Turn the description texts into TF-IDF feature vectors.
# The vocabulary is learned on the training texts only and then reused
# unchanged for the test texts.
TF_IDF = TfidfVectorizer(min_df=5)
train_tf_idf = TF_IDF.fit_transform(train['FullDescription'])
test_tf_idf = TF_IDF.transform(test['FullDescription'])

In [48]:

# DictVectorizer and hstack are imported in the notebook's first cell, so
# they are not re-imported here.

# Categorical columns that are one-hot encoded; kept in one place so the
# train and test encodings cannot drift apart.
CATEGORICAL_COLUMNS = ['LocationNormalized', 'ContractTime']

# The encoder is fitted on the training records and reused as-is for the
# test records.
enc = DictVectorizer()
train_categ = enc.fit_transform(train[CATEGORICAL_COLUMNS].to_dict('records'))
test_categ = enc.transform(test[CATEGORICAL_COLUMNS].to_dict('records'))

# Place the TF-IDF block and the categorical block side by side, one row
# per sample.
X_train = hstack([train_tf_idf, train_categ])
X_test = hstack([test_tf_idf, test_categ])

In [50]:
# Targets are taken from the SalaryNormalized column of each frame.
y_train = train['SalaryNormalized']
y_test = test['SalaryNormalized']

# Ridge regression with a fixed seed so repeated runs give the same fit.
ridge_clf = Ridge(alpha=1, random_state=241)

# Left as the last expression so the cell displays the fitted estimator.
ridge_clf.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [55]:
# Display the predictions for the test rows as a plain list.
list(ridge_clf.predict(X_test))

[56555.61500154529, 37188.324426177816]

In [59]:
# Write the answer: test-set predictions rounded to two decimals, separated
# by single spaces, with no trailing newline.
answer = ' '.join(str(round(value, 2)) for value in ridge_clf.predict(X=X_test))

# The context manager closes the file even if the write raises.
with open('week4_1.txt', 'w') as output1:
    output1.write(answer)