In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

In [2]:
# Load the salary training set (path is relative to the working directory).
# The frame is mutated in place by the following cleaning cells.
data = pd.read_csv('data/salary-train.csv')

In [3]:
# Preview the first rows of the raw training frame.
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [4]:
# Normalise case in the free-text and location columns so that values that
# differ only by capitalisation end up identical.
for text_column in ('FullDescription', 'LocationNormalized'):
    data[text_column] = data[text_column].str.lower()

In [5]:
# Replace every character that is not an ASCII letter or digit with a space,
# leaving only whitespace-separated alphanumeric tokens in the descriptions.
non_alphanumeric = '[^a-zA-Z0-9]'
data['FullDescription'] = data['FullDescription'].replace(
    to_replace=non_alphanumeric, value=' ', regex=True
)

In [6]:
# Check the effect of lower-casing and punctuation stripping.
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [7]:
# TF-IDF vectoriser for the job descriptions; min_df=5 ignores terms that
# appear in fewer than 5 documents. Despite the name `cv`, this is a
# TfidfVectorizer, not a CountVectorizer.
cv = TfidfVectorizer(min_df=5)

In [8]:
# Learn vocabulary and IDF weights from the training descriptions and return
# their sparse TF-IDF matrix. The fitted `cv` is reused later to transform the
# test descriptions, so it must not be re-fitted on other data.
cv_fit = cv.fit_transform(data['FullDescription'])

In [9]:
# Replace missing contract types with the literal string 'nan' so that
# get_dummies gives them their own indicator column instead of leaving the
# row all-zero.
# The result is assigned back to the column: `data[col].fillna(..., inplace=True)`
# is chained in-place assignment, which newer pandas warns about and which
# does not modify `data` at all once copy-on-write is enabled.
data['ContractTime'] = data['ContractTime'].fillna('nan')

In [10]:
# One-hot encode the two categorical columns and append the indicators after
# the text and target columns. Columns 0-1 of `new_data` are therefore
# FullDescription and SalaryNormalized; everything from position 2 on is a dummy.
category_dummies = pd.get_dummies(data[['LocationNormalized', 'ContractTime']])
text_and_target = data[['FullDescription', 'SalaryNormalized']]
new_data = pd.concat([text_and_target, category_dummies], axis=1)

In [11]:
# Keep only the indicator columns (positions 2 onward) and store them sparsely
# so they can be stacked next to the TF-IDF matrix.
dummy_block = new_data.iloc[:, 2:]
tmp = csr_matrix(dummy_block.values)

In [12]:
# Training design matrix: one-hot categorical columns first, TF-IDF text
# features second. The test matrix must be stacked in the same order.
x = hstack([tmp, cv_fit])

In [13]:
# Sanity check: (number of samples, number of features).
x.shape

(60000, 24627)

In [14]:
# Ridge (L2-regularised linear) regression with regularisation strength alpha=1.
# random_state is pinned; per the scikit-learn docs it only matters for
# solvers that shuffle the data.
model = Ridge(alpha=1, random_state=241)

In [15]:
# Fit on the stacked sparse features; the target is the salary column.
model.fit(x, data['SalaryNormalized'])

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [16]:
# Load the small test set to predict on (same columns as the training file;
# the SalaryNormalized column is empty in the preview below).
test = pd.read_csv('data/salary-test-mini.csv')

In [17]:
# Preview the raw test rows.
test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [18]:
# NOTE: disabled preprocessing, kept for reference only. It mirrors the training
# cleaning, but its last line would not yield a usable feature matrix:
# get_dummies on the test rows alone only creates columns for the categories
# present in the test file, so the result would not line up with the one-hot
# columns the model was trained on.
# test['FullDescription'] = test['FullDescription'].str.lower()
# test['LocationNormalized'] = test['LocationNormalized'].str.lower()
# test['FullDescription'] = test['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)
# test = pd.concat([test.loc[:, ['FullDescription', 'SalaryNormalized']], pd.get_dummies(test[['LocationNormalized', 'ContractTime']])], axis='columns')

In [19]:
# The test file carries an empty SalaryNormalized column; remove it.
# Re-assigning with errors='ignore' keeps this cell idempotent, whereas the
# in-place drop raised KeyError when the cell was executed a second time.
test = test.drop('SalaryNormalized', axis=1, errors='ignore')

In [20]:
# Build the categorical part of the test matrix with exactly the same one-hot
# columns, in the same order, as the training matrix.
#
# The previous approach filled an all-zero frame by hand and failed with a
# KeyError, because `test` still holds the raw 'LocationNormalized' column
# rather than dummy columns. It also hard-coded two test rows and three
# ContractTime columns. Encoding the test rows and re-indexing onto the
# training dummy columns works for any number of rows: categories absent from
# the test set become all-zero columns, categories unseen in training are dropped.
train_dummy_columns = new_data.columns[2:]  # skip FullDescription, SalaryNormalized

# Work on a copy so the raw `test` frame is left untouched.
test_categorical = test[['LocationNormalized', 'ContractTime']].copy()
# Mirror the training preprocessing: lower-cased locations, 'nan' for a missing
# contract type.
test_categorical['LocationNormalized'] = test_categorical['LocationNormalized'].str.lower()
test_categorical['ContractTime'] = test_categorical['ContractTime'].fillna('nan')

test_dummies = (
    pd.get_dummies(test_categorical)
    .reindex(columns=train_dummy_columns, fill_value=0)
    .astype(float)  # reindex can mix dtypes; csr_matrix needs one numeric dtype
)
assert test_dummies.shape == (len(test), len(train_dummy_columns))

tmp = csr_matrix(test_dummies.values)

In [None]:
# Apply the same text cleaning as the training descriptions (lower-case, then
# replace every non-alphanumeric character with a space) before vectorising,
# so the test tokens are produced exactly like the training tokens.
test_descriptions = (
    test['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# on the training set.
test_text_features = cv.transform(test_descriptions)

# Same column order as in training: categorical dummies first, TF-IDF second.
# The text features get their own name instead of `x = hstack([tmp, x])`, so
# re-running this cell cannot prepend the categorical block a second time.
x = hstack([tmp, test_text_features])

In [None]:
# Predicted salaries for the test rows, rounded to two decimals.
predicted_salaries = model.predict(x)
np.round(predicted_salaries, 2)

## Задание 2

In [None]:
# Task 2 reuses the name `data`: from here on it holds the close-price table,
# not the salary training frame from the first task.
data = pd.read_csv('data/close_prices.csv')

In [None]:
# Preview the close-price table.
data.head()

In [None]:
# PCA that keeps the first 10 principal components.
pca = PCA(n_components=10)

In [None]:
# Fit on every column except the first one.
# NOTE(review): the first column is presumably a date/label column - confirm
# against the CSV.
pca.fit(data.iloc[:, 1:])

In [None]:
# Copy of the per-component explained-variance fractions.
# Note: this rebinds `x`, which held a feature matrix in the first task.
x = pca.explained_variance_ratio_.copy()

In [None]:
# Share of the total variance captured by the first four components.
np.sum(x[:4])

In [None]:
# Project the price columns onto the fitted components and keep the
# coordinate along the first principal component for every row.
projected_prices = pca.transform(data.iloc[:, 1:])
comp1 = projected_prices[:, 0]

In [None]:
# Load the Dow Jones index series; its '^DJI' column is compared with the
# first principal component below.
data_djia = pd.read_csv('data/djia_index.csv')

In [None]:
# Preview the index table.
data_djia.head()

In [None]:
# 2x2 Pearson correlation matrix between the index values and the first
# principal component; the off-diagonal entries are the correlation itself.
index_vs_component = np.corrcoef(data_djia['^DJI'], comp1)
np.round(index_vs_component, 2)

In [None]:
# Name of the column with the largest weight in the first principal component.
# The sign of a principal component is arbitrary, so the weights are ranked by
# absolute value; a plain argmax could return the weakest contributor whenever
# the component comes out with negative loadings.
# PCA was fitted on data.iloc[:, 1:], hence the lookup in data.columns[1:].
first_component_weights = np.abs(pca.components_[0])
data.columns[1:][np.argmax(first_component_weights)]