In [2]:
! pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [3]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack

In [4]:
df = pd.read_csv('data.csv')

In [7]:
df

Unnamed: 0,title,city,marker_icon,workplace_type,company_name,experience_level,skills,contract_type,from,to
0,Mid iOS Developer,Wrocław,mobile,remote,CRED,mid,iOS Combine Swift,b2b,20000.0,26000.0
1,PHP Web Developer - Symfony,Warszawa,php,remote,digatus,mid,API PHP,b2b,15000.0,23000.0
2,PHP Web Developer - Symfony,Warszawa,php,remote,digatus,mid,API PHP,permanent,13000.0,20000.0
3,IT Security Engineer,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,Linux Cybersecurity SIEM,permanent,13000.0,18000.0
4,IT Security Engineer,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,Linux Cybersecurity SIEM,b2b,13000.0,18000.0
...,...,...,...,...,...,...,...,...,...,...
12001,Adobe Developer,Wrocław,other,remote,Experis Manpower Group,senior,Adobe Campaign Adobe Target,b2b,26880.0,30240.0
12002,Mid Python Developer,Wrocław,python,remote,sky gate,mid,Django GitFLow Git,b2b,8000.0,16000.0
12003,Mid Flutter Developer,Gdańsk,mobile,partly_remote,sky gate,mid,REST API GitHub Flutter,permanent,9000.0,18000.0
12004,Mid React Developer,Gdańsk,javascript,remote,sky gate,mid,GitHub React JavaScript,permanent,9000.0,18000.0


In [5]:
cat_features = [
    'city',
    'marker_icon',
    'workplace_type',
    'company_name',
    'experience_level',
    'contract_type'
]

text_features = [
    'title',
    'skills'
]

In [6]:
X, y_from, y_to = df.drop(['from', 'to'], axis=1), df['from'], df['to']

In [10]:
X.shape

(12006, 8)

In [11]:
y_from.shape

(12006,)

In [5]:
X_from_train, X_from_valid, y_from_train, y_from_valid = train_test_split(X, y_from, test_size=0.2, random_state=42)

In [6]:
X_to_train, X_to_valid, y_to_train, y_to_valid = train_test_split(X, y_to, test_size=0.2, random_state=42)

In [14]:
X_from_train.head()

Unnamed: 0,title,city,marker_icon,workplace_type,company_name,experience_level,skills,contract_type
7623,Cloud Network Engineer in Security,Gdańsk,security,remote,Codilime,senior,Public Clouds Networks Firewall,b2b
6118,Analityk Danych,Sopot,analytics,office,Baukrane Sp. z o. o.,mid,Power BI power qu SQL,permanent
2755,Regular QA Specialist,Poznań,testing,remote,Merixstudio,mid,Test Automation Web Applications Manual Testing,b2b
6431,DevOps Engineer (AWS/pharma),Łódź,devops,remote,7N,senior,Terraform CI/CD Kubernetes,b2b
288,Senior Java Cloud Developer,Warszawa,java,partly_remote,DahliaMatic Sp. z o.o.,senior,IT studies Java Polish,b2b


# Train models without text features

## Catboost

In [117]:
# one-hot max size
model = CatBoostRegressor(iterations=10000, cat_features=cat_features, verbose=500)

In [118]:
model.fit(X_from_train.drop(['title', 'skills'], axis=1), y_from_train, eval_set=(X_from_valid.drop(['title', 'skills'], axis=1), y_from_valid))

Learning rate set to 0.01783
0:	learn: 6348.6539752	test: 6138.3083520	best: 6138.3083520 (0)	total: 37.3ms	remaining: 6m 12s
500:	learn: 3153.6980013	test: 2897.5014702	best: 2897.5014702 (500)	total: 4.72s	remaining: 1m 29s
1000:	learn: 2837.8061577	test: 2670.1119744	best: 2670.1119744 (1000)	total: 8.83s	remaining: 1m 19s
1500:	learn: 2671.9482062	test: 2564.0570780	best: 2564.0570780 (1500)	total: 14.5s	remaining: 1m 21s
2000:	learn: 2554.0324965	test: 2495.4932826	best: 2495.4445325 (1999)	total: 18.5s	remaining: 1m 13s
2500:	learn: 2456.8759361	test: 2442.2758135	best: 2442.2758135 (2500)	total: 22.6s	remaining: 1m 7s
3000:	learn: 2369.5927369	test: 2392.2293679	best: 2392.2293679 (3000)	total: 28.1s	remaining: 1m 5s
3500:	learn: 2294.3006641	test: 2352.1104219	best: 2352.1104219 (3500)	total: 32.2s	remaining: 59.7s
4000:	learn: 2227.4802116	test: 2325.4105105	best: 2325.2266606 (3991)	total: 37.7s	remaining: 56.5s
4500:	learn: 2168.8492483	test: 2298.3561528	best: 2298.3561528 

<catboost.core.CatBoostRegressor at 0x7f0fa37677f0>

In [119]:
pd.DataFrame({'feature_name': model.feature_names_, 'feature_importance': model.feature_importances_})

Unnamed: 0,feature_name,feature_importance
0,city,3.655172
1,marker_icon,20.311147
2,workplace_type,6.825231
3,company_name,33.177337
4,experience_level,23.450636
5,contract_type,12.580476


In [120]:
y_from_pred = model.predict(X_from_valid.drop(['title', 'skills'], axis=1))

In [121]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,20542.071464,21000.0
10643,9696.035126,8500.0
9351,23720.049273,25000.0
7872,18065.290277,17000.0
360,16463.185653,20800.0
...,...,...
5786,25688.330406,28000.0
1195,23780.730908,36000.0
7606,18441.998702,17000.0
7628,11062.075455,11000.0


In [122]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

2140.6377043720227

## RF

In [7]:
X_ohe = pd.get_dummies(X.drop(['title', 'skills'], axis=1))

In [8]:
X_ohe.head()

Unnamed: 0,city_96-321 Musuły,city_Aleksandrów Łódzki,city_Andrespol,city_Augustów,city_Balice,city_Berlin,city_Białystok,city_Białystok.1,city_Bielany Wrocławskie,city_Bielsko-Biała,...,company_name_summ-it,company_name_superdevs,company_name_travactory N.V.,company_name_Świat Kwiatów,experience_level_junior,experience_level_mid,experience_level_senior,contract_type_b2b,contract_type_mandate_contract,contract_type_permanent
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [131]:
X_ohe_from_train, X_ohe_from_valid, y_from_train, y_from_valid = train_test_split(X_ohe, y_from, test_size=0.2, random_state=42)

In [132]:
rf = RandomForestRegressor(n_estimators=300)

In [135]:
rf.fit(X_ohe_from_train, y_from_train)

RandomForestRegressor(n_estimators=300)

In [137]:
y_from_pred = rf.predict(X_ohe_from_valid)

In [138]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,20169.222222,21000.0
10643,8500.000000,8500.0
9351,24930.000000,25000.0
7872,16646.500000,17000.0
360,12448.333333,20800.0
...,...,...
5786,28000.000000,28000.0
1195,21585.316667,36000.0
7606,16879.000000,17000.0
7628,11061.266667,11000.0


In [139]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1957.481172859356

## LinearRegression

In [166]:
lr = Lasso(alpha=0.1)

In [167]:
lr.fit(X_ohe_from_train, y_from_train)

Lasso(alpha=0.1)

In [168]:
y_from_pred = lr.predict(X_ohe_from_valid)

In [169]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,23394.789451,21000.0
10643,8580.594817,8500.0
9351,20325.354301,25000.0
7872,15961.225001,17000.0
360,21167.939504,20800.0
...,...,...
5786,27933.569782,28000.0
1195,25310.270911,36000.0
7606,15929.931048,17000.0
7628,13734.344775,11000.0


In [170]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

2618.9630043723346

# Train models with text features

## Get sparse matrices


In [9]:
vectorizer = TfidfVectorizer()

In [10]:
skills_matrix = vectorizer.fit_transform(df.skills)

In [11]:
skills_matrix

<12006x885 sparse matrix of type '<class 'numpy.float64'>'
	with 42232 stored elements in Compressed Sparse Row format>

In [12]:
vectorizer.get_feature_names_out()

array(['10', '11', '13', '14', '15', '16', '17', '18', '360', '365', '3d',
       '4g', '4hana', '5g', 'aad', 'abap', 'access', 'active', 'adas',
       'adf', 'administration', 'adobe', 'agile', 'ai', 'airflow', 'ajax',
       'akeneo', 'akka', 'aks', 'algorithms', 'alto', 'amazon',
       'analityczne', 'analityka', 'analiza', 'analysis', 'analytical',
       'analytics', 'and', 'android', 'angielski', 'angular', 'angular12',
       'angularjs', 'ansible', 'any', 'apache', 'apex', 'api', 'apis',
       'aplication', 'app', 'appium', 'application', 'applications',
       'apps', 'aqa', 'archimate', 'architect', 'architecture', 'arm',
       'articulate', 'artifactory', 'aruba', 'as', 'as400', 'asp',
       'asset', 'assurance', 'async', 'atlassian', 'aurora',
       'authorization', 'automate', 'automated', 'automation', 'autosar',
       'av', 'avalonia', 'await', 'aws', 'axiom', 'axiomsl', 'axure',
       'azure', 'b2', 'backend', 'background', 'backlog', 'banking',
       'bash', '

In [13]:
title_matrix = vectorizer.fit_transform(df.title)

In [14]:
title_matrix

<12006x652 sparse matrix of type '<class 'numpy.float64'>'
	with 38269 stored elements in Compressed Sparse Row format>

In [15]:
vectorizer.get_feature_names_out()

array(['100', '2nd', '32420', '32424', '365', '3d', '3rd', '5g', 'aad',
       'abap', 'access', 'account', 'active', 'address', 'admin',
       'administrator', 'adobe', 'aem', 'agency', 'agile', 'ai',
       'akademia', 'akeneo', 'akka', 'algorithmic', 'all', 'analityczka',
       'analityk', 'analyst', 'analytics', 'and', 'android', 'androidów',
       'angular', 'aosp', 'api', 'aplikacji', 'app', 'application',
       'applications', 'apps', 'aps', 'aqa', 'architect', 'architecture',
       'architekt', 'articulate', 'asap', 'asp', 'asset', 'assistant',
       'associate', 'assurance', 'at', 'atlassian', 'auditor', 'automate',
       'automation', 'automatyzujacy', 'automatyzujący', 'automotive',
       'autor', 'autosar', 'aws', 'ax', 'axiomsl', 'azure', 'b2b', 'back',
       'backend', 'background', 'banking', 'based', 'basis', 'baz', 'be',
       'bez', 'bezpieczeństwa', 'bi', 'big', 'billing', 'bioinformatics',
       'biopharma', 'biotech', 'biznesowo', 'biznesowy', 'biznesowy

In [16]:
text_sparse_matrix = hstack((skills_matrix, title_matrix))

In [17]:
text_sparse_matrix

<12006x1537 sparse matrix of type '<class 'numpy.float64'>'
	with 80501 stored elements in COOrdinate format>

In [45]:
X.drop(['title', 'skills'], axis=1)

Unnamed: 0,city,marker_icon,workplace_type,company_name,experience_level,contract_type
0,Wrocław,mobile,remote,CRED,mid,b2b
1,Warszawa,php,remote,digatus,mid,b2b
2,Warszawa,php,remote,digatus,mid,permanent
3,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,permanent
4,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,b2b
...,...,...,...,...,...,...
12001,Wrocław,other,remote,Experis Manpower Group,senior,b2b
12002,Wrocław,python,remote,sky gate,mid,b2b
12003,Gdańsk,mobile,partly_remote,sky gate,mid,permanent
12004,Gdańsk,javascript,remote,sky gate,mid,permanent


In [46]:
pd.DataFrame(text_sparse_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1527,1528,1529,1530,1531,1532,1533,1534,1535,1536
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
X_text = pd.DataFrame(np.hstack((X.drop(['title', 'skills'], axis=1), text_sparse_matrix.toarray())))

In [60]:
X_text

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1533,1534,1535,1536,1537,1538,1539,1540,1541,1542
0,Wrocław,mobile,remote,CRED,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Warszawa,php,remote,digatus,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Warszawa,php,remote,digatus,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12001,Wrocław,other,remote,Experis Manpower Group,senior,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12002,Wrocław,python,remote,sky gate,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12003,Gdańsk,mobile,partly_remote,sky gate,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12004,Gdańsk,javascript,remote,sky gate,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
dict(zip(range(0, 6), cat_features))

{0: 'city',
 1: 'marker_icon',
 2: 'workplace_type',
 3: 'company_name',
 4: 'experience_level',
 5: 'contract_type'}

In [63]:
X_text.rename(columns=dict(zip(range(0, 6), cat_features)), inplace=True)

In [64]:
X_text

Unnamed: 0,city,marker_icon,workplace_type,company_name,experience_level,contract_type,6,7,8,9,...,1533,1534,1535,1536,1537,1538,1539,1540,1541,1542
0,Wrocław,mobile,remote,CRED,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Warszawa,php,remote,digatus,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Warszawa,php,remote,digatus,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Warszawa,security,partly_remote,CloudFerro sp. z o.o.,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12001,Wrocław,other,remote,Experis Manpower Group,senior,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12002,Wrocław,python,remote,sky gate,mid,b2b,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12003,Gdańsk,mobile,partly_remote,sky gate,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12004,Gdańsk,javascript,remote,sky gate,mid,permanent,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
X_text_from_train, X_text_from_valid, y_from_train, y_from_valid = train_test_split(X_text, y_from, test_size=0.2, random_state=42)

In [20]:
X_text_to_train, X_text_to_valid, y_to_train, y_to_valid = train_test_split(X_text, y_to, test_size=0.2, random_state=42)

## Catboost

In [76]:
# regularization and other hyperparams
model = CatBoostRegressor(iterations=1500, cat_features=cat_features, verbose=100)

In [77]:
model.fit(X_text_from_train, y_from_train, eval_set=(X_text_from_valid, y_from_valid))

Learning rate set to 0.056719
0:	learn: 6257.1961980	test: 6050.0835367	best: 6050.0835367 (0)	total: 34.4ms	remaining: 51.6s
100:	learn: 3233.4358992	test: 3030.4447674	best: 3030.4447674 (100)	total: 4.33s	remaining: 1m
200:	learn: 2904.1712088	test: 2753.8727739	best: 2753.8727739 (200)	total: 7.79s	remaining: 50.4s
300:	learn: 2659.7612714	test: 2552.3589476	best: 2552.3589476 (300)	total: 11.1s	remaining: 44.3s
400:	learn: 2494.3780709	test: 2412.4640425	best: 2412.4640425 (400)	total: 14.6s	remaining: 39.9s
500:	learn: 2362.1028281	test: 2303.2001020	best: 2303.2001020 (500)	total: 19.2s	remaining: 38.3s
600:	learn: 2238.6938076	test: 2198.8214910	best: 2198.8214910 (600)	total: 22.6s	remaining: 33.7s
700:	learn: 2138.0712438	test: 2119.8091287	best: 2119.8091287 (700)	total: 25.9s	remaining: 29.5s
800:	learn: 2058.8532475	test: 2052.7598729	best: 2052.7598729 (800)	total: 30.5s	remaining: 26.6s
900:	learn: 1989.2536871	test: 1997.9753583	best: 1997.9753583 (900)	total: 33.8s	rem

<catboost.core.CatBoostRegressor at 0x7f0f863f9cd0>

In [71]:
pd.DataFrame({'feature_name': model.feature_names_, 'feature_importance': model.feature_importances_}).head(18)

Unnamed: 0,feature_name,feature_importance
0,city,0.201636
1,marker_icon,6.861168
2,workplace_type,2.067576
3,company_name,14.078319
4,experience_level,29.457485
5,contract_type,5.910137
6,6,0.0
7,7,0.003986
8,8,0.038025
9,9,0.0


In [78]:
y_from_pred = model.predict(X_text_from_valid)

In [79]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,22521.364534,21000.0
10643,8307.733511,8500.0
9351,22077.373685,25000.0
7872,19233.808448,17000.0
360,15152.964211,20800.0
...,...,...
5786,23850.569535,28000.0
1195,29445.487419,36000.0
7606,17658.522237,17000.0
7628,11006.586022,11000.0


In [80]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

2255.096788278423

## Tune hyperparams

In [85]:
model = CatBoostRegressor(iterations=1000, cat_features=cat_features, verbose=500)

In [88]:
grid = {'learning_rate': [0.01, 0.03, 0.06, 0.1],
        'depth': [4, 6, 8, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

In [89]:
randomized_search_result = model.randomized_search(grid,
            X_text,
            y_from,
            cv=5,
            shuffle=True,
            train_size=0.8)

0:	learn: 16518.0534390	test: 16608.4874799	best: 16608.4874799 (0)	total: 22.3ms	remaining: 22.3s
500:	learn: 2730.7901101	test: 2692.7155138	best: 2692.7155138 (500)	total: 10s	remaining: 9.98s
999:	learn: 2334.5574019	test: 2322.3484876	best: 2322.3484876 (999)	total: 20.5s	remaining: 0us

bestTest = 2322.348488
bestIteration = 999

0:	loss: 2322.3484876	best: 2322.3484876 (0)	total: 20.9s	remaining: 3m 8s
0:	learn: 16989.2166957	test: 17081.4124930	best: 17081.4124930 (0)	total: 19.6ms	remaining: 19.5s
500:	learn: 3171.1855995	test: 3139.3051664	best: 3139.3051664 (500)	total: 13.3s	remaining: 13.2s
999:	learn: 2766.1880593	test: 2744.6953879	best: 2744.6953879 (999)	total: 23.4s	remaining: 0us

bestTest = 2744.695388
bestIteration = 999

1:	loss: 2744.6953879	best: 2322.3484876 (0)	total: 44.4s	remaining: 2m 57s
0:	learn: 15892.3306667	test: 15980.3039347	best: 15980.3039347 (0)	total: 40.6ms	remaining: 40.6s
500:	learn: 2010.3383450	test: 2081.4270043	best: 2081.4270043 (500)	tot

In [92]:
params = randomized_search_result['params']

In [93]:
params

{'depth': 8, 'l2_leaf_reg': 1, 'learning_rate': 0.06}

In [99]:
model = CatBoostRegressor(iterations=2000, cat_features=cat_features, verbose=500, **params)

In [100]:
model.fit(X_text_from_train, y_from_train, eval_set=(X_text_from_valid, y_from_valid))

0:	learn: 6236.8594592	test: 6032.1097328	best: 6032.1097328 (0)	total: 93.6ms	remaining: 3m 7s
500:	learn: 1850.4797128	test: 1918.8568847	best: 1918.8568847 (500)	total: 49.6s	remaining: 2m 28s
1000:	learn: 1391.6672163	test: 1608.6360635	best: 1608.6360635 (1000)	total: 1m 39s	remaining: 1m 38s
1500:	learn: 1132.3558325	test: 1483.3294384	best: 1483.3294384 (1500)	total: 2m 30s	remaining: 49.9s
1999:	learn: 955.8485442	test: 1409.3464562	best: 1409.3464562 (1999)	total: 3m 19s	remaining: 0us

bestTest = 1409.346456
bestIteration = 1999



<catboost.core.CatBoostRegressor at 0x7f0f862c12b0>

In [101]:
y_from_pred = model.predict(X_text_from_valid)

In [102]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,21352.123661,21000.0
10643,8539.259097,8500.0
9351,21638.650677,25000.0
7872,18320.901113,17000.0
360,15568.144970,20800.0
...,...,...
5786,23527.333565,28000.0
1195,32734.913282,36000.0
7606,17478.773009,17000.0
7628,10545.483678,11000.0


In [103]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1947.9040547802526

In [104]:
model.save_model('catboost_1')

In [109]:
model = CatBoostRegressor(iterations=15000, cat_features=cat_features, verbose=500, **params)

In [110]:
model.fit(X_text_from_train, y_from_train, eval_set=(X_text_from_valid, y_from_valid))

0:	learn: 6236.8594592	test: 6032.1097328	best: 6032.1097328 (0)	total: 96.3ms	remaining: 24m 4s
500:	learn: 1850.4797128	test: 1918.8568847	best: 1918.8568847 (500)	total: 50.4s	remaining: 24m 19s
1000:	learn: 1391.6672163	test: 1608.6360635	best: 1608.6360635 (1000)	total: 1m 39s	remaining: 23m 16s
1500:	learn: 1132.3558325	test: 1483.3294384	best: 1483.3294384 (1500)	total: 2m 29s	remaining: 22m 25s
2000:	learn: 955.3488437	test: 1408.9850070	best: 1408.9850070 (2000)	total: 3m 20s	remaining: 21m 40s
2500:	learn: 823.4649912	test: 1368.1856207	best: 1368.1856207 (2500)	total: 4m 9s	remaining: 20m 49s
3000:	learn: 721.4862069	test: 1336.9812138	best: 1336.9094921 (2997)	total: 4m 59s	remaining: 19m 56s
3500:	learn: 639.7413825	test: 1313.4686883	best: 1313.3952086 (3492)	total: 5m 48s	remaining: 19m 4s
4000:	learn: 571.5523122	test: 1296.3299633	best: 1296.2136824 (3998)	total: 6m 39s	remaining: 18m 16s
4500:	learn: 512.4095700	test: 1281.1051032	best: 1281.1051032 (4500)	total: 7m 2

<catboost.core.CatBoostRegressor at 0x7f0f86370940>

In [114]:
model.save_model('catboost_2')

In [111]:
y_from_pred = model.predict(X_text_from_valid)

In [113]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,20935.171976,21000.0
10643,8438.370548,8500.0
9351,21858.247404,25000.0
7872,17640.181673,17000.0
360,15897.997970,20800.0
...,...,...
5786,24046.739982,28000.0
1195,33467.121761,36000.0
7606,17024.055049,17000.0
7628,11088.978999,11000.0


In [112]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1746.498538891317

## RF

In [21]:
X_text = pd.DataFrame(np.hstack((X_ohe, text_sparse_matrix.toarray())))

In [22]:
X_text

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2639,2640,2641,2642,2643,2644,2645,2646,2647,2648
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
X_text_from_train, X_text_from_valid, y_from_train, y_from_valid = train_test_split(X_text, y_from, test_size=0.2, random_state=42)

In [24]:
X_text_to_train, X_text_to_valid, y_to_train, y_to_valid = train_test_split(X_text, y_to, test_size=0.2, random_state=42)

In [146]:
rf = RandomForestRegressor(n_estimators=300)

In [147]:
rf.fit(X_text_from_train, y_from_train)

RandomForestRegressor(n_estimators=300)

In [148]:
y_from_pred = rf.predict(X_text_from_valid)

In [149]:
pd.DataFrame({'y_from_pred': y_from_pred, 'y_from_valid': y_from_valid})

Unnamed: 0,y_from_pred,y_from_valid
7740,21000.000000,21000.0
10643,8500.000000,8500.0
9351,24843.200000,25000.0
7872,16946.466667,17000.0
360,12110.833333,20800.0
...,...,...
5786,27936.533333,28000.0
1195,24662.433333,36000.0
7606,16140.320000,17000.0
7628,11096.333333,11000.0


In [150]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1592.3523102691113

## Try different hyperparameters

In [43]:
rf = RandomForestRegressor(n_estimators=100)

In [44]:
rf.fit(X_text_from_train, y_from_train)

RandomForestRegressor()

In [45]:
y_from_pred = rf.predict(X_text_from_valid)

In [46]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1594.64084480068

In [36]:
rf = RandomForestRegressor(n_estimators=600)

In [37]:
rf.fit(X_text_from_train, y_from_train)

RandomForestRegressor(n_estimators=600)

In [38]:
y_from_pred = rf.predict(X_text_from_valid)

In [39]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1595.1121120359387

In [21]:
rf = RandomForestRegressor(n_estimators=1000)

In [22]:
rf.fit(X_text_from_train, y_from_train)

RandomForestRegressor(n_estimators=1000)

In [23]:
y_from_pred = rf.predict(X_text_from_valid)

In [24]:
# overfits
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1600.7506888724788

## Tune hyperparams

In [25]:
rf = RandomForestRegressor()

In [29]:
grid = {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300]}

In [39]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = grid, n_iter = 20, cv = 5, random_state=42, n_jobs = -1)

In [41]:
rf_random.fit(X_text, y_from)

RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(max_depth=70,
                                                   max_features='sqrt',
                                                   n_estimators=300),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300]},
                   random_state=42)

In [42]:
rf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 90}

In [47]:
rf_best = rf_random.best_estimator_

In [48]:
y_from_pred = rf_best.predict(X_text_from_valid)

In [49]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

702.5530263271226

In [43]:
params = {'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 90}

In [44]:
rf = RandomForestRegressor(**params)

In [45]:
rf.fit(X_text_from_train, y_from_train)

RandomForestRegressor(max_depth=90, max_features='sqrt', min_samples_split=5,
                      n_estimators=300)

In [46]:
y_from_pred = rf.predict(X_text_from_valid)

In [47]:
np.sqrt(mean_squared_error(y_from_pred, y_from_valid))

1629.6816747515322