In [210]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression, RidgeCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans

import sklearn.metrics as metrics

import spacy
import nltk

In [3]:
df = pd.read_csv('./clean_df.csv')

In [4]:
df.head()

Unnamed: 0,job_title,job_description,data_listed,location,salary
0,Data Scientist,Where you’ll be working:You will be working wi...,'10 Feb 2020','Sydney',146000.0
1,Data Scientist,\n\n\n\n\n\n Data Scientist | Python | Strong ...,'5 Feb 2020','Sydney',130000.0
2,Data Scientist,About the company My client is a company that...,'30 Jan 2020','Melbourne',160000.0
3,Data Scientist,About Them: This tech start up are on a mis...,'3 Feb 2020','Sydney',174720.0
4,Data Scientist,A brand new and exciting month contract has ...,'4 Feb 2020','Melbourne',255500.0


In [5]:
spacy_nlp = spacy.load('en_core_web_sm')

In [7]:
df['nltk_tokens'] = df.job_description.apply(nltk.word_tokenize)

In [9]:
df.head()

Unnamed: 0,job_title,job_description,data_listed,location,salary,nltk_tokens
0,Data Scientist,Where you’ll be working:You will be working wi...,'10 Feb 2020','Sydney',146000.0,"[Where, you, ’, ll, be, working, :, You, will,..."
1,Data Scientist,\n\n\n\n\n\n Data Scientist | Python | Strong ...,'5 Feb 2020','Sydney',130000.0,"[Data, Scientist, |, Python, |, Strong, Busine..."
2,Data Scientist,About the company My client is a company that...,'30 Jan 2020','Melbourne',160000.0,"[About, the, company, My, client, is, a, compa..."
3,Data Scientist,About Them: This tech start up are on a mis...,'3 Feb 2020','Sydney',174720.0,"[About, Them, :, This, tech, start, up, are, o..."
4,Data Scientist,A brand new and exciting month contract has ...,'4 Feb 2020','Melbourne',255500.0,"[A, brand, new, and, exciting, month, contract..."


In [10]:
df['spacy_tokens'] = df.job_description.apply(spacy_nlp)

In [40]:
df_spacy = df[['spacy_tokens', 'salary']]

In [41]:
df_spacy.head()

Unnamed: 0,spacy_tokens,salary
0,"(Where, you, ’ll, be, working, :, You, will, b...",146000.0
1,"(\n\n\n\n\n\n , Data, Scientist, |, Python, |,...",130000.0
2,"(About, the, company, , My, client, is, a, co...",160000.0
3,"( , About, Them, :, This, tech, start, up, a...",174720.0
4,"(A, brand, new, and, exciting, , month, cont...",255500.0


In [42]:
df_nltk = df[['nltk_tokens', 'salary']]

In [43]:
df_nltk.head()

Unnamed: 0,nltk_tokens,salary
0,"[Where, you, ’, ll, be, working, :, You, will,...",146000.0
1,"[Data, Scientist, |, Python, |, Strong, Busine...",130000.0
2,"[About, the, company, My, client, is, a, compa...",160000.0
3,"[About, Them, :, This, tech, start, up, are, o...",174720.0
4,"[A, brand, new, and, exciting, month, contract...",255500.0


In [38]:
location_dummies = pd.get_dummies(df.location,drop_first=True)

In [39]:
location_dummies

Unnamed: 0,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney'
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,0,0,1,0,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
...,...,...,...,...,...,...
203,0,0,0,1,0,0
204,0,0,0,0,0,1
205,0,0,0,1,0,0
206,0,0,0,1,0,0


In [44]:
df_nltk = df_nltk.join(location_dummies)

In [45]:
df_nltk.head()

Unnamed: 0,nltk_tokens,salary,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney'
0,"[Where, you, ’, ll, be, working, :, You, will,...",146000.0,0,0,0,0,0,1
1,"[Data, Scientist, |, Python, |, Strong, Busine...",130000.0,0,0,0,0,0,1
2,"[About, the, company, My, client, is, a, compa...",160000.0,0,0,0,1,0,0
3,"[About, Them, :, This, tech, start, up, are, o...",174720.0,0,0,0,0,0,1
4,"[A, brand, new, and, exciting, month, contract...",255500.0,0,0,0,1,0,0


In [46]:
df_spacy = df_spacy.join(location_dummies)

In [47]:
df_spacy.head()

Unnamed: 0,spacy_tokens,salary,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney'
0,"(Where, you, ’ll, be, working, :, You, will, b...",146000.0,0,0,0,0,0,1
1,"(\n\n\n\n\n\n , Data, Scientist, |, Python, |,...",130000.0,0,0,0,0,0,1
2,"(About, the, company, , My, client, is, a, co...",160000.0,0,0,0,1,0,0
3,"( , About, Them, :, This, tech, start, up, a...",174720.0,0,0,0,0,0,1
4,"(A, brand, new, and, exciting, , month, cont...",255500.0,0,0,0,1,0,0


In [48]:
df_nltk.nltk_tokens[1]

['Data',
 'Scientist',
 '|',
 'Python',
 '|',
 'Strong',
 'Business',
 'Acumen',
 '|',
 'Stakeholder',
 'Management',
 '|',
 '$',
 'k',
 'base',
 'Working',
 'closely',
 'with',
 'the',
 'Senior',
 'Manager',
 ',',
 'you',
 'will',
 'support',
 'the',
 'business',
 'units',
 'with',
 'their',
 'business',
 'problems',
 'via',
 'an',
 'automated',
 'machine',
 'learning',
 'platform',
 '.',
 'This',
 'is',
 'an',
 'exciting',
 'time',
 'where',
 'you',
 'will',
 'be',
 'working',
 'collaboratively',
 'with',
 'various',
 'business',
 'units',
 'to',
 'deliver',
 'insightful',
 'solutions',
 'through',
 'the',
 'application',
 'of',
 'machine',
 'learning',
 'capabilities',
 '.',
 'You',
 'will',
 'be',
 'responsible',
 'for',
 'building',
 'and',
 'managing',
 'collaborative',
 'relationships',
 'with',
 'a',
 'broad',
 'audience',
 'of',
 'stakeholders',
 ',',
 'where',
 'you',
 'will',
 'represent',
 'the',
 'team',
 'and',
 'it',
 '’',
 's',
 'capability',
 'as',
 'the',
 'banks',
 '

In [51]:
temp = df_spacy.spacy_tokens[0]

In [54]:
temp.vector

array([ 0.1751815 ,  0.4683458 , -0.5824294 , -0.3474471 ,  1.3018603 ,
        0.17673367,  1.0576912 ,  0.19420584,  1.2329892 ,  1.3149885 ,
       -0.51645947,  0.08342735, -0.03944112, -0.6357234 , -0.69495726,
       -0.33003676, -0.79249334,  0.14235398, -0.07943974, -0.41328293,
        0.6194853 ,  0.43071178, -0.09145882, -0.01852435, -0.68217725,
        0.6646534 , -0.5593379 , -0.19931008,  0.6435791 , -0.71425605,
        0.6878975 ,  0.43369222, -0.16717143, -0.5017103 ,  0.00427351,
       -0.6358806 ,  0.5735309 , -0.93152535, -1.1835092 , -0.422721  ,
        1.2812802 ,  0.02174864, -0.27333564, -1.4063026 ,  0.22127613,
       -0.4972761 , -0.12656614,  0.06961485, -0.8874551 ,  0.40918016,
        1.0834715 , -0.7537637 , -0.2198047 ,  0.02641642, -1.8234868 ,
        0.5627237 ,  0.91176337,  0.54337364, -0.11680464,  0.32564875,
        0.7510442 , -0.10620414,  1.1129076 ,  0.74302554,  0.5128363 ,
       -0.43736348,  0.8140763 , -0.8553609 , -0.06546023,  0.57

In [63]:
df_spacy.head()

Unnamed: 0,spacy_tokens,salary,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney'
0,"(Where, you, ’ll, be, working, :, You, will, b...",146000.0,0,0,0,0,0,1
1,"(\n\n\n\n\n\n , Data, Scientist, |, Python, |,...",130000.0,0,0,0,0,0,1
2,"(About, the, company, , My, client, is, a, co...",160000.0,0,0,0,1,0,0
3,"( , About, Them, :, This, tech, start, up, a...",174720.0,0,0,0,0,0,1
4,"(A, brand, new, and, exciting, , month, cont...",255500.0,0,0,0,1,0,0


In [97]:
df_vect = df_spacy.spacy_tokens.apply(lambda x: pd.Series(x.vector))
# df_spacy.join()
df_vect.columns = ['vect_'+str(col) for col in df_vect.columns]

In [103]:
df_spacy = df_spacy.join(df_vect)

In [104]:
df_spacy

Unnamed: 0,spacy_tokens,salary,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney',doc_vect,vect_0,vect_1,vect_2,vect_3,vect_4,vect_5,vect_6,vect_7,vect_8,vect_9,vect_10,vect_11,vect_12,vect_13,vect_14,vect_15,vect_16,vect_17,vect_18,vect_19,vect_20,vect_21,vect_22,vect_23,vect_24,vect_25,vect_26,vect_27,vect_28,vect_29,vect_30,...,vect_56,vect_57,vect_58,vect_59,vect_60,vect_61,vect_62,vect_63,vect_64,vect_65,vect_66,vect_67,vect_68,vect_69,vect_70,vect_71,vect_72,vect_73,vect_74,vect_75,vect_76,vect_77,vect_78,vect_79,vect_80,vect_81,vect_82,vect_83,vect_84,vect_85,vect_86,vect_87,vect_88,vect_89,vect_90,vect_91,vect_92,vect_93,vect_94,vect_95
0,"(Where, you, ’ll, be, working, :, You, will, b...",146000.0,0,0,0,0,0,1,0.175181,0.175181,0.468346,-0.582429,-0.347447,1.301860,0.176734,1.057691,0.194206,1.232989,1.314988,-0.516459,0.083427,-0.039441,-0.635723,-0.694957,-0.330037,-0.792493,0.142354,-0.079440,-0.413283,0.619485,0.430712,-0.091459,-0.018524,-0.682177,0.664653,-0.559338,-0.199310,0.643579,-0.714256,0.687898,...,0.911763,0.543374,-0.116805,0.325649,0.751044,-0.106204,1.112908,0.743026,0.512836,-0.437363,0.814076,-0.855361,-0.065460,0.570108,-0.126326,-0.409668,0.436804,-0.200432,-0.375215,0.003126,1.228446,-0.481059,-0.493038,0.025715,0.179889,-0.596012,0.300821,-0.157014,0.348382,-0.721366,0.310841,0.594217,-0.112513,0.771043,-0.583683,-0.590057,-0.018184,0.368517,0.745931,0.574292
1,"(\n\n\n\n\n\n , Data, Scientist, |, Python, |,...",130000.0,0,0,0,0,0,1,0.320486,0.320486,0.077317,-0.635126,-0.089653,1.156245,0.464618,0.858193,0.183135,1.219160,1.292969,0.412154,-0.114006,-0.088810,-0.757475,-0.873471,-0.601357,-0.300223,0.536307,-0.242936,-0.760369,0.432710,0.034010,-0.132761,0.083341,-0.691452,0.298634,-0.504929,-0.462753,0.600076,-0.691240,0.842806,...,0.814771,0.489524,0.149091,-0.174809,0.931606,-0.120450,0.953311,0.631855,0.719921,-0.393082,0.780864,-1.207795,-0.393438,1.057563,0.050393,-0.318684,0.293560,-0.041619,-0.302185,-0.115814,0.761447,-0.170666,-0.660946,0.041008,0.372774,-0.178629,-0.147950,-0.178352,0.948244,-0.016967,-0.045430,0.124820,-0.101947,0.338594,-0.434264,-0.500447,-0.148625,0.294274,0.715475,0.428512
2,"(About, the, company, , My, client, is, a, co...",160000.0,0,0,0,1,0,0,0.364088,0.364088,0.429866,-0.723932,-0.100617,0.832898,0.561835,0.649823,0.210984,1.309685,1.275150,-0.026259,0.266491,0.240672,-0.943805,-1.072489,-0.082275,-0.315484,0.538983,-0.147985,-0.571342,0.718001,0.148629,-0.030019,-0.300591,-0.599662,0.146551,-0.303593,-0.645660,0.815169,-0.896805,0.701253,...,0.664704,0.849070,0.376919,-0.051639,0.633538,-0.349542,1.049748,0.799691,0.734326,-0.034238,0.675317,-0.760145,-0.503574,0.501130,-0.228099,-0.357928,0.427537,-0.168199,-0.146277,-0.133557,1.337775,-0.152139,-0.681011,-0.193943,0.360539,-0.734201,-0.064689,-0.024489,0.527592,-0.183166,0.458620,0.264191,-0.267835,0.615222,-0.248620,-0.353820,-0.081186,0.527506,0.941397,0.509030
3,"( , About, Them, :, This, tech, start, up, a...",174720.0,0,0,0,0,0,1,-0.284363,-0.284363,0.681412,-0.762081,-0.591352,0.970405,-0.278230,0.748061,0.396293,1.337570,1.282670,-0.486605,0.171097,0.021715,-1.049415,-0.426292,0.005485,-0.724414,0.290305,0.038919,-0.384002,0.414981,0.753726,-0.471845,0.059407,-0.658568,0.383134,-0.321350,-0.385147,0.699161,-0.551423,0.997180,...,0.873400,0.412509,0.052963,0.346859,1.161143,-0.454296,0.957399,0.979748,0.260260,-0.734339,0.819183,-0.581413,-0.165721,0.736496,-0.119108,-0.346906,0.610919,-0.161685,-0.395110,0.022090,1.186009,-0.653418,-0.400897,-0.267109,0.081643,-0.144330,0.029687,0.118011,0.290745,-0.790925,0.427250,0.619488,-0.297721,0.533420,-0.380035,-0.520890,-0.178081,0.801915,0.655478,0.423992
4,"(A, brand, new, and, exciting, , month, cont...",255500.0,0,0,0,1,0,0,0.347709,0.347709,0.400837,-0.572793,0.195458,1.272772,-0.100759,0.853378,0.225396,1.103274,1.245723,-0.250242,0.372487,0.099050,-1.293151,-0.919797,0.095126,-0.811796,0.456146,-0.043034,-0.511876,0.815692,0.450824,-0.136184,0.191292,-0.473110,0.507055,-0.567763,-0.416371,1.057558,-0.734010,0.697533,...,0.949729,0.776088,0.213818,0.214411,0.653507,-0.591095,1.014277,0.625327,0.689631,-0.405434,0.701689,-0.585417,-0.121178,0.967336,0.073504,-0.802106,0.388792,-0.067063,-0.500691,-0.349044,1.128995,-0.575411,-0.204979,-0.147748,0.275575,-0.710441,0.284628,-0.152107,0.153310,-0.629748,0.315525,0.347191,-0.031580,0.615572,-0.189184,-0.736085,-0.125790,0.355433,0.864529,0.508242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,"(Robertson, Search, is, part, of, a, , -year,...",174720.0,0,0,0,1,0,0,0.029669,0.029669,0.571959,-0.374851,-0.440346,1.193242,-0.045499,0.857747,0.021178,1.094673,1.318189,-0.545913,0.021533,-0.154619,-0.879297,-0.695938,0.008681,-0.748276,0.108468,-0.132923,-0.365791,0.258725,0.500405,-0.199084,0.038755,-0.573159,0.691519,-0.558536,-0.115924,0.600614,-0.642175,0.958060,...,0.693849,0.689816,0.247332,0.145292,0.966673,-0.559476,1.031501,0.755732,0.396564,-0.445151,0.611197,-0.997755,-0.026809,0.857477,-0.003466,-0.404435,0.643324,-0.231670,-0.302005,0.014933,1.066707,-0.671960,-0.537643,-0.292722,0.005247,-0.358987,0.224694,-0.134749,0.315339,-0.306148,0.560827,0.542052,-0.198382,0.706032,-0.587711,-0.624154,-0.034141,0.620912,0.763327,0.354494
204,"(About, the, business, , GQR, is, working, o...",150000.0,0,0,0,0,0,1,0.147663,0.147663,0.279606,-0.553207,-0.076636,1.171908,-0.033587,0.769483,-0.058771,1.192218,1.272084,-0.276170,0.163718,0.015495,-1.346499,-0.834175,-0.001732,-0.720273,0.353926,0.209494,-0.417775,0.993399,0.435990,-0.209079,0.167430,-0.596784,0.325592,-0.526061,-0.571709,0.906959,-0.743850,0.742173,...,0.921292,0.616814,0.042404,0.314789,0.852859,-0.553336,1.152175,0.768898,0.510845,-0.401736,0.703123,-0.722671,-0.249433,0.911707,0.132912,-0.780428,0.858063,0.087112,-0.736911,-0.041532,1.094217,-0.580453,-0.046597,-0.388823,0.410096,-0.352434,0.104386,-0.119396,0.238739,-0.531695,0.383600,0.377833,-0.113103,0.804198,-0.255575,-0.567209,-0.250944,0.405198,1.096338,0.379194
205,"(About, our, Client, :, , My, client, is, on...",140000.0,0,0,0,1,0,0,0.153717,0.153717,0.453259,-0.723383,-0.147037,1.330099,0.010768,0.529982,0.122287,1.187081,1.238251,-0.229351,0.211308,0.145229,-1.148216,-1.026922,-0.119332,-0.643581,0.277002,0.028442,-0.469047,0.767283,0.509216,0.037989,0.207878,-0.489945,0.480734,-0.563888,-0.378817,0.506285,-0.751653,0.654174,...,0.471755,0.822815,0.336985,0.237681,0.892917,-0.331061,1.111614,0.772060,0.626032,-0.329988,0.918260,-0.798687,-0.345989,0.991036,0.063951,-0.479610,0.873179,-0.096459,-0.358861,-0.247812,1.123442,-0.674929,-0.471550,0.017829,0.200182,-0.530889,0.170617,-0.321290,0.503538,-0.340580,0.312884,0.429231,-0.410945,0.691598,-0.489783,-0.558493,-0.016363,0.431670,0.793844,0.454642
206,"(Robertson, Search, is, part, of, a, , -year,...",174720.0,0,0,0,1,0,0,0.029669,0.029669,0.571959,-0.374851,-0.440346,1.193242,-0.045499,0.857747,0.021178,1.094673,1.318189,-0.545913,0.021533,-0.154619,-0.879297,-0.695938,0.008681,-0.748276,0.108468,-0.132923,-0.365791,0.258725,0.500405,-0.199084,0.038755,-0.573159,0.691519,-0.558536,-0.115924,0.600614,-0.642175,0.958060,...,0.693849,0.689816,0.247332,0.145292,0.966673,-0.559476,1.031501,0.755732,0.396564,-0.445151,0.611197,-0.997755,-0.026809,0.857477,-0.003466,-0.404435,0.643324,-0.231670,-0.302005,0.014933,1.066707,-0.671960,-0.537643,-0.292722,0.005247,-0.358987,0.224694,-0.134749,0.315339,-0.306148,0.560827,0.542052,-0.198382,0.706032,-0.587711,-0.624154,-0.034141,0.620912,0.763327,0.354494


In [105]:
y = df_spacy.salary

In [120]:
df_spacy_vect = df_spacy.drop(['doc_vect','spacy_tokens', 'salary'],axis=1)

In [121]:
df_spacy_vect.head()

Unnamed: 0,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney',vect_0,vect_1,vect_2,vect_3,vect_4,vect_5,vect_6,vect_7,vect_8,vect_9,vect_10,vect_11,vect_12,vect_13,vect_14,vect_15,vect_16,vect_17,vect_18,vect_19,vect_20,vect_21,vect_22,vect_23,vect_24,vect_25,vect_26,vect_27,vect_28,vect_29,vect_30,vect_31,vect_32,vect_33,...,vect_56,vect_57,vect_58,vect_59,vect_60,vect_61,vect_62,vect_63,vect_64,vect_65,vect_66,vect_67,vect_68,vect_69,vect_70,vect_71,vect_72,vect_73,vect_74,vect_75,vect_76,vect_77,vect_78,vect_79,vect_80,vect_81,vect_82,vect_83,vect_84,vect_85,vect_86,vect_87,vect_88,vect_89,vect_90,vect_91,vect_92,vect_93,vect_94,vect_95
0,0,0,0,0,0,1,0.175181,0.468346,-0.582429,-0.347447,1.30186,0.176734,1.057691,0.194206,1.232989,1.314988,-0.516459,0.083427,-0.039441,-0.635723,-0.694957,-0.330037,-0.792493,0.142354,-0.07944,-0.413283,0.619485,0.430712,-0.091459,-0.018524,-0.682177,0.664653,-0.559338,-0.19931,0.643579,-0.714256,0.687898,0.433692,-0.167171,-0.50171,...,0.911763,0.543374,-0.116805,0.325649,0.751044,-0.106204,1.112908,0.743026,0.512836,-0.437363,0.814076,-0.855361,-0.06546,0.570108,-0.126326,-0.409668,0.436804,-0.200432,-0.375215,0.003126,1.228446,-0.481059,-0.493038,0.025715,0.179889,-0.596012,0.300821,-0.157014,0.348382,-0.721366,0.310841,0.594217,-0.112513,0.771043,-0.583683,-0.590057,-0.018184,0.368517,0.745931,0.574292
1,0,0,0,0,0,1,0.320486,0.077317,-0.635126,-0.089653,1.156245,0.464618,0.858193,0.183135,1.21916,1.292969,0.412154,-0.114006,-0.08881,-0.757475,-0.873471,-0.601357,-0.300223,0.536307,-0.242936,-0.760369,0.43271,0.03401,-0.132761,0.083341,-0.691452,0.298634,-0.504929,-0.462753,0.600076,-0.69124,0.842806,0.303082,-0.257435,-0.531937,...,0.814771,0.489524,0.149091,-0.174809,0.931606,-0.12045,0.953311,0.631855,0.719921,-0.393082,0.780864,-1.207795,-0.393438,1.057563,0.050393,-0.318684,0.29356,-0.041619,-0.302185,-0.115814,0.761447,-0.170666,-0.660946,0.041008,0.372774,-0.178629,-0.14795,-0.178352,0.948244,-0.016967,-0.04543,0.12482,-0.101947,0.338594,-0.434264,-0.500447,-0.148625,0.294274,0.715475,0.428512
2,0,0,0,1,0,0,0.364088,0.429866,-0.723932,-0.100617,0.832898,0.561835,0.649823,0.210984,1.309685,1.27515,-0.026259,0.266491,0.240672,-0.943805,-1.072489,-0.082275,-0.315484,0.538983,-0.147985,-0.571342,0.718001,0.148629,-0.030019,-0.300591,-0.599662,0.146551,-0.303593,-0.64566,0.815169,-0.896805,0.701253,0.05754,-0.196826,-0.432972,...,0.664704,0.84907,0.376919,-0.051639,0.633538,-0.349542,1.049748,0.799691,0.734326,-0.034238,0.675317,-0.760145,-0.503574,0.50113,-0.228099,-0.357928,0.427537,-0.168199,-0.146277,-0.133557,1.337775,-0.152139,-0.681011,-0.193943,0.360539,-0.734201,-0.064689,-0.024489,0.527592,-0.183166,0.45862,0.264191,-0.267835,0.615222,-0.24862,-0.35382,-0.081186,0.527506,0.941397,0.50903
3,0,0,0,0,0,1,-0.284363,0.681412,-0.762081,-0.591352,0.970405,-0.27823,0.748061,0.396293,1.33757,1.28267,-0.486605,0.171097,0.021715,-1.049415,-0.426292,0.005485,-0.724414,0.290305,0.038919,-0.384002,0.414981,0.753726,-0.471845,0.059407,-0.658568,0.383134,-0.32135,-0.385147,0.699161,-0.551423,0.99718,0.642418,-0.115145,-0.270197,...,0.8734,0.412509,0.052963,0.346859,1.161143,-0.454296,0.957399,0.979748,0.26026,-0.734339,0.819183,-0.581413,-0.165721,0.736496,-0.119108,-0.346906,0.610919,-0.161685,-0.39511,0.02209,1.186009,-0.653418,-0.400897,-0.267109,0.081643,-0.14433,0.029687,0.118011,0.290745,-0.790925,0.42725,0.619488,-0.297721,0.53342,-0.380035,-0.52089,-0.178081,0.801915,0.655478,0.423992
4,0,0,0,1,0,0,0.347709,0.400837,-0.572793,0.195458,1.272772,-0.100759,0.853378,0.225396,1.103274,1.245723,-0.250242,0.372487,0.09905,-1.293151,-0.919797,0.095126,-0.811796,0.456146,-0.043034,-0.511876,0.815692,0.450824,-0.136184,0.191292,-0.47311,0.507055,-0.567763,-0.416371,1.057558,-0.73401,0.697533,0.352259,-0.06854,-0.495115,...,0.949729,0.776088,0.213818,0.214411,0.653507,-0.591095,1.014277,0.625327,0.689631,-0.405434,0.701689,-0.585417,-0.121178,0.967336,0.073504,-0.802106,0.388792,-0.067063,-0.500691,-0.349044,1.128995,-0.575411,-0.204979,-0.147748,0.275575,-0.710441,0.284628,-0.152107,0.15331,-0.629748,0.315525,0.347191,-0.03158,0.615572,-0.189184,-0.736085,-0.12579,0.355433,0.864529,0.508242


In [88]:
df_spacy_vect
# .doc_vect[9]

Unnamed: 0,'Adelaide','Brisbane','Horsham & Grampians','Melbourne','Perth','Sydney',doc_vect
0,0,0,0,0,0,1,"[0.1751815, 0.4683458, -0.5824294, -0.3474471,..."
1,0,0,0,0,0,1,"[0.32048553, 0.07731729, -0.6351257, -0.089653..."
2,0,0,0,1,0,0,"[0.36408767, 0.42986587, -0.7239318, -0.100617..."
3,0,0,0,0,0,1,"[-0.2843631, 0.6814115, -0.7620812, -0.5913523..."
4,0,0,0,1,0,0,"[0.34770873, 0.4008369, -0.5727932, 0.19545771..."
...,...,...,...,...,...,...,...
203,0,0,0,1,0,0,"[0.029668827, 0.5719587, -0.37485087, -0.44034..."
204,0,0,0,0,0,1,"[0.14766331, 0.2796057, -0.55320746, -0.076635..."
205,0,0,0,1,0,0,"[0.15371738, 0.45325887, -0.72338283, -0.14703..."
206,0,0,0,1,0,0,"[0.029668827, 0.5719587, -0.37485087, -0.44034..."


In [122]:
X_train, X_test, y_train, y_test = train_test_split(df_spacy_vect,y)

In [206]:
lbm_regr = LGBMRegressor(boosting_type='gbdt',max_depth=3, n_estimators=1000, learning_rate=0.01)

In [207]:
lbm_regr.fit(X_train,y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [208]:
lbm_regr.score(X_test,y_test)

0.12191631007263891

In [209]:
metrics.r2_score(lbm_regr.predict(X_test),y_test)

-2.607598280361312

In [200]:
metrics.median_absolute_error(lbm_regr.predict(X_test),y_test)

43907.4866722695

In [211]:
ridge = RidgeCV()

In [212]:
ridge.fit(X_train,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [216]:
metrics.median_absolute_error(ridge.predict(X_test),y_test)

50208.33786475504