In [None]:
import numpy as np
import pandas as pd

import re
import string

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression, RidgeCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.preprocessing import MultiLabelBinarizer, Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

import sklearn.metrics as metrics

import spacy
import eli5

from eli5.lime import TextExplainer

In [None]:
# Load the cleaned job-ad table from the working directory.
df = pd.read_csv('./clean_df.csv')

In [None]:
df.head()

In [None]:
# Load spaCy's small English pipeline.
spacy_nlp = spacy.load('en_core_web_sm')

In [None]:
df.head()

In [None]:
# Run the spaCy pipeline over every job description and keep the result per row.
# The later cells read .noun_chunks, .ents and .vector from these objects.
df['spacy_tokens'] = df.job_description.apply(spacy_nlp)

In [None]:
# Working frame: only the parsed descriptions and the salary target.
df_spacy = df[['spacy_tokens', 'salary']]

In [None]:
df_spacy.head()

In [None]:
temp = df_spacy.spacy_tokens[3]

In [None]:
temp.ents

In [None]:
for i in temp.noun_chunks:
    print(i)

### Manually creating a word-count vector based on spaCy's noun chunks

In [None]:
spacy_tokens = df_spacy.spacy_tokens.apply(lambda x: [y.text for y in x.noun_chunks])

In [None]:
biner = MultiLabelBinarizer()
chunk_counts = biner.fit_transform(spacy_tokens)

In [None]:
feature_names = []
for i in biner.classes_:
    feature_names.append(i.replace('\n',''))

In [None]:
feature_names[0]

In [None]:
chonk_df = pd.DataFrame(columns=feature_names,data=chunk_counts)

In [None]:
chonk_df.head()

In [None]:
trimmed_features = chonk_df.sum().sort_values(ascending=False).iloc[:1000]

In [None]:
chonk_trimmed = chonk_df[trimmed_features.index]

In [None]:
list(trimmed_features.index)

In [None]:
chonk_trimmed.drop(['PRIVACY POLICY.http://www.bluefinresources.com.au/privacy-policy', '|', '%', 'https://services.anu.edu.au/human-resources/respect-inclusionApplication information'],axis=1,inplace=True)

In [None]:
chonk_trimmed.head()

In [None]:
chonk_df.shape

In [None]:
y = df_spacy.salary

In [None]:
y_catog = pd.qcut(y,2,[0,1])

In [None]:
chonk_X_train_clas, chonk_X_test_clas, chonk_y_train_clas, chonk_y_test_clas =  train_test_split(chonk_trimmed,y_catog)

In [None]:
chonk_X_train, chonk_X_test, chonk_y_train, chonk_y_test =  train_test_split(chonk_trimmed,y)

### Modelling the manual spaCy noun-chunk vectors (with terrible results)

In [None]:
# NOTE(review): light_regres is not used again in the visible cells.
light_regres = LGBMRegressor()

In [None]:
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell (and the RidgeCV one below) needs an older
# scikit-learn -- confirm the pinned version.
linreg = LinearRegression(normalize=True)

In [None]:
linreg.fit(chonk_X_train,chonk_y_train)

In [None]:
# R^2 on the hold-out set.
linreg.score(chonk_X_test,chonk_y_test)

In [None]:
# Median absolute error on the hold-out set, in the units of the salary column.
metrics.median_absolute_error(chonk_y_test,linreg.predict(chonk_X_test))

In [None]:
# Ridge with the penalty strength chosen by cross-validation from np.linspace(1, 250).
ridge = RidgeCV(normalize=True,alphas=(np.linspace(1,250)))

In [None]:
ridge.fit(chonk_X_train,chonk_y_train)

In [None]:
ridge.score(chonk_X_test,chonk_y_test)

In [None]:
metrics.median_absolute_error(chonk_y_test,ridge.predict(chonk_X_test))

In [None]:
# Random forest with default settings on the binary (median-split) salary target.
rf = RandomForestClassifier()

In [None]:
rf.fit(chonk_X_train_clas,chonk_y_train_clas)

In [None]:
# Accuracy on the hold-out set.
rf.score(chonk_X_test_clas,chonk_y_test_clas)

In [None]:
# Class proportions of the binary target, to compare against the accuracy above.
y_catog.value_counts(normalize=True)

In [None]:
# Print the text of every named entity in the sample document `temp`.
for i in temp.ents:
    print(i.text)
    
    

In [None]:
# One-hot encode the location column; drop_first=True omits the first category.
location_dummies = pd.get_dummies(df.location,drop_first=True)

In [None]:
location_dummies

In [None]:
# Index-aligned join of the location indicators onto the working frame.
# NOTE(review): re-running this cell joins the same columns again -- presumably
# it is meant to be run once per kernel session.
df_spacy = df_spacy.join(location_dummies)

In [None]:
df_spacy.head()

### Modelling on spaCy document vectors

In [None]:
temp = df_spacy.spacy_tokens[0]

In [None]:
temp.vector

In [None]:
df_spacy.head()

In [None]:
df_vect = df_spacy.spacy_tokens.apply(lambda x: pd.Series(x.vector))
# df_spacy.join()
df_vect.columns = ['vect_'+str(col) for col in df_vect.columns]

In [None]:
df_spacy = df_spacy.join(df_vect)

In [None]:
df_spacy.head()

In [None]:
y = df_spacy.salary

In [None]:
df_spacy_vect = df_spacy.drop(['spacy_tokens', 'salary'],axis=1)

In [None]:
df_spacy_vect.head()

### Again terrible model results. Starting to think there is an issue with the raw data — maybe there is not enough of it?

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_spacy_vect,y)

In [None]:
lbm_regr = LGBMRegressor(boosting_type='gbdt',learning_rate=0.05,max_depth=3,n_estimators=1500,num_leaves=15)

In [None]:
lbm_params = {'max_depth': [3,5,6,9,12], 'num_leaves': [15, 31, 63, 127, 255, 511, 1023, 2047, 4095], 'n_estimators': [100,500,1000,1300,1500]}

In [None]:
grid_search = GridSearchCV(lbm_regr,lbm_params,n_jobs=-1)
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
lbm_regr.fit(X_train,y_train)

In [None]:
lbm_regr.score(X_test,y_test)

In [None]:
metrics.r2_score(lbm_regr.predict(X_test),y_test)

In [None]:
metrics.median_absolute_error(lbm_regr.predict(X_test),y_test)

In [None]:
ridge = RidgeCV()

In [None]:
ridge.fit(X_train,y_train)

In [None]:
metrics.median_absolute_error(ridge.predict(X_test),y_test)

In [None]:
y_catog

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_spacy_vect,y_catog)

In [None]:
lbm_clas = LGBMClassifier()

In [None]:
lbm_clas.fit(X_train,y_train)

In [None]:
lbm_clas.score(X_test,y_test)

In [None]:
y_catog.value_counts(normalize=True)

In [None]:
eli5.explain_weights(lbm_clas)

### Trying Naive Bayes on TF-IDF vectors of the job descriptions

In [None]:
t_vect = TfidfVectorizer(stop_words='english')

In [None]:
vect = t_vect.fit_transform(df.job_description)

In [None]:
y = pd.qcut(df.salary,3,[1,2,3])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(vect,y)

In [None]:
Mm_baybay = MultinomialNB()

In [None]:
Mm_baybay.fit(X_train,y_train)

In [None]:
Mm_baybay.score(X_test,y_test)

In [None]:
Mm_baybay.classes_

In [None]:
pipe = make_pipeline(t_vect,Normalizer(),Mm_baybay)

In [None]:
df.job_description[0]

In [None]:
Mm_baybay.fit(X_train,y_train)

In [None]:
te = TextExplainer()
te.fit(df.job_description[0],pipe.predict_proba)

In [None]:
te.show_prediction()

In [None]:
vect.get_feature_names()

### Slightly better results. Still think something is up with raw data.

### Going to import and use pre-prepared job-ad data to run some models, to see whether there is an issue with my scraped data

In [None]:
# Load the pre-prepared job-ad data set.
# NOTE: this rebinds `df`; the scraped data loaded at the top of the notebook
# is no longer reachable under this name from here on.
df = pd.read_csv('./validation.csv')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# Container type of the duplicated text column (dropped further down).
type(df['job_description_all_text.1'])

In [None]:
# Python type of a single Bag_of_words cell.
type(df.Bag_of_words[0])

In [None]:
temp = df.job_description_all_text.iloc[4746]

In [None]:
remove_salary(temp)

In [None]:
temp.isna().sum()

In [None]:
def remove_salary(x):
    x = re.sub('.*\$(\d+,\d).*',string=x,repl='')
    x = re.sub('.*\$(\d+).*',string=x,repl='')
    return x

In [None]:
y = df.salary

In [None]:

# plt

In [None]:
y.plot(kind='density')

In [None]:
df.job_description_all_text = df.job_description_all_text.apply(remove_salary)

In [None]:
df.job_title = df.job_title.apply(remove_salary)

In [None]:
df.iloc[4746]

In [None]:
df.drop(['job_description_all_text.1' ,'Bag_of_words'],inplace=True,axis=1)

In [None]:
df.iloc[4746,0:1]

In [None]:
df.job_title.isna().sum()

In [None]:
dums = pd.get_dummies(df[['company','location','industry']])

In [None]:
df_1 = df.join(dums)

In [None]:
df_1.head()

In [None]:
df_1.drop(['company','location','industry','salary'],inplace=True,axis=1)

In [None]:
df_1.head()

In [None]:
y_binary = pd.qcut(y,2,[0,1])

In [None]:
y_binary

In [None]:
lbm_clasif = LGBMClassifier(objective='binary',learning_rate=0.05)

In [None]:
# Integer max_df / min_df are absolute document counts, not proportions.
t_vect = TfidfVectorizer(max_df=300,max_features=1000,min_df=10,ngram_range=(1,3))
# Params for job_description - max_df=150,max_features=1000,min_df=10,ngram_range=(1,3)
# Params for job_title - max_df=300,max_features=1000,min_df=10,ngram_range=(1,3)

In [None]:
# Text -> TF-IDF -> row normalisation -> LightGBM classifier.
pipe = make_pipeline(t_vect,Normalizer(),lbm_clasif)

In [None]:
# List the parameter names available for the grid below.
pipe.get_params().keys()

In [None]:
params = {'tfidfvectorizer__ngram_range':[(1,3),(1,6)],'tfidfvectorizer__max_df':[300,500,1000],'tfidfvectorizer__max_features': [1000],'tfidfvectorizer__min_df':[10],
         }

In [None]:
grid_pipe = GridSearchCV(pipe, params,verbose=1,n_jobs=-1)

In [None]:
# First search: job description text against the binary salary target.
grid_pipe.fit(df_1.job_description_all_text,y_binary)

In [None]:
grid_pipe.best_params_

In [None]:
# Fits t_vect itself (with the parameters from its constructor call above) on
# the descriptions.
job_descr_tvec = t_vect.fit_transform(df_1.job_description_all_text)

In [None]:
job_descr_tvec

In [None]:
# Second search: the same grid object is refitted on job titles, which
# replaces the results of the first search.
grid_pipe.fit(df_1.job_title,y_binary)

In [None]:
grid_pipe.best_params_

In [None]:
job_title_tvec = t_vect.fit_transform(df_1.job_title)

In [None]:
job_des_df = pd.DataFrame(job_descr_tvec.todense(), columns=t_vect.get_feature_names())

In [None]:
job_des_df.head()

In [None]:
job_title_df = pd.DataFrame(job_title_tvec.todense(), columns=t_vect.get_feature_names())

In [None]:
df_1.head()

In [None]:
df_2 = df_1.drop(['job_description_all_text', 'job_title'],axis=1)

In [None]:
df_2.head()

In [None]:
df_2 = df_2.join(job_des_df)

In [None]:
df_2.head()

In [None]:
df_2.isna().sum()

In [None]:
rating_avg = np.mean(df.company_rating)

In [None]:
df_2.company_rating.fillna(rating_avg,inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_2,y_binary)

In [None]:
clasif_pipe = make_pipeline(Normalizer(),LGBMClassifier(max_depth=6,n_estimators=500,num_leaves=127,learning_rate=0.05,n_jobs=-1))

In [None]:
clasif_params = {'lgbmclassifier__max_depth': [5,7,12], 'lgbmclassifier__num_leaves': [255, 511, 1023, 2047], 'lgbmclassifier__n_estimators': [100,500,1000,1500]}

In [None]:
classif_grid = GridSearchCV(clasif_pipe,clasif_params,verbose=2,n_jobs=-1)

In [None]:
classif_grid.fit(X_train,y_train)

In [None]:
classif_grid.best_params_

In [None]:
clasif_pipe.fit(X_train.values,y_train.values)

In [None]:
clasif_pipe.score(X_test.values,y_test.values)

In [None]:
y_binary.value_counts(normalize=True)

In [None]:
import eli5

In [None]:
eli5.explain_weights(clasif_pipe[-1],feature_names=df_2.columns.tolist())

In [None]:
df_2.columns