In [None]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import nltk
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources the notebook relies on:
#   stopwords -> stopwords.words("english")
#   wordnet   -> WordNetLemmatizer
#   punkt     -> nltk.word_tokenize on older NLTK releases
#   punkt_tab -> nltk.word_tokenize on newer NLTK releases, which load the
#                tokenizer tables from this resource instead of 'punkt'
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Read the dataset and discard the columns that are not used further down.
unused_columns = ['posting_date', 'company_rating', 'company', 'location', 'url']
df = pd.read_csv('data_bank.csv').drop(columns=unused_columns)

In [None]:
def clean_text(text):
    """Normalise one raw document into lower-case, lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace each listed punctuation character, and each run of two or
         more whitespace characters, with a single space;
      3. lemmatise every whitespace-separated token with WordNetLemmatizer;
      4. spell out the digits 1-9 (character by character);
      5. drop tokens that are shorter than two characters.

    Args:
        text: the raw document. Any value that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell) is treated
            as an empty document.

    Returns:
        str: the remaining tokens joined by single spaces; may be empty.
    """
    # Non-string values have no .lower(); treat them as "no text".
    if not isinstance(text, str):
        return ''

    # Raw string so that `\s` is a regex escape rather than an invalid string
    # escape, with the square brackets escaped explicitly so the pattern does
    # not trigger the "possible nested set" FutureWarning.
    noise = r'[.,:;_%©?*!@#$%^&()+=\[\]/"-]|\s{2,}'
    text = re.sub(noise, ' ', text.lower())

    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(token) for token in text.split())

    # The mapping is applied per character, not per token: '25' becomes
    # 'twofive', and '0' has no entry so it is left as-is.
    digit_names = {'1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
                   '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'}
    text = ''.join(digit_names.get(ch, ch) for ch in text)

    return ' '.join(token for token in text.split() if len(token) > 1)

def get_prepared_text(text):
    """Clean a document and strip stop words and letter-free tokens.

    The text is run through ``clean_text`` and tokenised with
    ``nltk.word_tokenize``. A token is kept only when it is not in the
    module-level ``stop_words`` collection and contains at least one ASCII
    letter.

    Returns:
        str: the kept tokens, each followed by one space (so a non-empty
        result ends with a trailing space); '' when nothing is kept.
    """
    has_letter = re.compile('[a-zA-Z]')
    kept = [
        token
        for token in nltk.word_tokenize(clean_text(text))
        if token not in stop_words and has_letter.search(token)
    ]
    return ''.join(token + ' ' for token in kept)

def get_vectorizer(df, min_df, max_df, ngram_range):
    """Fit a TF-IDF vectorizer on the first column of ``df``.

    Args:
        df: DataFrame whose first column holds one text document (``str``)
            per row; each document is lower-cased and stripped before fitting.
        min_df: forwarded unchanged to ``TfidfVectorizer``.
        max_df: forwarded unchanged to ``TfidfVectorizer``.
        ngram_range: forwarded unchanged to ``TfidfVectorizer``.

    Returns:
        The fitted ``TfidfVectorizer``.

    The module-level ``stop_words`` collection is read at call time and used
    as the vectorizer's stop list.
    """
    tfidf_vectorizer = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        # scikit-learn documents `stop_words` as 'english', a list or None;
        # releases with parameter validation reject a set, so pass a list
        # (sorted only to make the order deterministic).
        stop_words=sorted(stop_words),
        use_idf=True,
    )
    msg = [text.lower().strip() for text in df.iloc[:, 0]]
    tfidf_vectorizer.fit(msg)
    return tfidf_vectorizer

In [None]:
%%time
# Start from NLTK's English stop words; the set is extended below.
stop_words = set(stopwords.words("english"))

# Pre-clean the 'text' column so the corpus-specific stop words are computed
# on the same token form that is used later on.
to_stop = pd.DataFrame(df['text'])
to_stop['text'] = to_stop['text'].map(get_prepared_text)

# min_df=0.3 keeps only terms present in at least 30% of the documents;
# those very frequent terms are added to the stop list.
tfidf_vectorizer = get_vectorizer(to_stop, min_df=0.3, max_df=1.0, ngram_range=(1, 1))

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    s = list(tfidf_vectorizer.get_feature_names_out())
else:
    s = tfidf_vectorizer.get_feature_names()
stop_words.update(s)
print('Words added to stoplist: '+str(len(s)))

# Clean the 'description' column and keep it as a one-column frame.
data = df['description'].map(get_prepared_text).to_frame()

# Fit the vocabulary on the cleaned descriptions (terms present in at least
# 1% and at most 50% of the documents), then encode the same documents.
tfidf_vectorizer = get_vectorizer(data, min_df=0.01, max_df=0.5, ngram_range=(1, 1))
msg = [text.lower().strip() for text in data[data.columns[0]]]
tfidf_matrix = tfidf_vectorizer.transform(msg)
print(tfidf_matrix.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the sparse TF-IDF matrix is
# densified with toarray() before splitting.
# random_state pins the shuffle so the split, and every score computed on
# it, is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix.toarray(),
    df['target'].values,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [None]:
# Hyper-parameter grid for CatBoostClassifier; every key is a constructor
# keyword argument and every value lists the candidates to try.
grid = {
    # CatBoost documents this loss as 'Logloss' (lower-case second "l").
    'loss_function' : ['CrossEntropy','Logloss'],
    # NOTE(review): eval_metric is only reported/used with an evaluation set;
    # if fit() is called without one, the three values presumably yield the
    # same model and triple the number of fits -- confirm before a long run.
    'eval_metric' : ['F1','Recall','Accuracy'],
    'max_depth' : [6,8,10],
    'boosting_type' : ['Ordered'],
    'num_trees' : [750,1000],
    'learning_rate' : [0.003,0.005,0.007],
}

from itertools import product

# Cartesian product of all candidate lists: one dict per combination, with
# keys in the same order as `grid` (2 * 3 * 3 * 1 * 2 * 3 = 108 dicts).
all_params = [dict(zip(grid, values)) for values in product(*grid.values())]

In [None]:
from tqdm import tqdm
import random
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from catboost import CatBoostClassifier

# Exhaustive search: fit one CatBoost model per parameter combination and
# score it on the held-out split. To try only a subset, iterate over
# random.sample(all_params, k) with k <= len(all_params) instead.
results = []  # one [accuracy, recall, precision, f1] row per combination
for params in tqdm(all_params):
    print(params)
    # Every key of `params` is a CatBoostClassifier keyword argument, so the
    # combination is passed through as-is.
    model = CatBoostClassifier(verbose=False, task_type="GPU", **params)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Accuracy and binary F1 are
    # symmetric, but with the arguments reversed recall and precision are
    # reported under each other's name.
    acc = accuracy_score(y_test, pred)
    rec = recall_score(y_test, pred)
    prec = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    results.append([acc, rec, prec, f1])
    print()
    print('Accuracy ', acc)
    print('Recall ', rec)
    print('Precision ', prec)
    print('F1 ', f1)


In [None]:
# One row per evaluated parameter combination, in the order they were run.
res = pd.DataFrame(results, columns=['acc', 'rec', 'prec', 'f1'])

# Draw every metric as its own line against the combination index.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(figsize=(16, 6))
    for metric in res.columns:
        ax.plot(res.index, res[metric], label=metric)
    ax.legend(loc='upper right')

In [None]:
# Final model: one fixed configuration, refit on the complete TF-IDF matrix
# (no rows held out).
model = CatBoostClassifier(
    loss_function='CrossEntropy',
    eval_metric='Accuracy',
    max_depth=10,
    boosting_type='Ordered',
    num_trees=850,
    learning_rate=0.003,
    task_type="GPU",
)
model.fit(tfidf_matrix.toarray(), df['target'].values)

import pickle

filename = 'model.pkl'
# Context manager so the file is flushed and closed before it is offered
# for download.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# `files` is not defined in any visible cell; files.download(...) matches the
# google.colab.files helper, so it is presumably what was intended. Import it
# here and skip the browser download when not running inside Colab.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(filename)
else:
    print('google.colab is not available; model saved locally as ' + filename)