In [1]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [3]:
df = pd.read_csv("dataset/large_dataset_with_labels.csv", index_col=0)

In [4]:
df['label'] = df['label'].apply(lambda x: '[' + '\'' + str(x)[1:-1] + '\'' + ']')
df['label'] = df['label'].apply(lambda x: ast.literal_eval(x))

In [5]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['label'])
pd.DataFrame(y, columns=multilabel.classes_)

Unnamed: 0,.NET,Android,Artist,Business Analyst,C++,Data Analyst,Data Engineer,Data Science,Design,DevOps,...,Sales,Salesforce,Scala,Scrum Master,Security,Support,Sysadmin,Technical Writing,Unity,iOS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9402,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
tfidf = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1,1))
X = tfidf.fit_transform(df['description'])
X.shape, y.shape

((9404, 10000), (9404, 41))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['description'], df['label'], test_size = 0.2, random_state = 15)
y_train.value_counts()

label
[Marketing]            820
[Other]                661
[JavaScript]           546
[Sales]                483
[DevOps]               405
[PHP]                  395
[Java]                 291
[Node.js]              288
[.NET]                 275
[C++]                  245
[Python]               227
[Design]               221
[Support]              219
[Project Manager]      209
[Business Analyst]     192
[QA Automation]        175
[Product Manager]      143
[Lead Generation]      132
[QA]                   128
[Sysadmin]             121
[Data Engineer]        110
[Artist]               105
[Data Analyst]         103
[Golang]               100
[SEO]                  100
[HR]                    81
[Lead]                  73
[Unity]                 72
[SQL]                   69
[Android]               66
[Data Science]          65
[Recruiter]             65
[iOS]                   64
[Security]              63
[Ruby]                  49
[Technical Writing]     49
[Flutter]             

In [8]:
oversample = SMOTE()
X_resamlpled, y_resampled = oversample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resamlpled, y_resampled, test_size = 0.2, random_state = 15)

In [9]:
sgd = SGDClassifier()
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC()
forest = RandomForestClassifier()

In [10]:
def j_score(y_true, y_pred):
  jaccard = np.minimum(y_true, y_pred).sum(axis = 1)/np.maximum(y_true, y_pred).sum(axis = 1)
  return jaccard.mean()*100


def print_score(y_pred, clf):
  print("Clf: ", clf.__class__.__name__)
  print('Jacard score: {}'.format(j_score(y_test, y_pred)))
  print('----')

In [11]:
from sklearn.metrics import classification_report

for classifier in [sgd, lr, svc]:
  clf = OneVsRestClassifier(classifier)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(classifier)
  print(classification_report(y_test, y_pred))
  # print_score(y_pred, classifier)

SGDClassifier()
              precision    recall  f1-score   support

           0       0.99      0.77      0.87       218
           1       0.99      0.99      0.99       229
           2       0.97      0.93      0.95       202
           3       0.99      0.80      0.89       189
           4       0.98      0.80      0.88       212
           5       0.97      0.92      0.94       223
           6       0.95      0.85      0.90       214
           7       0.98      0.94      0.96       186
           8       0.99      0.89      0.94       217
           9       0.98      0.91      0.94       188
          10       0.98      0.98      0.98       187
          11       1.00      0.89      0.94       192
          12       1.00      0.98      0.99       216
          13       0.99      0.81      0.89       212
          14       0.93      0.53      0.67       207
          15       0.97      0.88      0.93       210
          16       0.92      0.90      0.91       209
          1

  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression()
              precision    recall  f1-score   support

           0       0.99      0.65      0.78       218
           1       1.00      0.95      0.97       229
           2       0.99      0.85      0.91       202
           3       1.00      0.69      0.82       189
           4       0.98      0.61      0.75       212
           5       0.97      0.83      0.90       223
           6       0.94      0.70      0.80       214
           7       0.98      0.87      0.92       186
           8       1.00      0.75      0.85       217
           9       0.99      0.77      0.87       188
          10       0.99      0.94      0.97       187
          11       0.98      0.78      0.87       192
          12       1.00      0.85      0.92       216
          13       1.00      0.69      0.82       212
          14       0.90      0.39      0.54       207
          15       0.99      0.75      0.85       210
          16       0.92      0.81      0.87       209
      

  _warn_prf(average, modifier, msg_start, len(result))


LinearSVC()
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       218
           1       0.99      1.00      0.99       229
           2       0.98      1.00      0.99       202
           3       0.99      0.96      0.98       189
           4       0.99      0.96      0.97       212
           5       0.97      1.00      0.99       223
           6       0.98      0.99      0.98       214
           7       0.98      0.99      0.98       186
           8       0.99      0.97      0.98       217
           9       0.99      0.96      0.98       188
          10       0.99      1.00      1.00       187
          11       0.97      0.99      0.98       192
          12       1.00      1.00      1.00       216
          13       0.99      0.91      0.95       212
          14       0.96      0.72      0.82       207
          15       0.98      0.98      0.98       210
          16       0.95      0.99      0.97       209
          17   

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
for classifier in [LinearSVC(C=2, penalty='l2', dual=False)]:
  clf = OneVsRestClassifier(classifier)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  # print_score(y_pred, classifier)
  print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96       218
           1       0.99      1.00      0.99       229
           2       0.99      1.00      1.00       202
           3       0.99      0.97      0.98       189
           4       0.99      0.97      0.98       212
           5       0.97      1.00      0.99       223
           6       0.98      1.00      0.99       214
           7       0.99      0.99      0.99       186
           8       0.99      0.97      0.98       217
           9       0.99      0.98      0.98       188
          10       0.99      1.00      1.00       187
          11       0.97      0.99      0.98       192
          12       1.00      1.00      1.00       216
          13       0.99      0.92      0.95       212
          14       0.95      0.80      0.87       207
          15       0.97      0.98      0.97       210
          16       0.95      1.00      0.97       209
          17       0.87    

  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
x = ['jetpack kotlin mvvm']
xt = tfidf.transform(x)

decision_values = clf.decision_function(xt)
top_3 = np.argsort(decision_values, axis=1)[:, -3:]


print(multilabel.inverse_transform(clf.predict(xt)))
top_3[0][1]
for i in top_3[0]:
    print(multilabel.classes_[i])

[('Android',)]
Marketing
Other
Android


In [18]:
import joblib

clf = OneVsRestClassifier(LinearSVC(C=2, penalty='l2', dual=False))
clf.fit(X_train, y_train)

joblib.dump(clf, 'skills_analyze_model.pkl')

['skills_analyze_model.pkl']