<a href="https://colab.research.google.com/github/SimonHeilles/OC/blob/main/P5_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 33.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 44.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from collections import Counter
import tensorflow as tf
import sklearn

#Preprocessing
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

#Train-test split
from sklearn.model_selection import train_test_split

#LDA
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#Feature Extractions
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.models import Word2Vec
import tensorflow_hub as hub
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertModel,
    DistilBertConfig,
)

#Predictions
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


#Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

# Import
# Load the question export; `data` stays untouched and `df` is the working copy.
url = 'https://raw.githubusercontent.com/SimonHeilles/OC/main/QueryResults%20(2).csv'
data = pd.read_csv(url)
df = data.copy()

# Pre-processing
# Strip the HTML markup from the bodies. The parser is named explicitly so the
# result does not depend on which optional parser happens to be installed.
df['Body'] = df['Body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# Replace the angle brackets in the tag strings by spaces, then collapse the
# whitespace so each row holds one space-separated tag string.
spec_chars0 = ["<", ">"]

for char in spec_chars0:
    # regex=False: the characters are literals, not patterns.
    df['Tags'] = df['Tags'].str.replace(char, ' ', regex=False)

df['Tags'] = df['Tags'].str.split().str.join(" ")

# Free-text columns that all receive the same cleaning.
text_columns = ['Title', 'Body']

# Punctuation and digits that are replaced by a space.
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "$", "0", "1",
              "2", "3", "4", "5", "6", "7", "8", "9"]
spec_chars_table = str.maketrans({char: ' ' for char in spec_chars})

# Lower-case, blank out the special characters and collapse whitespace for
# EVERY text column. The replacement must happen inside the column loop,
# otherwise only the last column is cleaned.
for column in text_columns:
    df[column] = (
        df[column]
        .str.lower()
        .str.translate(spec_chars_table)
        .str.split()
        .str.join(" ")
    )

df2 = df.copy()
# A set makes the per-token membership test constant time.
cachedStopWords = set(stopwords.words("english"))

# Tokenize, drop English stop words and re-join the remaining tokens.
for column in text_columns:
    df2[column] = df2[column].apply(
        lambda x: ' '.join(word for word in word_tokenize(x) if word not in cachedStopWords)
    )

# NB : no stemming, doesn't really increase the results

# Preparing the list of tags
df_cv = df2.copy()
df_cv['TitleBody'] = df_cv['Title'] + ' ' + df_cv['Body']

# Every tag occurrence of the corpus, flattened into a single list.
tags_list = [tags.split() for tags in df_cv['Tags']]
flat_list = [item for sublist in tags_list for item in sublist]

# Stored under its own name so the imported `Counter` class is not overwritten.
tag_counts = Counter(flat_list)

no_words = 20 # number of words we accept in the list of tags

most_occur = tag_counts.most_common(no_words)  # [(tag, count), ...]
fdist = dict(most_occur)                       # tag -> count
# Take the names straight from the (tag, count) pairs: running np.unique over
# the pairs would mix the stringified counts in with the tag names.
# Sorted alphabetically so the label columns have a stable order.
list_tags = sorted(fdist)


def _frequent_tags(tags):
    """Return the retained tags (in `list_tags` order) present in a space-separated tag string."""
    present = set(tags.split(" "))
    return [tag for tag in list_tags if tag in present]


df_cv['Tags2'] = df_cv['Tags'].apply(_frequent_tags)

# Questions left without any retained tag cannot be used as training targets.
df_cv = df_cv[df_cv['Tags2'].map(len) > 0]

X = df_cv[['TitleBody']]
y = df_cv[['Tags2']]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per retained tag, in `list_tags` order.
mlb = MultiLabelBinarizer(classes=list_tags)

# The tag lists are read from df_cv through each split's row index rather than
# from y_train / y_test: those two names are rebound to indicator matrices
# right here, so reading from them would make the cell fail on a second run.
# (Assumes df_cv still carries the unique row index the split was made on.)
y_train = mlb.fit_transform(df_cv.loc[X_train.index, 'Tags2'])
y_test = mlb.transform(df_cv.loc[X_test.index, 'Tags2'])

mlb.classes_

array(['584', 'android', 'asp.net', 'c', 'c#', 'c++', 'cocoa-touch',
       'html', 'ios', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'objective-c', 'performance', 'php', 'python', 'sql', 'windows'],
      dtype=object)

In [None]:
# Disabled experiment, kept for reference: it would remove the first entry
# (index 0) of mlb.classes_ and then display the remaining classes.
#mlb.classes_ = np.delete(mlb.classes_, [0])
#mlb.classes_

# Bag of words

In [None]:
# Fresh copies of the split, so this section cannot alter X_train / X_test.
X_train0, X_test0 = X_train.copy(), X_test.copy()

# Binary bag of words (presence/absence) capped at 1000 vocabulary terms.
count_vect = CountVectorizer(binary=True, max_features=1000)

train_documents = X_train0['TitleBody']
test_documents = X_test0['TitleBody']

X_train_counts = count_vect.fit_transform(train_documents)
# Transform only: the vocabulary is the one learned on the training documents.
X_test_counts = count_vect.transform(test_documents)

## Réduction dimensionnelle

In [None]:
# Whitespace-tokenized training documents.
qt = [document.split() for document in X_train0['TitleBody']]
texts = qt

# Token <-> id mapping, then one bag of (token_id, count) pairs per document.
gensim_dictionary = corpora.Dictionary(texts)
gensim_corpus = [gensim_dictionary.doc2bow(tokens) for tokens in texts]
print(gensim_corpus[:3])

# The bags of the first four documents, with ids swapped for their tokens.
[
    [(gensim_dictionary[token_id], count) for token_id, count in bag]
    for bag in gensim_corpus[:4]
]

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 4), (7, 1), (8, 1), (9, 1), (10, 4), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 3), (18, 3), (19, 1), (20, 2), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 3), (31, 3), (32, 3), (33, 4), (34, 3), (35, 1), (36, 1), (37, 1), (38, 1), (39, 5), (40, 5), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 5), (57, 4), (58, 14), (59, 1), (60, 1), (61, 1)], [(23, 2), (62, 1), (63, 9), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 4), (72, 2), (73, 3), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 3), (83, 1), (84, 1), (85, 1), (86, 1), (87, 2), (88, 1), (89, 1), (90, 6), (91, 1), (92, 1), (93, 2), (94, 1), (95, 2), (96, 3), (97, 1)], [(54, 1), (62, 1), (98, 2), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 5), (106, 1), (107, 

[[('according', 1),
  ('achieve', 1),
  ('anyone', 1),
  ('border', 2),
  ('code', 1),
  ('colored', 1),
  ('colorize', 4),
  ('compute', 1),
  ('created', 1),
  ('data', 1),
  ('diagram', 4),
  ('docs', 1),
  ('e', 1),
  ('figure', 1),
  ('fill', 2),
  ('forming', 1),
  ('help', 1),
  ('image', 3),
  ('import', 3),
  ('indicates', 1),
  ('indices', 2),
  ('ints', 1),
  ('list', 2),
  ('make', 1),
  ('matplotlib', 1),
  ('need', 1),
  ('np', 2),
  ('nregions', 1),
  ('numpy', 1),
  ('order', 1),
  ('outside', 3),
  ('plot', 3),
  ('plt', 3),
  ('points', 4),
  ('polygon', 3),
  ('pyplot', 1),
  ('rand', 1),
  ('random', 1),
  ('reasonably', 1),
  ('region', 5),
  ('regions', 5),
  ('remove', 1),
  ('resulting', 1),
  ('scipy', 2),
  ('see', 1),
  ('seem', 1),
  ('set', 1),
  ('shape', 1),
  ('show', 1),
  ('spatial', 2),
  ('tesselation', 1),
  ('think', 1),
  ('tried', 1),
  ('trying', 1),
  ('using', 1),
  ('vertex', 1),
  ('vertices', 5),
  ('vor', 4),
  ('voronoi', 14),
  ('well', 

In [None]:
# Coherence search over the number of LDA topics. It is disabled by default;
# switch RUN_COHERENCE_SEARCH to True to run it again.
RUN_COHERENCE_SEARCH = False

if RUN_COHERENCE_SEARCH:
    for i in range(1, 10):  # calculating and displaying the coherence score
        lda_model = gensim.models.ldamodel.LdaModel(
            corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100,
            update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
        )

        coherence_model_lda = CoherenceModel(
            model=lda_model, texts=qt, dictionary=gensim_dictionary, coherence='c_v')

        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score :', coherence_lda, '// i =', i)

"\ni = 1\n\nwhile i < 10: #calculating and displaying the coherence score\n  lda_model = gensim.models.ldamodel.LdaModel(\n    corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100, \n    update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True\n  )\n\n  coherence_model_lda = CoherenceModel(\n    model=lda_model, texts=qt, dictionary=gensim_dictionary, coherence='c_v')\n  \n  coherence_lda = coherence_model_lda.get_coherence()\n  print('\nCoherence Score :', coherence_lda, '// i =', i)\n  i = i + 1\n"

Coherence Score : 0.36568419717585815 // i = 1

Coherence Score : 0.3937934338982566 // i = 2

Coherence Score : 0.4713507152241547 // i = 3

Coherence Score : 0.48230897208942974 // i = 4

Coherence Score : 0.4919717130549627 // i = 5

Coherence Score : 0.47621885201636943 // i = 6

Coherence Score : 0.4395823651640512 // i = 7

Coherence Score : 0.5053347873156604 // i = 8

Coherence Score : 0.4772248376021915 // i = 9

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics: the coherence scores recorded above peak at i = 8.
N_TOPICS = 8

lda = LatentDirichletAllocation(
        n_components=N_TOPICS,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0)

# Fitted on the bag-of-words matrix of the training documents.
lda.fit(X_train_counts)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=8, random_state=0)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    Args:
        model: object exposing ``components_``, one row of term weights per topic.
        feature_names: sequence mapping a column index to its term.
        no_top_words: number of terms printed per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# Terms shown per topic; the candidate vocabulary itself is bounded by the
# vectorizer's max_features, which is the other knob for adjusting this output.
no_top_words = 10
display_topics(
    model=lda,
    feature_names=count_vect.get_feature_names_out(),
    no_top_words=no_top_words,
)

Topic 0:
return new class code get using string public void method
Topic 1:
programmatically total memory usage select bytes iphone know datetime yes
Topic 2:
use one like would using way code know question also
Topic 3:
net convert asp characters string file web mvc character config
Topic 4:
bundle activity super android override os layout view screen protected
Topic 5:
hash change django default forms without purpose model different allowed
Topic 6:
import column numpy row like print data columns python output
Topic 7:
using like use way code get want would app need


1.   Topic 0 : le vocabulaire autour des objets et des méthodes en programmation.
2.   Topic 1 : une thématique plus portée sur le stockage mémoire et un outsider (iphone)
3.   Topic 2 : assez générique, du texte simple
4.   Topic 3 : le développement web avec asp.net
5.   Topic 4 : développement mobile
6.   Topic 5 : développement web avec django
7.   Topic 6 : librairie numpy
8.   Topic 7 : mots simples



# Prédictions

In [None]:
# Micro-averaged Jaccard index, used as the grid-search selection criterion.
jscore = make_scorer(jaccard_score, average='micro')


def _tune_and_report(label, estimator, param_grid, X_train1, X_test1, y_train1, y_test1):
    """Grid-search one one-vs-rest classifier, print its scores and return the search.

    Args:
        label: name printed as the header of the report.
        estimator: base classifier, wrapped in OneVsRestClassifier.
        param_grid: grid passed to GridSearchCV (keys prefixed with ``estimator__``).
        X_train1, y_train1: data the 3-fold grid search is fitted on.
        X_test1, y_test1: held-out data the printed scores are computed on.

    Returns:
        The fitted GridSearchCV object.
    """
    search = GridSearchCV(OneVsRestClassifier(estimator), param_grid=param_grid,
                          scoring=jscore, cv=3)
    search.fit(X_train1, y_train1)
    pred = search.predict(X_test1)

    print('\n' + label + ':')
    print(search.best_score_)
    print(search.best_params_)
    print('Jaccard score', jaccard_score(y_test1, pred, average='micro'))
    print('Hamming loss', hamming_loss(y_test1, pred))
    print('F1 score', f1_score(y_test1, pred, average='micro'), '\n')
    return search


def preds(X_train1, X_test1, y_train1, y_test1, extra_models=(), lr_max_iter=100):
    """Tune a one-vs-rest logistic regression and report its test scores.

    Args:
        X_train1, X_test1: feature matrices of the training and test documents.
        y_train1, y_test1: matching multi-label indicator matrices.
        extra_models: optional iterable of additional models to tune and report,
            among ``'knn'`` and ``'rfc'``. Empty by default, so only the logistic
            regression is fitted (no time is spent on models nobody looks at).
        lr_max_iter: ``max_iter`` of the logistic regression.

    Returns:
        The fitted GridSearchCV of the logistic regression.

    Raises:
        ValueError: if ``extra_models`` names an unknown model.
    """
    if isinstance(extra_models, str):
        extra_models = (extra_models,)

    optional_models = {
        'knn': ('KNN', KNeighborsClassifier(),
                {'estimator__n_neighbors': [3, 5, 11, 19]}),
        'rfc': ('Random Forest Classifier', RandomForestClassifier(random_state=42),
                {
                    'estimator__n_estimators': [1, 5, 10],  # 5, 10, 15
                    # 'sqrt' is what the deprecated 'auto' meant for classifiers.
                    'estimator__max_features': ['sqrt', 'log2'],
                    'estimator__max_depth': [10, 30, 50],  # 5, 10, 15
                }),
    }

    # Validate before the (slow) logistic-regression search starts.
    unknown = [key for key in extra_models if key not in optional_models]
    if unknown:
        raise ValueError(
            "Unknown model(s) %s; expected a subset of %s"
            % (unknown, sorted(optional_models))
        )

    splits = (X_train1, X_test1, y_train1, y_test1)

    lr_tuning = _tune_and_report(
        'Logistic Regression',
        LogisticRegression(max_iter=lr_max_iter),
        {"estimator__C": np.logspace(-3, 3, 7)},
        *splits,
    )

    for key in extra_models:
        _tune_and_report(*optional_models[key], *splits)

    return lr_tuning

# Disabled: `pred` appears to exist only inside preds(), so this line could not run here as written.
#mlb.inverse_transform(pred)
# Tune and evaluate on the bag-of-words features; the model returned by preds() is kept for pickling.
selected_model = preds(X_train_counts, X_test_counts, y_train, y_test)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Logistic Regression:
0.39722188201932523
{'estimator__C': 10.0}
Jaccard score 0.4083646616541353
Hamming loss 0.049295223179326546
F1 score 0.5799132465799133 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


In [None]:
# Persist the label binarizer together with the tuned bag-of-words model.
# NOTE(review): count_vect is not part of this pickle — confirm that whatever
# loads the file has another way to vectorize new text.
with open('model_bag_of_words.pkl', 'wb') as f:
    pickle.dump((mlb, selected_model), f)

# Word2Vec

In [None]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

# Tokenize BOTH splits. The feature helpers iterate over each document, so a
# document left as a plain string would be embedded character by character.
X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: x.split())
X_test0['TitleBody'] = X_test0['TitleBody'].apply(lambda x: x.split())

# The embedding model is trained on the training documents only.
wv = Word2Vec(X_train0['TitleBody'], min_count=2)

def get_vect(word, model):
    """Return the embedding of ``word``, or a zero vector when the lookup fails.

    Args:
        word: key looked up in ``model.wv``.
        model: object exposing ``wv`` (indexable by word) and ``vector_size``.

    Returns:
        ``model.wv[word]``, or ``np.zeros((model.vector_size,))`` if that
        lookup raises KeyError.
    """
    try:
        vector = model.wv[word]
    except KeyError:
        # Unknown word: contribute nothing to a sum of vectors.
        vector = np.zeros((model.vector_size,))
    return vector

def sum_vectors(phrase, model):
    """Return the element-wise sum of the embeddings of the words in ``phrase``.

    Args:
        phrase: iterable of words (one tokenized document).
        model: embedding model, passed through to get_vect.

    Returns:
        A vector of length ``model.vector_size``. An empty phrase yields a zero
        vector rather than the int ``0`` that a bare ``sum()`` would give, so
        the result can always be stacked into a feature matrix.
    """
    vectors = [get_vect(w, model) for w in phrase]
    if not vectors:
        return np.zeros((model.vector_size,))
    return sum(vectors)

def word2vec_features(X, model):
    """Build a feature matrix with one row per document of ``X``.

    Args:
        X: iterable of documents, each passed to sum_vectors.
        model: embedding model, passed through to sum_vectors.

    Returns:
        The per-document vectors stacked vertically with np.vstack.
    """
    return np.vstack([sum_vectors(document, model) for document in X])

# One summed-embedding row per document for each split, using the model stored in `wv`.
wv_train_feat = word2vec_features(X_train0["TitleBody"], wv)
wv_test_feat = word2vec_features(X_test0["TitleBody"], wv)

## Prédictions

In [None]:
# Tune and evaluate on the Word2Vec features; the returned model is not stored.
preds(wv_train_feat, wv_test_feat, y_train, y_test)
#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:



Logistic Regression:
0.20993833683101723
{'estimator__C': 1.0}
Jaccard score 0.03874300473525613
Hamming loss 0.17486296006264682
F1 score 0.07459593866556154 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])



KNN:
0.10360101756993495
{'estimator__n_neighbors': 3}
Jaccard score 0.04032258064516129
Hamming loss 0.09784651527016444
F1 score 0.07751937984496124 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Random Forest Classifier:
0.1282553798074925
{'estimator__max_depth': 30, 'estimator__max_features': 'auto', 'estimator__n_estimators': 1}
Jaccard score 0.05685664004331935
Hamming loss 0.27278778386844166
F1 score 0.10759574740617395 



0.10759574740617395

# USE

In [None]:
# Fresh copies of the split, so this section cannot alter X_train / X_test.
X_train0, X_test0 = X_train.copy(), X_test.copy()

# Universal Sentence Encoder, loaded from TF Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Each split is embedded in one call on its list of raw strings, then the
# result is converted to a NumPy array.
X_train_embed = np.array(embed(X_train0['TitleBody'].to_list()))
X_test_embed = np.array(embed(X_test0['TitleBody'].to_list()))

## Prédictions

In [None]:
# Tune and evaluate on the USE embeddings; the model returned by preds() is kept for pickling.
selected_model = preds(X_train_embed, X_test_embed, y_train, y_test)
#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Logistic Regression:
0.5003124089604681
{'estimator__C': 100.0}
Jaccard score 0.4995247148288973
Hamming loss 0.04122944400939702
F1 score 0.6662440570522979 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


In [None]:
# Persist the label binarizer together with the model tuned on the USE embeddings.
# NOTE(review): the encoder (`embed`) is not part of this pickle — confirm that
# whatever loads the file embeds new text with the same encoder.
with open('use_lr.pkl', 'wb') as f:
    pickle.dump((mlb, selected_model), f)

## BERT

In [None]:
# Fresh copies of the split, so this section cannot alter X_train / X_test.
X_train0, X_test0 = X_train.copy(), X_test.copy()

# Using DistilBERT:
model_class = TFDistilBertModel
tokenizer_class = DistilBertTokenizerFast
pretrained_weights = 'distilbert-base-uncased'

# Tokenizer matching the pretrained checkpoint named above.
pretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def get_pretrained_bert_model(config=pretrained_weights):
    """Load the pretrained model selected by ``model_class`` / ``pretrained_weights``.

    Args:
        config: passed as ``config=`` to ``from_pretrained``. A falsy value is
            replaced by ``DistilBertConfig(num_labels=2)``.

    Returns:
        The result of ``model_class.from_pretrained(pretrained_weights, config=...)``.
    """
    effective_config = config if config else DistilBertConfig(num_labels=2)
    return model_class.from_pretrained(pretrained_weights, config=effective_config)

def tokenize_encode(questions, max_length=None):
    """Encode ``questions`` with the module-level ``pretrained_bert_tokenizer``.

    Args:
        questions: texts handed to the tokenizer.
        max_length: forwarded as the tokenizer's ``max_length``; inputs are
            truncated and padded with ``padding="max_length"``.

    Returns:
        The tokenizer output, requested as TensorFlow tensors.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
    )
    return pretrained_bert_tokenizer(questions, **tokenizer_options)

# The lengths are fixed explicitly (rather than relying on padding=True in the
# tokenizer); otherwise the train questions end up with length 71 and the
# validation questions with length 70, which causes problems/warnings.
max_length_question = 72
max_length_keyword = 8

train_texts = X_train0["TitleBody"].to_list()
validation_texts = X_test0["TitleBody"].to_list()

train_questions_encoded = tokenize_encode(train_texts, max_length=max_length_question)
validation_questions_encoded = tokenize_encode(validation_texts, max_length=max_length_question)
train_inputs_encoded = dict(train_questions_encoded)
validation_inputs_encoded = dict(validation_questions_encoded)

# (encoded inputs, labels) pairs as tf.data datasets.
make_dataset = tf.data.Dataset.from_tensor_slices

train_dataset = make_dataset((dict(train_questions_encoded), y_train))
val_dataset = make_dataset((dict(validation_questions_encoded), y_test))

train_multi_input_dataset = make_dataset((train_inputs_encoded, y_train))
val_multi_input_dataset = make_dataset((validation_inputs_encoded, y_test))

# L2-normalized unigram TF-IDF, with the vocabulary fitted on the training
# documents only. These dense matrices are what the following cell passes to preds().
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    min_df=1,
    ngram_range=(1, 1),
    norm="l2",
)

train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train0["TitleBody"])
train_vectors = train_tfidf.toarray()
validation_tfidf = tfidf_vectorizer.transform(X_test0["TitleBody"])
validation_vectors = validation_tfidf.toarray()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Prédictions

In [None]:
# NOTE(review): train_vectors / validation_vectors are the TF-IDF matrices built
# in the previous cell, not DistilBERT embeddings — the scores reported for this
# section therefore describe a TF-IDF model. The returned model is not stored.
preds(train_vectors, validation_vectors, y_train, y_test)
#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Logistic Regression:
0.3330090125222063
{'estimator__C': 1000.0}
Jaccard score 0.4127234490010515
Hamming loss 0.043735317149569304
F1 score 0.5842947525120953 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])



KNN:
0.3005766909727892
{'estimator__n_neighbors': 3}
Jaccard score 0.3140877598152425
Hamming loss 0.058144087705559906
F1 score 0.4780316344463971 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Random Forest Classifier:
0.1866504997453352
{'estimator__max_depth': 50, 'estimator__max_features': 'auto', 'estimator__n_estimators': 10}
Jaccard score 0.19868995633187772
Hamming loss 0.05747846515270164
F1 score 0.33151183970856096 



0.5842947525120953