<a href="https://colab.research.google.com/github/SimonHeilles/OC/blob/main/P5_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from collections import Counter
import tensorflow as tf
import sklearn

#Preprocessing
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

#Train-test split
from sklearn.model_selection import train_test_split

#Feature Extractions
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.models import Word2Vec
import tensorflow_hub as hub
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertModel,
    DistilBertConfig,
)

#Predictions
from sklearn.multiclass import OneVsRestClassifier

from sklearn.ensemble import RandomForestClassifier

#Metrics
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

# Import
url = 'https://raw.githubusercontent.com/SimonHeilles/OC/main/QueryResults%20(2).csv'
data = pd.read_csv(url)
df = data.copy()

# Pre-processing
df['Body'] = df['Body'].apply(lambda x: BeautifulSoup(x).get_text())
df['Tags'] = df['Tags'].str.replace("<", ' ')
df['Tags'] = df['Tags'].str.replace(">", ' ')
df['Tags'] = df['Tags'].str.split().str.join(" ")

text_columns = df[['Title', 'Body']]

for column in text_columns:
  df[column] = df[column].str.lower()

for column in text_columns:
  spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "$"]

for char in spec_chars:
    df[column] = df[column].str.replace(char, ' ')

for column in text_columns:
  df[column] = df[column].str.split().str.join(" ")

df2 = df.copy()
cachedStopWords = stopwords.words("english")

for column in text_columns:
  df2[column] = df2[column].apply(lambda x: [str(word) for word in word_tokenize(x) if not word in cachedStopWords])

for column in text_columns:
  df2[column] = df2[column].apply(lambda x: ' '.join(x))

# Preparing the list of tags
df_cv = df2.copy()
df_cv['TitleBody'] = df_cv['Title'] + ' ' + df_cv['Body']
tags_list = []

for words_list in df_cv['Tags']:
  tags_list.append(words_list.split())

flat_list = [item for sublist in tags_list for item in sublist]

Counter = Counter(flat_list)

no_words = 20 # number of words we accept in the list of tags

most_occur = Counter.most_common(no_words)
fdist=dict(zip(*np.unique(most_occur, return_counts=True)))
list_tags = list(fdist)[-no_words:]

df_cv['Tags2'] = df_cv['Tags'].apply(lambda x: [tag for tag in list_tags if tag in x.split(" ")])

index_list = []

for i, row in df_cv.iterrows():
  if len(row['Tags2']) == 0:
    index_list.append(i) 

df_cv.drop(index_list, axis=0, inplace=True)

X = df_cv[['TitleBody']]
y = df_cv[['Tags2']]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=list_tags)
y_train = mlb.fit_transform(y_train['Tags2'])
y_test = mlb.transform(y_test['Tags2'])

mlb.classes_

array(['584', 'android', 'asp.net', 'c', 'c#', 'c++', 'cocoa-touch',
       'html', 'ios', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'objective-c', 'performance', 'php', 'python', 'sql', 'windows'],
      dtype=object)

# Bag of word

In [4]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

count_vect = CountVectorizer(max_features=1000, binary=True)

X_train_counts = count_vect.fit_transform(X_train0['TitleBody'])
X_test_counts = count_vect.transform(X_test0['TitleBody']) # transform seulement

# Prédictions

In [5]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(X_train_counts, y_train)

pred=rfc.predict(X_test_counts)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.29797703663203934
Hamming loss 0.05027407987470634
F1 score 0.4591406908171863


# Word2Vec

In [6]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: x.split())
wv = Word2Vec(X_train0['TitleBody'], min_count=2)

def get_vect(word, model):
    try:
        return model.wv[word]
    except KeyError:
        return np.zeros((model.vector_size,))

def sum_vectors(phrase, model):
    return sum(get_vect(w, model) for w in phrase)

def word2vec_features(X, model):
    feats = np.vstack([sum_vectors(p, model) for p in X])
    return feats

wv_train_feat = word2vec_features(X_train0["TitleBody"], wv)
wv_test_feat = word2vec_features(X_test0["TitleBody"], wv)

## Prédictions

In [7]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(wv_train_feat, y_train)

pred=rfc.predict(wv_test_feat)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.013238289205702648
Hamming loss 0.07588097102584182
F1 score 0.026130653266331662


# USE

In [8]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: [x])

X_train_embed = X_train0['TitleBody'].to_list()
X_train_embed = embed(X_train_embed)
X_train_embed = np.array(X_train_embed)

X_test_embed = X_test0['TitleBody'].to_list()
X_test_embed = embed(X_test_embed)
X_test_embed = np.array(X_test_embed)

## Prédictions

In [9]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(X_train_embed, y_train)

pred=rfc.predict(X_test_embed)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.3656957928802589
Hamming loss 0.04604541895066562
F1 score 0.5355450236966824


## BERT

In [10]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

# Using DistilBERT:
model_class, tokenizer_class, pretrained_weights = (TFDistilBertModel, DistilBertTokenizerFast, 'distilbert-base-uncased')

pretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def get_pretrained_bert_model(config=pretrained_weights):
    if not config:
        config = DistilBertConfig(num_labels=2)

    return model_class.from_pretrained(pretrained_weights, config=config)

def tokenize_encode(tweets, max_length=None):
    return pretrained_bert_tokenizer(
        tweets,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
    )

# need to be explicit about the lengths (instead of just specifying padding=True in the tokenizer)
# otherwise train tweets end up being 71 and validation tweets end up as 70, which causes problems/warnings
max_length_tweet = 72
max_length_keyword = 8

train_tweets_encoded = tokenize_encode(X_train0["TitleBody"].to_list(), max_length_tweet) 
validation_tweets_encoded = tokenize_encode(X_test0["TitleBody"].to_list(), max_length_tweet) 
train_inputs_encoded = dict(train_tweets_encoded)
validation_inputs_encoded = dict(validation_tweets_encoded)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_tweets_encoded), y_train))

val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(validation_tweets_encoded), y_test))

train_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (train_inputs_encoded, y_train))

val_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (validation_inputs_encoded, y_test))

tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    min_df=1, ngram_range=(1, 1), norm="l2")

train_vectors = tfidf_vectorizer.fit_transform(raw_documents=X_train0["TitleBody"]).toarray()
validation_vectors = tfidf_vectorizer.transform(X_test0["TitleBody"]).toarray()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Prédictions

In [11]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(X_train_embed, y_train)

pred=rfc.predict(X_test_embed)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.3656957928802589
Hamming loss 0.04604541895066562
F1 score 0.5355450236966824
