<a href="https://colab.research.google.com/github/SimonHeilles/OC/blob/main/P5_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from collections import Counter
import tensorflow as tf
import sklearn

#Preprocessing
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

#Train-test split
from sklearn.model_selection import train_test_split

#LDA
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#Feature Extractions
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.models import Word2Vec
import tensorflow_hub as hub
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertModel,
    DistilBertConfig,
)

#Predictions
from sklearn.multiclass import OneVsRestClassifier

from sklearn.ensemble import RandomForestClassifier

#Metrics
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

# Import
url = 'https://raw.githubusercontent.com/SimonHeilles/OC/main/QueryResults%20(2).csv'
data = pd.read_csv(url)
df = data.copy()

# Pre-processing
df['Body'] = df['Body'].apply(lambda x: BeautifulSoup(x).get_text())
df['Tags'] = df['Tags'].str.replace("<", ' ')
df['Tags'] = df['Tags'].str.replace(">", ' ')
df['Tags'] = df['Tags'].str.split().str.join(" ")

text_columns = df[['Title', 'Body']]

for column in text_columns:
  df[column] = df[column].str.lower()

for column in text_columns:
  spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "$"]

for char in spec_chars:
    df[column] = df[column].str.replace(char, ' ')

for column in text_columns:
  df[column] = df[column].str.split().str.join(" ")

df2 = df.copy()
cachedStopWords = stopwords.words("english")

for column in text_columns:
  df2[column] = df2[column].apply(lambda x: [str(word) for word in word_tokenize(x) if not word in cachedStopWords])

for column in text_columns:
  df2[column] = df2[column].apply(lambda x: ' '.join(x))

# NB : no stemming, doesn't really increase the results

# Preparing the list of tags
df_cv = df2.copy()
df_cv['TitleBody'] = df_cv['Title'] + ' ' + df_cv['Body']
tags_list = []

for words_list in df_cv['Tags']:
  tags_list.append(words_list.split())

flat_list = [item for sublist in tags_list for item in sublist]

Counter = Counter(flat_list)

no_words = 20 # number of words we accept in the list of tags

most_occur = Counter.most_common(no_words)
fdist=dict(zip(*np.unique(most_occur, return_counts=True)))
list_tags = list(fdist)[-no_words:]

df_cv['Tags2'] = df_cv['Tags'].apply(lambda x: [tag for tag in list_tags if tag in x.split(" ")])

index_list = []

for i, row in df_cv.iterrows():
  if len(row['Tags2']) == 0:
    index_list.append(i) 

df_cv.drop(index_list, axis=0, inplace=True)

X = df_cv[['TitleBody']]
y = df_cv[['Tags2']]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=list_tags)
y_train = mlb.fit_transform(y_train['Tags2'])
y_test = mlb.transform(y_test['Tags2'])

mlb.classes_

array(['584', 'android', 'asp.net', 'c', 'c#', 'c++', 'cocoa-touch',
       'html', 'ios', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'objective-c', 'performance', 'php', 'python', 'sql', 'windows'],
      dtype=object)

# Bag of word

In [4]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

count_vect = CountVectorizer(max_features=1000, binary=True)

X_train_counts = count_vect.fit_transform(X_train0['TitleBody'])
X_test_counts = count_vect.transform(X_test0['TitleBody']) # transform seulement

## Réduction dimensionnelle

In [5]:
qt = [d.split() for d in X_train0['TitleBody']]
gensim_dictionary = corpora.Dictionary(qt)
texts = qt
gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]
print(gensim_corpus[:3])

[[(gensim_dictionary[id], freq) for id, freq in cp] for cp in gensim_corpus[:4]] 

[[(0, 3), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 4), (11, 1), (12, 1), (13, 1), (14, 4), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 3), (22, 3), (23, 1), (24, 2), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 3), (35, 3), (36, 3), (37, 4), (38, 3), (39, 1), (40, 1), (41, 1), (42, 1), (43, 5), (44, 5), (45, 1), (46, 1), (47, 2), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 5), (61, 4), (62, 14), (63, 1), (64, 1), (65, 1)], [(27, 2), (66, 1), (67, 9), (68, 1), (69, 2), (70, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 4), (76, 2), (77, 3), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 3), (87, 1), (88, 1), (89, 1), (90, 1), (91, 2), (92, 1), (93, 1), (94, 6), (95, 1), (96, 1), (97, 2), (98, 1), (99, 2), (100, 3), (101, 1)], [(0, 1), (58, 1), (66, 1), (102, 2), (103, 1), (104, 1), (105, 1), (106, 1)

[[('1', 3),
  ('15', 1),
  ('2', 1),
  ('2d', 2),
  ('according', 1),
  ('achieve', 1),
  ('anyone', 1),
  ('border', 2),
  ('code', 1),
  ('colored', 1),
  ('colorize', 4),
  ('compute', 1),
  ('created', 1),
  ('data', 1),
  ('diagram', 4),
  ('docs', 1),
  ('e', 1),
  ('figure', 1),
  ('fill', 2),
  ('forming', 1),
  ('help', 1),
  ('image', 3),
  ('import', 3),
  ('indicates', 1),
  ('indices', 2),
  ('ints', 1),
  ('list', 2),
  ('make', 1),
  ('matplotlib', 1),
  ('need', 1),
  ('np', 2),
  ('nregions', 1),
  ('numpy', 1),
  ('order', 1),
  ('outside', 3),
  ('plot', 3),
  ('plt', 3),
  ('points', 4),
  ('polygon', 3),
  ('pyplot', 1),
  ('rand', 1),
  ('random', 1),
  ('reasonably', 1),
  ('region', 5),
  ('regions', 5),
  ('remove', 1),
  ('resulting', 1),
  ('scipy', 2),
  ('see', 1),
  ('seem', 1),
  ('set', 1),
  ('shape', 1),
  ('show', 1),
  ('spatial', 2),
  ('tesselation', 1),
  ('think', 1),
  ('tried', 1),
  ('trying', 1),
  ('using', 1),
  ('vertex', 1),
  ('vertices'

In [6]:
i = 1

while i < 10: #calculating and displaying the coherence score
  lda_model = gensim.models.ldamodel.LdaModel(
    corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100, 
    update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
  )

  coherence_model_lda = CoherenceModel(
    model=lda_model, texts=qt, dictionary=gensim_dictionary, coherence='c_v')
  
  coherence_lda = coherence_model_lda.get_coherence()
  print('\nCoherence Score :', coherence_lda, '// i =', i)
  i = i + 1


Coherence Score : 0.36568419717585815 // i = 1

Coherence Score : 0.3937934338982566 // i = 2

Coherence Score : 0.4713507152241547 // i = 3

Coherence Score : 0.48230897208942974 // i = 4

Coherence Score : 0.4919717130549627 // i = 5

Coherence Score : 0.47621885201636943 // i = 6

Coherence Score : 0.4395823651640512 // i = 7

Coherence Score : 0.5053347873156604 // i = 8

Coherence Score : 0.4772248376021915 // i = 9


In [7]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(
        n_components=8, # 8 is the best feat according to the results above (highest coherence score is the 7th iteration)
        max_iter=5, 
        learning_method='online', 
        learning_offset=50.,
        random_state=0)

lda.fit(X_train_counts)

array([[ 0.15521493,  0.42677282,  0.16711351, ...,  0.17815212,
         0.15227167,  0.16285152],
       [ 5.13533656, 12.44873603, 12.91274601, ..., 18.12142426,
        22.31397659,  6.30779503],
       [17.27110235, 65.92815089, 17.62842508, ...,  2.10825935,
        20.31648797, 49.34491423],
       ...,
       [ 0.59921003, 14.54610027, 40.45331463, ..., 18.02052474,
        19.48397963,  2.55916438],
       [ 1.25917293, 41.52452993, 12.30689525, ...,  0.15608339,
         0.89125629,  7.81036768],
       [ 0.15249747,  0.3164751 ,  0.3878486 , ...,  0.15413686,
         0.15278786,  0.15168206]])

In [8]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        #print(topic)

no_top_words = 10
display_topics(lda, count_vect.get_feature_names_out(), no_top_words) # we can adjust the output by playing with max features

Topic 0:
equivalent php use mac go line using allows specific hash
Topic 1:
new class get public code using return string function use
Topic 2:
using use like way would get file code need one
Topic 3:
table sql database select query column data visual using like
Topic 4:
image view ios height width like text screen size set
Topic 5:
code int use one using like way return function performance
Topic 6:
32 convert bit output need 64 got 16 display float


OBSOLETE : On a du développement (web en partie avec deux langages web) avec git, et plutôt du vocabulaire avec tout ce qui concerne les bases de données.

# Prédictions

In [9]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(X_train_counts, y_train)

pred=rfc.predict(X_test_counts)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.29797703663203934
Hamming loss 0.05027407987470634
F1 score 0.4591406908171863


# Word2Vec

In [10]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: x.split())
wv = Word2Vec(X_train0['TitleBody'], min_count=2)

def get_vect(word, model):
    try:
        return model.wv[word]
    except KeyError:
        return np.zeros((model.vector_size,))

def sum_vectors(phrase, model):
    return sum(get_vect(w, model) for w in phrase)

def word2vec_features(X, model):
    feats = np.vstack([sum_vectors(p, model) for p in X])
    return feats

wv_train_feat = word2vec_features(X_train0["TitleBody"], wv)
wv_test_feat = word2vec_features(X_test0["TitleBody"], wv)

## Prédictions

In [11]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(wv_train_feat, y_train)

pred=rfc.predict(wv_test_feat)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.017671517671517672
Hamming loss 0.0740015661707126
F1 score 0.03472931562819203


# USE

In [12]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: [x])

X_train_embed = X_train0['TitleBody'].to_list()
X_train_embed = embed(X_train_embed)
X_train_embed = np.array(X_train_embed)

X_test_embed = X_test0['TitleBody'].to_list()
X_test_embed = embed(X_test_embed)
X_test_embed = np.array(X_test_embed)

## Prédictions

In [13]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(X_train_embed, y_train)

pred=rfc.predict(X_test_embed)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.3656957928802589
Hamming loss 0.04604541895066562
F1 score 0.5355450236966824


## BERT

In [14]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

# Using DistilBERT:
model_class, tokenizer_class, pretrained_weights = (TFDistilBertModel, DistilBertTokenizerFast, 'distilbert-base-uncased')

pretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def get_pretrained_bert_model(config=pretrained_weights):
    if not config:
        config = DistilBertConfig(num_labels=2)

    return model_class.from_pretrained(pretrained_weights, config=config)

def tokenize_encode(questions, max_length=None):
    return pretrained_bert_tokenizer(
        questions,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
    )

# need to be explicit about the lengths (instead of just specifying padding=True in the tokenizer)
# otherwise train questions end up being 71 and validation questions end up as 70, which causes problems/warnings
max_length_question = 72
max_length_keyword = 8

train_questions_encoded = tokenize_encode(X_train0["TitleBody"].to_list(), max_length_question) 
validation_questions_encoded = tokenize_encode(X_test0["TitleBody"].to_list(), max_length_question) 
train_inputs_encoded = dict(train_questions_encoded)
validation_inputs_encoded = dict(validation_questions_encoded)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_questions_encoded), y_train))

val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(validation_questions_encoded), y_test))

train_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (train_inputs_encoded, y_train))

val_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (validation_inputs_encoded, y_test))

tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    min_df=1, ngram_range=(1, 1), norm="l2")

train_vectors = tfidf_vectorizer.fit_transform(raw_documents=X_train0["TitleBody"]).toarray()
validation_vectors = tfidf_vectorizer.transform(X_test0["TitleBody"]).toarray()

## Prédictions

In [15]:
rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))

rfc.fit(X_train_embed, y_train)

pred=rfc.predict(X_test_embed)

print('Jaccard score', jaccard_score(y_test,pred, average='micro'))
print('Hamming loss', hamming_loss(y_test, pred))
print('F1 score', f1_score(y_test,pred, average='micro'))

#mlb.inverse_transform(pred)

  "Label %s is present in all training examples." % str(classes[c])


Jaccard score 0.3656957928802589
Hamming loss 0.04604541895066562
F1 score 0.5355450236966824
