<a href="https://colab.research.google.com/github/SimonHeilles/OC/blob/main/P5_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install transformers

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from collections import Counter
import tensorflow as tf
import sklearn

#Preprocessing
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

#Train-test split
from sklearn.model_selection import train_test_split

#LDA
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#Feature Extractions
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.models import Word2Vec
import tensorflow_hub as hub
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertModel,
    DistilBertConfig,
)

#Predictions
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Metrics
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

# Import
url = 'https://raw.githubusercontent.com/SimonHeilles/OC/main/QueryResults%20(2).csv'
data = pd.read_csv(url)
df = data.copy()

# Pre-processing
df['Body'] = df['Body'].apply(lambda x: BeautifulSoup(x).get_text())

df['Tags'] = df['Tags'].str.split().str.join(" ")

spec_chars0 = ["<", ">", "8"]

for char in spec_chars0:
    df['Tags'] = df['Tags'].str.replace(char, ' ')

df['Tags'] = df['Tags'].str.split().str.join(" ")


text_columns = df[['Title', 'Body']]

for column in text_columns:
  df[column] = df[column].str.lower()

for column in text_columns:
  spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "$", "0", "1",
              "2", "3", "4", "5", "6", "7", "8", "9"]

for char in spec_chars:
    df[column] = df[column].str.replace(char, ' ')

for column in text_columns:
  df[column] = df[column].str.split().str.join(" ")

df2 = df.copy()
cachedStopWords = stopwords.words("english")

for column in text_columns:
  df2[column] = df2[column].apply(lambda x: [str(word) for word in word_tokenize(x) if not word in cachedStopWords])

for column in text_columns:
  df2[column] = df2[column].apply(lambda x: ' '.join(x))

# NB : no stemming, doesn't really increase the results

# Preparing the list of tags
df_cv = df2.copy()
df_cv['TitleBody'] = df_cv['Title'] + ' ' + df_cv['Body']
tags_list = []

for words_list in df_cv['Tags']:
  tags_list.append(words_list.split())

flat_list = [item for sublist in tags_list for item in sublist]

Counter = Counter(flat_list)

no_words = 20 # number of words we accept in the list of tags

most_occur = Counter.most_common(no_words)
fdist=dict(zip(*np.unique(most_occur, return_counts=True)))
list_tags = list(fdist)[-no_words:]

df_cv['Tags2'] = df_cv['Tags'].apply(lambda x: [tag for tag in list_tags if tag in x.split(" ")])

index_list = []

for i, row in df_cv.iterrows():
  if len(row['Tags2']) == 0:
    index_list.append(i) 

df_cv.drop(index_list, axis=0, inplace=True)

X = df_cv[['TitleBody']]
y = df_cv[['Tags2']]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=list_tags)
y_train = mlb.fit_transform(y_train['Tags2'])
y_test = mlb.transform(y_test['Tags2'])

mlb.classes_

array(['584', 'android', 'asp.net', 'c', 'c#', 'c++', 'cocoa-touch',
       'html', 'ios', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'objective-c', 'performance', 'php', 'python', 'sql', 'windows'],
      dtype=object)

In [4]:
mlb.classes_ = np.delete(mlb.classes_, [0])
mlb.classes_

array(['android', 'asp.net', 'c', 'c#', 'c++', 'cocoa-touch', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'linux',
       'objective-c', 'performance', 'php', 'python', 'sql', 'windows'],
      dtype=object)

# Bag of word

In [5]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

count_vect = CountVectorizer(max_features=1000, binary=True)

X_train_counts = count_vect.fit_transform(X_train0['TitleBody'])
X_test_counts = count_vect.transform(X_test0['TitleBody']) # transform seulement

## Réduction dimensionnelle

In [6]:
qt = [d.split() for d in X_train0['TitleBody']]
gensim_dictionary = corpora.Dictionary(qt)
texts = qt
gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]
print(gensim_corpus[:3])

[[(gensim_dictionary[id], freq) for id, freq in cp] for cp in gensim_corpus[:4]] 

[[(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 4), (19, 1), (20, 1), (21, 1), (22, 1), (23, 3), (24, 4), (25, 2), (26, 1)], [(3, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 4), (33, 1), (34, 1), (35, 1), (36, 4), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 3), (44, 3), (45, 1), (46, 2), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 3), (57, 3), (58, 3), (59, 4), (60, 3), (61, 1), (62, 1), (63, 1), (64, 1), (65, 5), (66, 5), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 5), (83, 4), (84, 14), (85, 1), (86, 1), (87, 1)], [(0, 1), (3, 1), (34, 2), (70, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 2), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 6), (102, 2), (103, 1), (104, 1), (105, 1), (

[[('?', 1),
  ('assign', 2),
  ('assigned', 2),
  ('code', 1),
  ('compiler', 1),
  ('complement', 1),
  ('could', 1),
  ('curious', 1),
  ('error', 1),
  ('gets', 1),
  ('give', 1),
  ('happen', 1),
  ('happens', 1),
  ('int', 1),
  ('know', 1),
  ('like', 1),
  ('look', 1),
  ('negative', 2),
  ('nval', 4),
  ('program', 1),
  ('ran', 1),
  ('somewhat', 1),
  ('strange', 1),
  ('unsigned', 3),
  ('value', 4),
  ('variable', 2),
  ('would', 1)],
 [('code', 1),
  ('according', 1),
  ('achieve', 1),
  ('anyone', 1),
  ('border', 2),
  ('colored', 1),
  ('colorize', 4),
  ('compute', 1),
  ('created', 1),
  ('data', 1),
  ('diagram', 4),
  ('docs', 1),
  ('e', 1),
  ('figure', 1),
  ('fill', 2),
  ('forming', 1),
  ('help', 1),
  ('image', 3),
  ('import', 3),
  ('indicates', 1),
  ('indices', 2),
  ('ints', 1),
  ('list', 2),
  ('make', 1),
  ('matplotlib', 1),
  ('need', 1),
  ('np', 2),
  ('nregions', 1),
  ('numpy', 1),
  ('order', 1),
  ('outside', 3),
  ('plot', 3),
  ('plt', 3),
 

In [7]:
'''
i = 1

while i < 10: #calculating and displaying the coherence score
  lda_model = gensim.models.ldamodel.LdaModel(
    corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100, 
    update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
  )

  coherence_model_lda = CoherenceModel(
    model=lda_model, texts=qt, dictionary=gensim_dictionary, coherence='c_v')
  
  coherence_lda = coherence_model_lda.get_coherence()
  print('\nCoherence Score :', coherence_lda, '// i =', i)
  i = i + 1
'''

"\ni = 1\n\nwhile i < 10: #calculating and displaying the coherence score\n  lda_model = gensim.models.ldamodel.LdaModel(\n    corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100, \n    update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True\n  )\n\n  coherence_model_lda = CoherenceModel(\n    model=lda_model, texts=qt, dictionary=gensim_dictionary, coherence='c_v')\n  \n  coherence_lda = coherence_model_lda.get_coherence()\n  print('\nCoherence Score :', coherence_lda, '// i =', i)\n  i = i + 1\n"

Coherence Score : 0.36568419717585815 // i = 1

Coherence Score : 0.3937934338982566 // i = 2

Coherence Score : 0.4713507152241547 // i = 3

Coherence Score : 0.48230897208942974 // i = 4

Coherence Score : 0.4919717130549627 // i = 5

Coherence Score : 0.47621885201636943 // i = 6

Coherence Score : 0.4395823651640512 // i = 7

Coherence Score : 0.5053347873156604 // i = 8

Coherence Score : 0.4772248376021915 // i = 9

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(
        n_components=8, # 8 is the best feat according to the results above (highest coherence score is the 7th iteration)
        max_iter=5, 
        learning_method='online', 
        learning_offset=50.,
        random_state=0)

lda.fit(X_train_counts)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=8, random_state=0)

In [9]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        #print(topic)

no_top_words = 10
display_topics(lda, count_vect.get_feature_names_out(), no_top_words) # we can adjust the output by playing with max features

Topic 0:
element function color change jquery div style elements left equal
Topic 1:
using like way data table would values need get value
Topic 2:
ios app view iphone screen image xcode bar want show
Topic 3:
using like way use code would get want http need
Topic 4:
std performance struct boost operator mean compiler high const gcc
Topic 5:
lang java native exception com org invoke util unknown thread
Topic 6:
code class return void public get new int string method
Topic 7:
use one using project build file need also know vs


OBSOLETE : On a du développement (web en partie avec deux langages web) avec git, et plutôt du vocabulaire avec tout ce qui concerne les bases de données.

# Prédictions

In [18]:
from sklearn.metrics import make_scorer
jscore = make_scorer(f1_score, average='micro')

def preds(X_train1, X_test1, y_train1, y_test1):
  lr_parameters = {"estimator__C": np.logspace(-3,3,7)}

  knn_parameters = {'estimator__n_neighbors' : [3, 5, 11, 19]}

  rfc_parameters = { 
      'estimator__n_estimators': [1, 5, 10],
      'estimator__max_features': ['auto', 'log2'],
      'estimator__max_depth' : [10, 30, 50]
  }

  lr = OneVsRestClassifier(LogisticRegression(max_iter=100))
  lr_tuning = GridSearchCV(lr, param_grid=lr_parameters,
                             scoring=jscore, cv=3)

  lr_tuning.fit(X_train1, y_train1)
  pred=lr_tuning.predict(X_test1)

  print('\nLogistic Regression:')
  print(lr_tuning.best_score_)
  print(lr_tuning.best_params_)
  print('Jaccard score', jaccard_score(y_test1,pred, average='micro'))
  print('Hamming loss', hamming_loss(y_test1, pred))
  print('F1 score', f1_score(y_test1,pred, average='micro'), '\n')

  knn = OneVsRestClassifier(KNeighborsClassifier())
  knn_tuning = GridSearchCV(knn, param_grid=knn_parameters,
                             scoring=jscore, cv=3)
  
  knn_tuning.fit(X_train1, y_train1)
  pred=knn_tuning.predict(X_test1)

  print('\nKNN:')
  print(knn_tuning.best_score_)
  print(knn_tuning.best_params_)
  print('Jaccard score', jaccard_score(y_test1,pred, average='micro'))
  print('Hamming loss', hamming_loss(y_test1, pred))
  print('F1 score', f1_score(y_test1,pred, average='micro'), '\n')

  rfc=OneVsRestClassifier(RandomForestClassifier(random_state=42))
  rfc_tuning = GridSearchCV(rfc, param_grid=rfc_parameters,
                             scoring=jscore, cv=3)
  
  rfc_tuning.fit(X_train1, y_train1)
  pred=rfc_tuning.predict(X_test1)

  print('\nRandom Forest Classifier:')
  print(rfc_tuning.best_score_)
  print(rfc_tuning.best_params_)
  print('Jaccard score', jaccard_score(y_test1,pred, average='micro'))
  print('Hamming loss', hamming_loss(y_test1, pred))
  print('F1 score', f1_score(y_test1,pred, average='micro'), '\n')

#mlb.inverse_transform(pred)
preds(X_train_counts, X_test_counts, y_train, y_test)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Logistic Regression:
0.5672310038290721
{'estimator__C': 100.0}
Jaccard score 0.41288045875606527
Hamming loss 0.05211433046202036
F1 score 0.5844520761785826 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])



KNN:
0.13260623086023013
{'estimator__n_neighbors': 3}
Jaccard score 0.08130502330398758
Hamming loss 0.06945967110415036
F1 score 0.1503831417624521 



  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.


Random Forest Classifier:
0.4177734881410209
{'estimator__max_depth': 50, 'estimator__max_features': 'auto', 'estimator__n_estimators': 5}
Jaccard score 0.2822420634920635
Hamming loss 0.05665622552858261
F1 score 0.44023210831721465 



# Word2Vec

In [None]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: x.split())
wv = Word2Vec(X_train0['TitleBody'], min_count=2)

def get_vect(word, model):
    try:
        return model.wv[word]
    except KeyError:
        return np.zeros((model.vector_size,))

def sum_vectors(phrase, model):
    return sum(get_vect(w, model) for w in phrase)

def word2vec_features(X, model):
    feats = np.vstack([sum_vectors(p, model) for p in X])
    return feats

wv_train_feat = word2vec_features(X_train0["TitleBody"], wv)
wv_test_feat = word2vec_features(X_test0["TitleBody"], wv)

## Prédictions

In [None]:
preds(wv_train_feat, wv_test_feat, y_train, y_test)
#mlb.inverse_transform(pred)

# USE

In [None]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#X_train0['TitleBody'] = X_train0['TitleBody'].apply(lambda x: [x])

X_train_embed = X_train0['TitleBody'].to_list()
X_train_embed = embed(X_train_embed)
X_train_embed = np.array(X_train_embed)

X_test_embed = X_test0['TitleBody'].to_list()
X_test_embed = embed(X_test_embed)
X_test_embed = np.array(X_test_embed)

## Prédictions

In [None]:
preds(X_train_embed, X_test_embed, y_train, y_test)
#mlb.inverse_transform(pred)

## BERT

In [None]:
X_train0 = X_train.copy()
X_test0 = X_test.copy()

# Using DistilBERT:
model_class, tokenizer_class, pretrained_weights = (TFDistilBertModel, DistilBertTokenizerFast, 'distilbert-base-uncased')

pretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def get_pretrained_bert_model(config=pretrained_weights):
    if not config:
        config = DistilBertConfig(num_labels=2)

    return model_class.from_pretrained(pretrained_weights, config=config)

def tokenize_encode(questions, max_length=None):
    return pretrained_bert_tokenizer(
        questions,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
    )

# need to be explicit about the lengths (instead of just specifying padding=True in the tokenizer)
# otherwise train questions end up being 71 and validation questions end up as 70, which causes problems/warnings
max_length_question = 72
max_length_keyword = 8

train_questions_encoded = tokenize_encode(X_train0["TitleBody"].to_list(), max_length_question) 
validation_questions_encoded = tokenize_encode(X_test0["TitleBody"].to_list(), max_length_question) 
train_inputs_encoded = dict(train_questions_encoded)
validation_inputs_encoded = dict(validation_questions_encoded)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_questions_encoded), y_train))

val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(validation_questions_encoded), y_test))

train_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (train_inputs_encoded, y_train))

val_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (validation_inputs_encoded, y_test))

tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    min_df=1, ngram_range=(1, 1), norm="l2")

train_vectors = tfidf_vectorizer.fit_transform(raw_documents=X_train0["TitleBody"]).toarray()
validation_vectors = tfidf_vectorizer.transform(X_test0["TitleBody"]).toarray()

## Prédictions

In [None]:
preds(train_vectors, validation_vectors, y_train, y_test)
#mlb.inverse_transform(pred)