# Test notebook

Trials of NLP analysis.

<a name="installs"></a>
## Installs

In [None]:
!pip uninstall helpers -y

In [None]:
!pip install git+https://github.com/Xmaster6y/ML-Engineer@develop

In [None]:
!pip install spacy

In [None]:
!pip install sentence_transformers

In [None]:
!pip install openai

<a name="imports"></a>
## Imports

In [None]:
import os
import pickle
import spacy
import json
from typing import List
from time import sleep

import pandas as pd
import numpy as np
from numpy.random import default_rng

import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import openai

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RepeatedKFold
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA, TruncatedSVD
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, f1_score, recall_score

import torch
import torch.nn as nn
import torch.nn.functional as F

from sentence_transformers import SentenceTransformer

In [None]:
import helpers

In [None]:
dir(helpers)

In [None]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

## Data loading

### Loading

In [None]:
file_name = 'title_embedding_50k_500t.pkl'
drive_file_id = "1MJcLQdsZhV_icFNVSAXhT1z48y32MZea"
if not os.path.exists(file_name):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$drive_file_id" -O $file_name  && rm -rf /tmp/cookies.txt

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load this file if its source is trusted.
with open(file_name, 'rb') as f:
    # The pickle holds a 3-item object, unpacked as (X, Y, LABELS).
    X, Y, LABELS = pickle.load(f)

In [None]:
file_name = 'df_cleaned.csv'
drive_file_id = "1KeQu6tdb0qUXvpe2aB9ZKcQIYrkXk73e"
if not os.path.exists(file_name):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$drive_file_id" -O $file_name  && rm -rf /tmp/cookies.txt

In [None]:
df_cleaned = pd.read_csv(file_name)
df_cleaned.head()

### Imputation

In [None]:
df_cleaned = df_cleaned.fillna("")
df_cleaned.info()

### Selection and splitting

In [None]:
# Number of rows kept for the experiments below.
N = 1000

# Seeded generator so the same subsample is drawn on every run.
rng = default_rng(seed=42)
# N distinct row indices drawn from range(Y.shape[0]).
sub_numbers = rng.choice(Y.shape[0], size=N, replace=False)

In [None]:
# 50/50 train/test split of the subsampled rows; X and Y are converted to torch
# tensors first (Y as float32, the dtype later passed to nn.BCELoss as target).
X_train, X_test, Y_train, Y_test = train_test_split(torch.tensor(X[sub_numbers]), torch.tensor(Y[sub_numbers], dtype=torch.float32), test_size=0.5, random_state=42)

In [None]:
# Split the matching dataframe rows with the same test_size and random_state as the
# tensor split, so both splits shuffle N rows identically.
# NOTE(review): `.loc` selects by index *label*; this assumes df_cleaned still has its
# default 0..n-1 index and the same row order as X / Y — confirm.
df_X_train, df_X_test = train_test_split(df_cleaned.loc[sub_numbers], test_size=0.5, random_state=42)

## Tag encoding / decoding

In [None]:
def tag_decoder(tags_arr, labels=None):
    """Translate binary tag indicators back into tag names.

    Args:
        tags_arr: 1-D indicator vector (one sample) or 2-D indicator matrix
            (one row per sample). A tag counts as present where the value
            equals 1. Any array-like accepted by ``np.asarray`` works.
        labels: Sequence of tag names, where position ``j`` names column ``j``.
            Defaults to the notebook-level ``LABELS``.

    Returns:
        For 1-D input, a list of tag names. For 2-D input, a list with one
        list of tag names per row (empty when the row has no tag set).

    Raises:
        NotImplementedError: If ``tags_arr`` is neither 1-D nor 2-D.
    """
    if labels is None:
        labels = LABELS
    tags_arr = np.asarray(tags_arr)
    hits = tags_arr == 1

    if tags_arr.ndim == 1:
        return [labels[col] for col in np.flatnonzero(hits)]

    if tags_arr.ndim == 2:
        decoded_tags = [[] for _ in range(tags_arr.shape[0])]
        # np.nonzero walks the matrix in row-major order, so tags within each
        # row come out in increasing column order.
        for row, col in zip(*np.nonzero(hits)):
            decoded_tags[row].append(labels[col])
        return decoded_tags

    raise NotImplementedError(
        f"tag_decoder only supports 1-D or 2-D input, got {tags_arr.ndim}-D"
    )

In [None]:
# Binarizer whose output columns follow the order of LABELS.
tag_encoder = MultiLabelBinarizer(classes=LABELS)
# Fit on two dummy samples that each carry every label, then keep only the
# first 10 columns of the resulting indicator matrix.
encoded_tags = tag_encoder.fit_transform([LABELS, LABELS])[:,:10]
encoded_tags

In [None]:
print(tag_decoder(encoded_tags))
print(tag_decoder(encoded_tags[0,:]))
z_tags = np.zeros((3,15))
print(tag_decoder(z_tags))
z_tags[0,5] = 1
print(tag_decoder(z_tags))

In [None]:
def to_name(encoded_tag, labels=None):
    """Return a single representative tag name for one indicator vector.

    The vector is scanned from its last position to its first, and the name of
    the first truthy entry found (i.e. the highest-index set tag) is returned.

    Args:
        encoded_tag: Indexable sequence of tag indicators for one sample.
        labels: Sequence of tag names, where position ``i`` names entry ``i``.
            Defaults to the notebook-level ``LABELS``.

    Returns:
        The tag name, or the string ``"none"`` when no entry is set.
    """
    if labels is None:
        labels = LABELS
    for i in reversed(range(len(encoded_tag))):
        if encoded_tag[i]:
            return labels[i]
    return "none"

train_pseudo_labels = list(map(to_name, Y_train[:,:10]))

## Text vectorisation

In [None]:
def lemmatize(text):
    """Return *text* as a space-joined string of lemmas.

    The text is run through the notebook-level spaCy pipeline ``nlp``; stop
    words and punctuation tokens are dropped, every other token is replaced by
    its lemma.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

In [None]:
bow = CountVectorizer(preprocessor = lemmatize, max_features=500)
tfidf = TfidfVectorizer(preprocessor = lemmatize, max_features=500)

In [None]:
# Titles only: fit each vectoriser on the training titles and record its vocabulary.
X_train_title_bow = bow.fit_transform(df_X_train["Title"])
title_bow_features = bow.get_feature_names_out()
X_train_title_tfidf = tfidf.fit_transform(df_X_train["Title"])
title_tfidf_features = tfidf.get_feature_names_out()
# Title + code: the SAME vectoriser objects are re-fitted here, so from this point on
# `bow` and `tfidf` hold the title+code vocabulary, not the title-only one.
X_train_titlecode_bow = bow.fit_transform(df_X_train["Title"]+" "+df_X_train["BodyCode"])
titlecode_bow_features = bow.get_feature_names_out()
X_train_titlecode_tfidf = tfidf.fit_transform(df_X_train["Title"]+" "+df_X_train["BodyCode"])
titlecode_tfidf_features = tfidf.get_feature_names_out()

In [None]:
# Display only. Do NOT re-read the names from `tfidf` here: by this point it was last
# fitted on "Title + BodyCode", so reassigning would replace the title-only vocabulary
# (recorded right after the title fit) with the title+code one and mislabel the topics
# printed for models fitted on X_train_title_tfidf.
title_tfidf_features

## Text Embedding

In [None]:
use = SentenceTransformer('sentence-transformers/use-cmlm-multilingual')
bert = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
minilm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
X_train_title_use = use.encode(df_X_train["Title"].to_list())
X_train_title_bert = bert.encode(df_X_train["Title"].to_list())
X_train_title_minilm = minilm.encode(df_X_train["Title"].to_list())
X_train_title_mpnet = mpnet.encode(df_X_train["Title"].to_list())

X_train_code_minilm = minilm.encode(df_X_train["BodyCode"].to_list())

X_train_titlecode_use = use.encode((df_X_train["Title"]+"\n\n"+df_X_train["BodyCode"]).to_list())
X_train_titlecode_bert = bert.encode((df_X_train["Title"]+"\n\n"+df_X_train["BodyCode"]).to_list())
X_train_titlecode_minilm = minilm.encode((df_X_train["Title"]+"\n\n"+df_X_train["BodyCode"]).to_list())

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40.0, random_state=42)
z = tsne.fit_transform(X_train_title_use)

In [None]:
df = pd.DataFrame()
df["hue"] = train_pseudo_labels
df["comp-1"] = z[:,0]
df["comp-2"] = z[:,1]

sns.scatterplot(x="comp-1", y="comp-2", hue="hue",
                palette=sns.color_palette("hls", 11),
                data=df).set(title="T-SNE projection")

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40.0, random_state=42)
z = tsne.fit_transform(X_train_title_tfidf.toarray())

In [None]:
df = pd.DataFrame()
df["hue"] = train_pseudo_labels
df["comp-1"] = z[:,0]
df["comp-2"] = z[:,1]

sns.scatterplot(x="comp-1", y="comp-2", hue="hue",
                palette=sns.color_palette("hls", 11),
                data=df).set(title="T-SNE projection")

## Torch Multi-output

This section is mostly historical, as it is one of the first models I implemented.

In [None]:
class MultiTagger(nn.Module):
    """Fully connected multi-label tagger.

    The network is a stack of ``Linear -> ReLU`` hidden layers followed by a
    final ``Linear -> Sigmoid`` layer, so every output lies in (0, 1) and can
    be fed directly to ``nn.BCELoss``.

    Args:
        n_in: Size of the input feature vector.
        n_out: Number of tags predicted.
        hs: Sizes of the hidden layers, in order. ``None`` or an empty list
            gives a single ``Linear -> Sigmoid`` layer.
        labels: Optional tag names matching the outputs. They are stored on
            ``self.labels`` for reference only and do not affect the
            computation (the argument used to be dropped silently).
    """

    def __init__(self, n_in:int, n_out:int, hs:List[int] = None, labels:List[str] = None):
        super().__init__()

        self.labels = list(labels) if labels is not None else None

        # Consecutive entries of `sizes` are the (in, out) widths of each Linear.
        sizes = [n_in, *(hs or []), n_out]

        layers = []
        for s_in, s_out in zip(sizes[:-2], sizes[1:-1]):
            layers.append(nn.Linear(s_in, s_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(sizes[-2], sizes[-1]))
        layers.append(nn.Sigmoid())
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-tag sigmoid outputs for the batch ``x``."""
        return self.layers(x)

In [None]:
n_in = X_train.shape[1]
n_out = 20
n_mean = (n_in+n_out)//2
clf_model = MultiTagger(n_in, n_out, hs=[n_mean])

In [None]:
X_train[:2].shape

In [None]:
clf_model.forward(X_train[:2]).shape

In [None]:
optimizer = torch.optim.Adam(clf_model.parameters(), lr=1e-2)
loss_fn = nn.BCELoss()

In [None]:
Y_pred = clf_model.forward(X_train)

In [None]:
loss_fn(Y_pred, Y_train[:, :n_out])

In [None]:
# Per-epoch history of loss and accuracy on the train and test splits.
train_losses = []
test_losses = []
train_acc = []
test_acc = []

epochs = 100

for epoch in range(epochs):
    # --- training step: one optimiser update on the whole training set ---
    clf_model.train()
    optimizer.zero_grad()

    Y_pred = clf_model(X_train)
    # Only the first n_out tag columns are used as targets.
    loss = loss_fn(Y_pred, Y_train[:,:n_out])
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())
    # Round the sigmoid outputs to 0/1 to get hard tag predictions.
    Y_pred_round = torch.round(Y_pred).detach().numpy()
    # NOTE: arguments are passed as (pred, true); accuracy is symmetric in its two
    # arguments, so the value is the same as with sklearn's (y_true, y_pred) order.
    train_acc.append(accuracy_score(Y_pred_round, Y_train[:,:n_out]))

    # --- evaluation on the test split, without tracking gradients ---
    clf_model.eval()
    with torch.no_grad():
        Y_pred = clf_model(X_test)
        loss = loss_fn(Y_pred, Y_test[:,:n_out])

        test_losses.append(loss.item())
        Y_pred_round = torch.round(Y_pred).detach().numpy()
        test_acc.append(accuracy_score(Y_pred_round, Y_test[:,:n_out]))

    # Progress report every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(
            f"epoch : {epoch+1}, "
            f"train loss : {train_losses[epoch]:.4f}, "
            f"train accuracy : {train_acc[epoch]:.2f}, "
            f"test loss : {test_losses[epoch]:.4f}, "
            f"test accuracy : {test_acc[epoch]:.2f}"
        )

In [None]:
Y_pred = clf_model.forward(X_train)
print(loss_fn(Y_pred, Y_train[:,:n_out]))
Y_pred_round = torch.round(Y_pred)

accuracy_score(Y_pred_round.detach().numpy(), Y_train[:,:n_out])

In [None]:
clf_model

## Title Embedding

Prediction based only on the title embedding.

In [None]:
N_LABEL = 20

In [None]:
clf = MultiOutputClassifier(LogisticRegression())
clf.fit(X_train, Y_train[:, :N_LABEL])

In [None]:
Y_pred = clf.predict(X_train)
Y_pred_proba = clf.predict_proba(X_train)
Y_pred = torch.tensor(Y_pred, dtype=torch.float32)
Y_pred_indx = torch.tensor(Y_pred, dtype=torch.int64)
Y_pred_proba = torch.tensor(Y_pred_proba, dtype=torch.float32).T

In [None]:
Y_pred_proba.shape

In [None]:
Y_pred_proba_gather = Y_pred_proba.gather(0, Y_pred_indx.unsqueeze(0)).squeeze()
Y_pred_proba_gather.shape

In [None]:
loss_fn(Y_pred_proba_gather, Y_train[:, :N_LABEL])

In [None]:
loss_fn(Y_pred, Y_train[:, :N_LABEL])

In [None]:
accuracy_score(Y_pred, Y_train[:, :N_LABEL])

In [None]:
accuracy_score(np.zeros(Y_pred.shape), Y_train[:, :N_LABEL])

In [None]:
Y_pred = clf.predict(X_test)

In [None]:
accuracy_score(Y_pred, Y_test[:, :N_LABEL])

In [None]:
accuracy_score(np.zeros(Y_pred.shape), Y_test[:, :N_LABEL])

In [None]:
loss_fn(torch.tensor(Y_pred), Y_test[:, :N_LABEL])

## Vectorizer / Embedding comparison

### Dimension reduction

In [None]:
def grid_search(pipe, params, X_train, Y_train, cv=5):
    """Cross-validate every (reduc, clf) combination of a two-step pipeline.

    Args:
        pipe: Pipeline with steps named ``"reduc"`` and ``"clf"``. It is
            modified in place: after the call it holds the last combination.
        params: Dict with keys ``"reduc"`` and ``"clf"``, each a sequence of
            candidate steps.
        X_train: Training features.
        Y_train: Training targets.
        cv: Cross-validation setting forwarded to ``cross_validate``
            (previously ignored in favour of a hard-coded 5).

    Returns:
        Dict keyed by ``str(pipe.get_params()["steps"])``. Each value holds the
        mean/std of the train scores, validation scores and total
        (fit + score) time, plus ``"best_e"``: the fitted estimator of the
        fold with the highest validation accuracy.
    """
    results = {}
    n_cv = len(params["reduc"]) * len(params["clf"])
    run = 0
    for reduc in params["reduc"]:
        for clf in params["clf"]:
            run += 1
            print(f"[INFO] CV {run}/{n_cv}")
            pipe.set_params(reduc=reduc)
            pipe.set_params(clf=clf)
            cv_results = cross_validate(
                pipe,
                X_train,
                Y_train,
                cv=cv,
                scoring="accuracy",
                return_train_score=True,
                return_estimator=True,
            )
            name = str(pipe.get_params()["steps"])
            elapsed = cv_results["fit_time"] + cv_results["score_time"]
            best_e = cv_results["estimator"][np.argmax(cv_results["test_score"])]
            results[name] = {
                "train_avg": np.mean(cv_results["train_score"]),
                "train_std": np.std(cv_results["train_score"]),
                "val_avg": np.mean(cv_results["test_score"]),
                "val_std": np.std(cv_results["test_score"]),
                "time_avg": np.mean(elapsed),
                "time_std": np.std(elapsed),
                "best_e": best_e,
            }
    return results

def grid_evaluate(pipe, X_test, Y_test, metrics, cv=5, n_repeats=2):
    """Score an already fitted estimator on repeated folds of the test set.

    Nothing is re-fitted: ``RepeatedKFold`` is used only to cut the test set
    into ``cv * n_repeats`` subsets, which gives a spread for each metric.

    Args:
        pipe: Fitted estimator exposing ``predict``.
        X_test: Test features, indexable by an integer index array.
        Y_test: Test targets, indexable the same way.
        metrics: Dict mapping a metric name to a callable
            ``metric(y_true, y_pred)``.
        cv: Number of folds per repetition.
        n_repeats: Number of repetitions.

    Returns:
        Dict mapping each metric name to the list of its per-fold values.
    """
    n_cv = cv * n_repeats
    kf = RepeatedKFold(n_splits=cv, n_repeats=n_repeats, random_state=42)
    scores = {name: [] for name in metrics}
    for i, (_, test_index) in enumerate(kf.split(X_test)):
        print(f"[INFO] CV {i+1}/{n_cv}")
        # Predict once per fold; every metric is computed from the same predictions.
        Y_fold = Y_test[test_index]
        Y_pred = pipe.predict(X_test[test_index])
        for name, metric in metrics.items():
            scores[name].append(metric(Y_fold, Y_pred))
    return scores

In [None]:
mo_lr = MultiOutputClassifier(LogisticRegression(max_iter=1000))
cc_lr = ClassifierChain(LogisticRegression(max_iter=1000))
rf = RandomForestClassifier()

pipe = Pipeline([
    ('reduc', "passthrough"),
    ('clf', mo_lr)
])

params = {'clf':(mo_lr, cc_lr, rf),
        'reduc':(PCA(n_components=50), PCA(n_components=100), PCA(n_components=150), "passthrough")
        }

In [None]:
results = grid_search(pipe, params, X_train, Y_train[:, :N_LABEL], cv=3)
results

In [None]:
# Pick the fitted estimator of one specific combination. The key is the exact string
# produced by str(pipe.get_params()["steps"]) in grid_search, so it depends on how the
# steps are printed and must match character for character.
best_e = results["[('reduc', 'passthrough'), ('clf', ClassifierChain(base_estimator=LogisticRegression(max_iter=1000)))]"]['best_e']
def recall_w(*args, **kwargs):
    """Return recall_score computed with average="weighted"."""
    return recall_score(*args, **kwargs, average="weighted")
# Metric name -> callable taking (y_true, y_pred); reused by the later scoring cells.
metrics = {
    "acc":accuracy_score,
    "recall_w": recall_w,
}
grid_evaluate(best_e, X_test, Y_test[:, :N_LABEL], metrics=metrics, cv=5)

In [None]:
params = {
    'clf':(mo_lr, cc_lr),
    'reduc':(TruncatedSVD(n_components=50), TruncatedSVD(n_components=100), TruncatedSVD(n_components=150), "passthrough")
}
results = grid_search(pipe, params, X_train_title_bow, Y_train[:, :N_LABEL], cv=3)
results

### Source Embeddings

In [None]:
# The test titles must be projected onto the vocabulary (and IDF weights) learnt from
# the TRAINING titles. Re-fitting on the test set would give columns that do not mean
# the same thing as the columns the classifier was trained on.
# The shared `bow` / `tfidf` objects cannot be reused for this: they were last fitted
# on "Title + BodyCode". Fresh vectorisers with the same settings are fitted on the
# training titles instead, which rebuilds the vocabulary used for X_train_title_*.
X_test_title_bow = (
    CountVectorizer(preprocessor=lemmatize, max_features=500)
    .fit(df_X_train["Title"])
    .transform(df_X_test["Title"])
)
X_test_title_tfidf = (
    TfidfVectorizer(preprocessor=lemmatize, max_features=500)
    .fit(df_X_train["Title"])
    .transform(df_X_test["Title"])
)

# Sentence-embedding models are pre-trained, so the test titles are simply encoded.
X_test_title_use = use.encode(df_X_test["Title"].to_list())
X_test_title_bert = bert.encode(df_X_test["Title"].to_list())
X_test_title_minilm = minilm.encode(df_X_test["Title"].to_list())
X_test_title_mpnet = mpnet.encode(df_X_test["Title"].to_list())

In [None]:
def train_test_score(clf, X_train, Y_train, X_test, Y_test, metrics):
    """Fit ``clf`` on the training data and score it on both splits.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train: Training features.
        Y_train: Training targets.
        X_test: Test features.
        Y_test: Test targets.
        metrics: Dict mapping a metric name to a callable
            ``metric(y_true, y_pred)``.

    Returns:
        ``(train_scores, test_scores)``: ``train_scores`` maps each metric name
        to its value on the training set; ``test_scores`` is the result of
        ``grid_evaluate(clf, X_test, Y_test, metrics, cv=3)``.
    """
    clf.fit(X_train, Y_train)
    # Predict once; all training metrics share the same predictions.
    Y_pred = clf.predict(X_train)
    train_scores = {name: metric(Y_train, Y_pred) for name, metric in metrics.items()}
    test_scores = grid_evaluate(clf, X_test, Y_test, metrics, cv=3)
    return train_scores, test_scores

In [None]:
# Classifier chain of logistic regressions; this `clf` object is re-fitted by each
# of the scoring cells that follow.
clf = ClassifierChain(LogisticRegression(max_iter=1000))
train_scores, test_scores = train_test_score(clf, X_train_title_bow, Y_train[:, :N_LABEL], X_test_title_bow, Y_test[:, :N_LABEL], metrics)
# No visible effect: a notebook only displays a cell's LAST expression, and this is
# not the last line of the cell.
train_scores, test_scores
print(f"Train scores: {train_scores}")
# Summarise each metric as "mean+-std" over the evaluation folds.
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_title_tfidf, Y_train[:, :N_LABEL], X_test_title_tfidf, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_title_bert, Y_train[:, :N_LABEL], X_test_title_bert, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_title_use, Y_train[:, :N_LABEL], X_test_title_use, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_title_minilm, Y_train[:, :N_LABEL], X_test_title_minilm, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_title_mpnet, Y_train[:, :N_LABEL], X_test_title_mpnet, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

### Embedding combination

In [None]:
X_test_code_minilm = minilm.encode(df_X_test["BodyCode"].to_list())
X_test_code_mpnet = mpnet.encode(df_X_test["BodyCode"].to_list())

X_test_titlecode_minilm = minilm.encode((df_X_test["Title"]+"\n\n"+df_X_test["BodyCode"]).to_list())
X_test_titlecode_mpnet = mpnet.encode((df_X_test["Title"]+"\n\n"+df_X_test["BodyCode"]).to_list())

In [None]:
X_train_code_mpnet = mpnet.encode(df_X_train["BodyCode"].to_list())
X_test_code_mpnet = mpnet.encode(df_X_test["BodyCode"].to_list())
X_train_titlecode_mpnet = mpnet.encode((df_X_train["Title"]+"\n\n"+df_X_train["BodyCode"]).to_list())
X_test_titlecode_mpnet = mpnet.encode((df_X_test["Title"]+"\n\n"+df_X_test["BodyCode"]).to_list())

In [None]:
X_train_concat_minilm = np.concatenate([X_train_title_minilm,X_train_code_minilm], axis=1)
X_test_concat_minilm = np.concatenate([X_test_title_minilm,X_test_code_minilm], axis=1)

X_train_concat_mpnet = np.concatenate([X_train_title_mpnet,X_train_code_mpnet], axis=1)
X_test_concat_mpnet = np.concatenate([X_test_title_mpnet,X_test_code_mpnet], axis=1)

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_code_minilm, Y_train[:, :N_LABEL], X_test_code_minilm, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_titlecode_minilm, Y_train[:, :N_LABEL], X_test_titlecode_minilm, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_concat_minilm, Y_train[:, :N_LABEL], X_test_concat_minilm, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_code_mpnet, Y_train[:, :N_LABEL], X_test_code_mpnet, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_titlecode_mpnet, Y_train[:, :N_LABEL], X_test_titlecode_mpnet, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

In [None]:
train_scores, test_scores = train_test_score(clf, X_train_concat_mpnet, Y_train[:, :N_LABEL], X_test_concat_mpnet, Y_test[:, :N_LABEL], metrics)
train_scores, test_scores
print(f"Train scores: {train_scores}")
avg_test_scores = {k:f"{np.mean(v):.3f}+-{np.std(v):.3f}" for k,v in test_scores.items()}
print(f"AVG test scores: {avg_test_scores}")

## Export

In [None]:
clf = ClassifierChain(LogisticRegression(max_iter=1000))
clf.fit(X_train_concat_mpnet, Y_train[:, :N_LABEL])

In [None]:
with open("tag_list.pkl", "wb") as f:
    pickle.dump(LABELS, f)

with open("lr_20_model.pkl", "wb") as f:
    pickle.dump(clf, f)

## Unsupervised

### Clustering

In [None]:
model = KMeans(n_init='auto')
visualizer = KElbowVisualizer(model, k=(2,15), timings=False)

visualizer.fit(X_train_concat_mpnet)
visualizer.poof()

In [None]:
model = KMeans(n_init='auto')
visualizer = KElbowVisualizer(model, k=(2,15), metric='calinski_harabasz', timings=False)

visualizer.fit(X_train_concat_mpnet)
visualizer.poof()

In [None]:
# Number of clusters to inspect.
k=5
model = KMeans(n_clusters=k, n_init='auto')
Y_pred = model.fit_predict(X_train_concat_mpnet)
for i in range(k):
    print(f"Cluster {i}")
    # A vectoriser local to the loop, so the notebook-level `bow` is left untouched.
    cluster_bow = CountVectorizer(preprocessor=lemmatize, max_features=500)
    counts = cluster_bow.fit_transform(df_X_train.loc[Y_pred==i,"Title"].to_list())
    # Word clouds need term -> frequency. `vocabulary_` (used previously) maps
    # term -> COLUMN INDEX, which made word sizes follow alphabetical position.
    # Summing the count matrix over documents gives the real term frequencies.
    term_totals = np.asarray(counts.sum(axis=0)).ravel().tolist()
    term_freqs = dict(zip(cluster_bow.get_feature_names_out(), term_totals))
    wordcloud = WordCloud(width= 1000, height = 600, max_words=100,
                      random_state=1, background_color='white', colormap='viridis_r',
                      collocations=False).generate_from_frequencies(term_freqs)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

### Topic modelling

In [None]:
# Create the LDA model, with one topic per label (n_components=N_LABEL)
lda = LatentDirichletAllocation(
        n_components=N_LABEL,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0)

# Fit on the TF-IDF matrix of the training titles
lda.fit(X_train_title_tfidf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names of every topic in *model*.

    Args:
        model: Fitted object whose ``components_`` holds one weight vector
            per topic.
        feature_names: Sequence mapping a column index to its feature name.
        no_top_words: How many top features to print for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed -> indices from largest to smallest weight.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[j] for j in ranked[:no_top_words]]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

no_top_words = 10
display_topics(lda, title_tfidf_features, no_top_words)


In [None]:
# Run NMF
nmf = NMF(n_components=N_LABEL, random_state=1, l1_ratio=.5, init='nndsvd')
nmf.fit(X_train_title_tfidf)

no_top_words = 10
display_topics(nmf, title_tfidf_features, no_top_words)


In [None]:
# Run NMF
nmf = NMF(n_components=N_LABEL, random_state=1, l1_ratio=.5, init='nndsvd')
nmf.fit(X_train_titlecode_tfidf)

no_top_words = 10
display_topics(nmf, titlecode_tfidf_features, no_top_words)

## Semi-supervised

### Setup & Prompt

In [None]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set OPENAI_API_KEY (and optionally OPENAI_ORGANIZATION) before running.
# SECURITY: the API key that used to be hard-coded on these lines has been exposed
# in source and must be revoked in the OpenAI dashboard.
openai.organization = os.getenv("OPENAI_ORGANIZATION")
# Indexing (rather than getenv) fails immediately with a KeyError when the key is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Prompt sent as the user message for tag classification. Placeholders filled by
# str.format: {labels}, {title}, {body_text}, {body_code}.
# The instruction text had typos ("perninent", "asign", a doubled "to") which are
# corrected here so the model receives unambiguous instructions.
PROMPT_TEMPLATE = """
You will be provided with the following information:
1. A Stack Overflow question. The question is delimited with triple backticks. The question has three parts: title, body_text and body_code.
2. List of tags the question can be assigned to. The tags in the list are enclosed in single quotes and comma-separated.

Perform the following tasks:
1. Identify the tags to which the provided question belongs with the highest probability.
2. Assign the question to any tags based on the probabilities. If no tag is pertinent, don't assign any tag to the question.
3. Provide your response in a JSON format containing a single key `label` and a value corresponding to the array of assigned tags. Do not provide any additional information except the JSON.

List of tags: {labels}

Stack Overflow question:
```
[title]
{title}

[body_text]
{body_text}

[body_code]
{body_code}
```

Your JSON response:
"""

In [None]:
TURBO_COST_PROMPT = 0.0015 / 1000
TURBO_COST_COMPLETION = 0.002 / 1000
GPT4_COST_PROMPT = 0.03 / 1000
GPT4_COST_COMPLETION = 0.06 / 1000

In [None]:
def find_popular_tags(s, labels=None):
    """Extract the known tags mentioned in a model answer.

    Args:
        s: Either a list of candidate tag names, or any other object whose
            string form contains quoted tag names (single or double quotes).
        labels: Sequence of known tag names. Defaults to the notebook-level
            ``LABELS``.

    Returns:
        A list of ``(tag_name, index_in_labels)`` tuples, in order of
        appearance, for every candidate that is a known tag. Both input kinds
        now return this same shape: the string branch used to return bare
        names, which broke callers that read ``tag[0]``.
    """
    if labels is None:
        labels = LABELS

    # First position of each label (same result as labels.index, but O(1) per lookup).
    positions = {}
    for idx, label in enumerate(labels):
        positions.setdefault(label, idx)

    if isinstance(s, list):
        candidates = s
    else:
        # str.replace returns a new string; its result was previously discarded, so
        # single-quoted names were never split out.
        candidates = str(s).replace("'", '"').split('"')

    return [
        (cand, positions[cand])
        for cand in candidates
        if isinstance(cand, str) and cand in positions
    ]

def extract_tags(response):
    """Extract the known tags from a raw model response.

    The response is parsed as JSON and the value under the key ``"label"`` (or,
    failing that, ``"labels"``) is handed to ``find_popular_tags``. When the
    response is not valid JSON, or is JSON without either key, the raw
    response text is scanned instead.

    Args:
        response: Text returned by the model.

    Returns:
        Whatever ``find_popular_tags`` returns for the selected input.
    """
    try:
        parsed = json.loads(response)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        print(e)
        return find_popular_tags(response)

    if isinstance(parsed, dict):
        # "label" is the key requested in the prompt; "labels" is also accepted.
        for key in ("label", "labels"):
            if key in parsed:
                return find_popular_tags(parsed[key])

    # Valid JSON but not in the expected shape: show it, then scan the raw text.
    print(parsed)
    return find_popular_tags(response)

def usage_to_cost(usage, model="gpt-3.5-turbo"):
    """Return the cost of one completion from its token usage.

    Args:
        usage: Mapping with ``"prompt_tokens"`` and ``"completion_tokens"``.
        model: Model name. It is matched by substring: names containing
            ``"gpt-3.5-turbo"`` use the TURBO_* rates, names containing
            ``"gpt-4"`` use the GPT4_* rates. The ``-16k`` / ``-32k`` variants
            are charged at twice the base rate.

    Returns:
        ``prompt_tokens * prompt_rate + completion_tokens * completion_rate``.

    Raises:
        NotImplementedError: If the model name matches no known family.
    """
    if "gpt-3.5-turbo" in model:
        base_prompt, base_completion = TURBO_COST_PROMPT, TURBO_COST_COMPLETION
        long_context = "gpt-3.5-turbo-16k" in model
    elif "gpt-4" in model:
        base_prompt, base_completion = GPT4_COST_PROMPT, GPT4_COST_COMPLETION
        long_context = "gpt-4-32k" in model
    else:
        raise NotImplementedError

    factor = 2 if long_context else 1
    prompt_cost = usage["prompt_tokens"] * (factor * base_prompt)
    completion_cost = usage["completion_tokens"] * (factor * base_completion)
    return prompt_cost + completion_cost

In [None]:
df_cleaned.loc[0, "Title"]

In [None]:
prompt = PROMPT_TEMPLATE.format(
    labels=LABELS[:10],
    title=df_cleaned.loc[0, "Title"],
    body_text=df_cleaned.loc[0, "BodyText"],
    body_code=df_cleaned.loc[0, "BodyCode"],
)
print(prompt)

In [None]:
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a text classification model."},
    {"role": "user", "content": prompt}
  ]
)

In [None]:
info = completion['choices'][0]['message']['content']
usage = completion["usage"]
cost = usage_to_cost(usage, model="gpt-3.5-turbo")
print(cost)

tags = extract_tags(info)
print(tags)
encoded_tags = tag_encoder.transform([[tag[0] for tag in tags]])
encoded_tags[0,:10]

### Evaluation

In [None]:
# Running total of the API cost, accumulated across calls to make_predictions.
TOTAL_COST = 0
def make_predictions(model, numbers, n_label=20, freq=1):
    """Ask a chat model to tag the questions at the given row labels.

    Args:
        model: Name of the chat model to query.
        numbers: Row labels of ``df_cleaned`` to classify.
        n_label: Only the first ``n_label`` entries of ``LABELS`` are offered
            to the model as candidate tags.
        freq: A progress line with the running cost is printed every ``freq``
            questions.

    Returns:
        A list with, for each question in order, the tags extracted from the
        model answer by ``extract_tags``.

    Side effects:
        Adds the cost of every completion to the global ``TOTAL_COST``.
        On a rate-limit error, sleeps 60 seconds and retries, with no upper
        bound on the number of retries.
    """
    global TOTAL_COST
    system_message = {"role": "system", "content": "You are a text classification model."}
    candidate_labels = LABELS[:n_label]
    n_steps = len(numbers)
    predicted_tags = []
    for step, row in enumerate(numbers, start=1):
        prompt = PROMPT_TEMPLATE.format(
            labels=candidate_labels,
            title=df_cleaned.loc[row, "Title"],
            body_text=df_cleaned.loc[row, "BodyText"],
            body_code=df_cleaned.loc[row, "BodyCode"],
        )
        # Retry the same request until it is not rejected by the rate limiter.
        while True:
            try:
                completion = openai.ChatCompletion.create(
                    model=model,
                    messages=[system_message, {"role": "user", "content": prompt}],
                )
            except openai.error.RateLimitError:
                sleep(60)
            else:
                break
        answer = completion['choices'][0]['message']['content']
        TOTAL_COST += usage_to_cost(completion["usage"], model=model)
        predicted_tags.append(extract_tags(answer))
        if step % freq == 0:
            print(f"[INFO] Current total cost: {TOTAL_COST:.3f} ({step}/{n_steps} steps done)")
    return predicted_tags

In [None]:
N = 20
n_label = 20

rng = default_rng(seed=42)
numbers = rng.choice(Y.shape[0], size=N, replace=False)

In [None]:
predicted_tags = make_predictions("gpt-3.5-turbo", numbers, n_label=20)

In [None]:
tag_encoder.transform([['java', 'android', 'gradle',]])[:, :20]

In [None]:
# Keep only the tag name of each (name, index) pair returned per question.
tag_names = [[t[0] for t in tags] for tags in predicted_tags]
y_pred = tag_encoder.transform(tag_names)[:,:n_label]
print(tag_names)
y_true = Y[numbers,:n_label]
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but recall is
# not: with the arguments swapped the printed value was not the recall.
print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average="weighted"))

In [None]:
predicted_tags = make_predictions("gpt-4", numbers, n_label=20)

In [None]:
# Keep only the tag name of each (name, index) pair returned per question.
tag_names = [[t[0] for t in tags] for tags in predicted_tags]
y_pred = tag_encoder.transform(tag_names)[:,:n_label]
print(tag_names)
y_true = Y[numbers,:n_label]
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but recall is
# not: with the arguments swapped the printed value was not the recall.
print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average="weighted"))

In [None]:
n_label=500
predicted_tags = make_predictions("gpt-3.5-turbo", numbers, n_label=n_label)

In [None]:
# Keep only the tag name of each (name, index) pair returned per question.
tag_names = [[t[0] for t in tags] for tags in predicted_tags]
y_pred = tag_encoder.transform(tag_names)[:,:n_label]
print(tag_names)
y_true = Y[numbers,:n_label]
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but recall is
# not: with the arguments swapped the printed value was not the recall.
print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average="weighted"))

In [None]:
# Same predictions, scored on the first 20 label columns only.
tag_names = [[t[0] for t in tags] for tags in predicted_tags]
y_pred = tag_encoder.transform(tag_names)[:,:20]
print(tag_names)
y_true = Y[numbers,:20]
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but recall is
# not: with the arguments swapped the printed value was not the recall.
print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average="weighted"))

In [None]:
predicted_tags = make_predictions("gpt-4", numbers, n_label=n_label)

In [None]:
# Keep only the tag name of each (name, index) pair returned per question.
tag_names = [[t[0] for t in tags] for tags in predicted_tags]
y_pred = tag_encoder.transform(tag_names)[:,:n_label]
print(tag_names)
y_true = Y[numbers,:n_label]
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but recall is
# not: with the arguments swapped the printed value was not the recall.
print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average="weighted"))

In [None]:
# Same predictions, scored on the first 20 label columns only.
tag_names = [[t[0] for t in tags] for tags in predicted_tags]
y_pred = tag_encoder.transform(tag_names)[:,:20]
print(tag_names)
y_true = Y[numbers,:20]
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but recall is
# not: with the arguments swapped the printed value was not the recall.
print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average="weighted"))