# Exploration notebook

This notebook first collects a dataset of Stack Overflow questions, then performs an initial exploratory analysis of it.

## Imports

In [None]:
!pip uninstall helpers -y

In [None]:
!pip install git+https://github.com/Xmaster6y/ML-Engineer@package

In [None]:
!pip install sentence_transformers

In [None]:
!pip install spacy

In [None]:
import os
import re
import pickle

import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
from numpy.random import default_rng

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

from sklearn.preprocessing import MultiLabelBinarizer

from sentence_transformers import SentenceTransformer

In [None]:
import helpers

In [None]:
# Ask spaCy to use a GPU when one is available, then load the small English
# pipeline; `nlp` is used by `lemmatize` further down.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

## Data loading

The dataset was generated using the Stack Overflow API.

- Dataset `raw_data_50k` :

```sql
SELECT TOP 50000 Title, Body, Tags, Id, Score, ViewCount, AnswerCount
FROM Posts
WHERE (
  PostTypeId = 1 AND AcceptedAnswerId IS NOT NULL
  AND (LEN(Tags) - LEN(REPLACE(Tags, '<','')) >= 4)
  ) AND (
  Score > 20 AND ViewCount > 1000
  )
ORDER BY Score DESC
```

In [None]:
file_name = "raw_data_50k.csv"
drive_file_id = "1F_-hp4ERdnr7GrCAVSDn_M-NWUOCb_be"
if not os.path.exists(file_name):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$drive_file_id" -O $file_name  && rm -rf /tmp/cookies.txt

In [None]:
# Load the downloaded CSV into a DataFrame.
df_50k = pd.read_csv(file_name)

In [None]:
# Column names, dtypes and non-null counts.
df_50k.info()

In [None]:
# Preview the first rows.
df_50k.head()

## Tags Analysis

In [None]:
# Raw tag strings, one per question (parsed below as "<tag1><tag2>..." strings).
tags_s = df_50k["Tags"].to_list()

In [None]:
def tag_str_to_list(tag_s: str):
    """Split a tag string such as ``"<python><pandas>"`` into a list of tags.

    The first and last characters (the outer ``<`` and ``>``) are dropped and
    the remainder is split on ``"><"``.

    Args:
        tag_s: Tag string in the ``"<tag1><tag2>..."`` format.

    Returns:
        The list of tag names. A tag string with no content (``""`` or
        ``"<>"``) yields an empty list instead of a list holding one empty tag.
    """
    inner = tag_s[1:-1]
    return inner.split('><') if inner else []
# One list of tag names per question, in the same order as tags_s.
tags_l = [tag_str_to_list(raw_tags) for raw_tags in tags_s]

In [None]:
# Preview the parsed tags of the first 10 questions.
tags_l[:10]

In [None]:
# Flatten the per-question tag lists into one list, keeping duplicates.
# A single pass is used instead of sum(..., start=[]), which copies the
# accumulated list at every step and is quadratic in the number of tags.
all_tags = [tag_name for question_tags in tags_l for tag_name in question_tags]

In [None]:
# Occurrence count of each tag, then its share of all tag occurrences.
occ_tags = {}
for tag in all_tags:
    occ_tags[tag] = occ_tags.get(tag, 0) + 1
N = len(all_tags)
freq_tags = {name: count / N for name, count in occ_tags.items()}

In [None]:
# (frequency, tag) pairs ordered from the most to the least frequent tag.
sorted_tags = sorted(
    ((frequency, name) for name, frequency in freq_tags.items()),
    reverse=True,
)
sorted_tags[:10]

In [None]:
# Number of distinct tags.
len(sorted_tags)

In [None]:
# Tags ranked 41st to 50th by frequency.
sorted_tags[40:50]

In [None]:
# Tags ranked 51st to 100th by frequency.
sorted_tags[50:100]

In [None]:
# Names of the 500 most frequent tags, most frequent first.
# Built with a comprehension so that `_` (IPython's last-output variable) is not
# overwritten and an empty ranking gives an empty tuple instead of a ValueError.
popular_tags = tuple(name for _frequency, name in sorted_tags[:500])

In [None]:
# Keep, for each question, only the tags that belong to popular_tags.
# Membership is tested against a set (constant-time lookup) rather than the
# tuple, which was scanned linearly for every tag of every question.
popular_tag_set = set(popular_tags)
cleaned_tags_l = [
    [t for t in tag_l if t in popular_tag_set] for tag_l in tags_l
]
cleaned_tags_l[:10]

## Tags Encoding

In [None]:
# Multi-hot encode the cleaned tags; passing `classes` fixes the column order to
# that of popular_tags.
encoder = MultiLabelBinarizer(classes=popular_tags)
encoded_tags = encoder.fit_transform(cleaned_tags_l)
encoded_tags.shape

In [None]:
# Encoded rows of the first 10 questions.
encoded_tags[:10]

## Tags correlation

In [None]:
# Correlation between tag columns (rowvar=False: each column is a variable).
corr = np.corrcoef(encoded_tags, rowvar=False)
# Boolean mask that is True on the diagonal and above it.
mask = np.triu(np.ones(corr.shape, dtype=bool))

In [None]:
# Heatmap of the correlations; cells where mask is True are not drawn.
sns.heatmap(corr, mask=mask)

In [None]:
# Scratch check of np.triu with k=0 on a small 4x3 example.
np.triu([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], 0)

In [None]:
# Number of unmasked (below-diagonal) entries with correlation above 0.62.
len(np.argwhere(corr*(1-mask) > 0.62))

In [None]:
# Number of unmasked (below-diagonal) entries with correlation below -0.1.
len(np.argwhere(corr*(1-mask) < -0.1))

In [None]:
# Positions below the diagonal whose correlation is above 0.62 or below -0.1.
indices = np.argwhere((corr*(1-mask) > 0.62) | (corr*(1-mask) < -0.1))
# (correlation, tag, tag) triples ordered from the highest correlation down.
corr_tags = sorted(
    (
        (corr[row, col], popular_tags[row], popular_tags[col])
        for row, col in indices
    ),
    reverse=True,
)
corr_tags

In [None]:
# Text-only figure listing the first n entries of corr_tags, each tag pair
# coloured by its correlation value, with a colorbar for the scale.
# NOTE(review): this indexes corr_tags[0] .. corr_tags[n-1], so corr_tags must
# hold at least n entries.
fig, ax = plt.subplots()
n = 12
N = 1000  # number of discrete colours requested from the colormap
cmap = plt.get_cmap('plasma_r', N)
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)

text_kwargs = dict(verticalalignment='center', horizontalalignment='left')
for i in range(n):
    # The first half is shifted up and the second half down, leaving a gap for
    # the separator drawn between them.
    offset = 0.4 if i < n // 2 else -0.4
    y = (n - i + offset) / (n + 1)
    value = corr_tags[i][0]
    colour = cmap(norm(value))
    ax.text(0.2, y, f'{value:.2f}', color='k', **text_kwargs)
    ax.text(0.35, y, f'{corr_tags[i][1:]}', color=colour,
            bbox=dict(facecolor='none', edgecolor=colour, boxstyle='round'),
            **text_kwargs)
ax.text(0.35, 0.48, '°°°', color='k', fontsize="xx-large", **text_kwargs)
plt.axis('off')

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
# The mappable is not attached to any Axes, so the Axes to take space from is
# passed explicitly; recent Matplotlib versions raise an error otherwise.
plt.colorbar(sm, ax=ax, location='right')

plt.show()

## Raw title embedding

In [None]:
# Load the sentence-embedding model and check it on two sample sentences.
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
embeddings.shape

In [None]:
# All question titles, in DataFrame order.
titles = df_50k["Title"].to_list()
titles[:10]

In [None]:
# Embed every title with the model loaded above.
title_embeddings = model.encode(titles)
title_embeddings.shape

## Text processing and analysis

### Processing

In [None]:
def extract_text_code(body, max_length=1000):
    """Split the HTML body of a post into its prose and its code.

    Args:
        body: HTML markup of the post.
        max_length: Maximum number of characters kept for each output string.

    Returns:
        A tuple ``(text, text_cropped, code, code_cropped)``. ``text`` is the
        newline-joined content of the ``h1``-``h4`` and ``p`` tags and ``code``
        the newline-joined content of the ``code`` tags; non-ASCII characters
        are removed from both and each is truncated to ``max_length``
        characters. The two booleans tell whether the corresponding string was
        longer than ``max_length`` before truncation.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ from one environment to another.
    soup = BeautifulSoup(body, "html.parser")

    def joined_ascii(tag_names):
        """Join the text of the matching tags with newlines and drop non-ASCII."""
        parts = [tag.getText() for tag in soup.find_all(tag_names)]
        return '\n'.join(parts).encode('ascii', errors='ignore').decode('ascii')

    text = joined_ascii(['h1', 'h2', 'h3', 'h4', 'p'])
    code = joined_ascii('code')
    text_cropped = len(text) > max_length
    code_cropped = len(code) > max_length
    return text[:max_length], text_cropped, code[:max_length], code_cropped

In [None]:
# Raw HTML body of the first question.
print(df_50k["Body"][0])

In [None]:
# Extracted code (element 2 of the returned tuple) of the first question.
print(extract_text_code(df_50k["Body"][0])[2])

In [None]:
# Run the extraction on every body; each element is a 4-tuple.
text_codes_l = df_50k["Body"].apply(extract_text_code)

In [None]:
# Transpose the per-question 4-tuples into four parallel tuples.
text_l, text_cropped_l, code_l, code_cropped_l = zip(*text_codes_l)

In [None]:
# Build the cleaned frame: remove the raw body and identifier columns, then add
# the extracted text and code with their truncation flags.
# errors="ignore" keeps the cell working when a listed column is absent: the
# query documented above does not select AcceptedAnswerId, for instance.
# drop() returns a new frame, so df_50k itself is left untouched.
df_final = df_50k.drop(columns=["Body", "Id", "AcceptedAnswerId"], errors="ignore")
df_final["BodyText"] = text_l
df_final["BodyCode"] = code_l
df_final["BodyTextCropped"] = text_cropped_l
df_final["BodyCodeCropped"] = code_cropped_l

In [None]:
# Preview the cleaned frame.
df_final.head()

In [None]:
# Save the cleaned frame without the index column.
df_final.to_csv("df_cleaned.csv", index=False)

### Vocabulary analysis

In [None]:
# Draw N distinct row positions with a fixed seed; the same subsample is reused
# for the vocabulary analysis and the dimension reduction below.
N = 2000
rng = default_rng(seed=6)
numbers = rng.choice(len(df_final), size=N, replace=False)

In [None]:
def lemmatize(text):
    """Return the lemmas of `text` joined by spaces, without stop words or punctuation."""
    kept_lemmas = (
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop and not tok.is_punct
    )
    return ' '.join(kept_lemmas)

In [None]:
# Bag-of-words vectorizer that lemmatizes each document first. A custom
# preprocessor replaces CountVectorizer's own preprocessing stage (lowercasing
# and accent stripping).
bow = CountVectorizer(preprocessor = lemmatize)

In [None]:
def _term_counts(vectorizer, counts_matrix):
    """Map each term of the fitted vectorizer to its total count in counts_matrix."""
    totals = np.asarray(counts_matrix.sum(axis=0)).ravel()
    return {term: int(totals[col]) for term, col in vectorizer.vocabulary_.items()}


# `vocabulary_` maps a term to its COLUMN INDEX, not to how often it occurs, so
# using it as word-cloud frequencies sized the words by alphabetical position.
# The *_voc dictionaries now hold term -> total number of occurrences instead.
bow_title = bow.fit_transform(df_final.loc[numbers,"Title"].to_list())
bow_title_voc = _term_counts(bow, bow_title)
bow_body_text = bow.fit_transform(df_final.loc[numbers,"BodyText"].to_list())
bow_body_text_voc = _term_counts(bow, bow_body_text)
bow_body_code = bow.fit_transform(df_final.loc[numbers,"BodyCode"].to_list())
bow_body_code_voc = _term_counts(bow, bow_body_code)

In [None]:
# (documents, vocabulary size) for titles, body text and body code.
print(bow_title.shape)
print(bow_body_text.shape)
print(bow_body_code.shape)

In [None]:
# Word cloud of the title vocabulary, sized by the values of bow_title_voc.
cloud_settings = dict(
    width=1000, height=600, max_words=100,
    random_state=1, background_color='white', colormap='viridis_r',
    collocations=False,
)
wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(bow_title_voc)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the body-text vocabulary, sized by the values of bow_body_text_voc.
cloud_settings = dict(
    width=1000, height=600, max_words=100,
    random_state=1, background_color='white', colormap='viridis_r',
    collocations=False,
)
wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(bow_body_text_voc)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the body-code vocabulary, sized by the values of bow_body_code_voc.
cloud_settings = dict(
    width=1000, height=600, max_words=100,
    random_state=1, background_color='white', colormap='viridis_r',
    collocations=False,
)
wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(bow_body_code_voc)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Dimension reduction

In [None]:
# Embeddings restricted to the sampled rows: titles are sliced from the
# embeddings already computed, body text and code are encoded here.
title_embeddings_sub = title_embeddings[numbers]

text_embeddings_sub = model.encode(df_final.loc[numbers,"BodyText"].to_list())
code_embeddings_sub = model.encode(df_final.loc[numbers,"BodyCode"].to_list())

In [None]:
def to_name(encoded_tag):
    """Return the popular tag of the last non-zero position, or "none" if all are zero."""
    position = len(encoded_tag)
    # Walk from the last position down to the first.
    while position > 0:
        position -= 1
        if encoded_tag[position]:
            return popular_tags[position]
    return "none"

# One label per sampled question, computed from the first 10 tag columns only:
# the highest-indexed active column among them, or "none".
pseudo_labels = list(map(to_name, encoded_tags[numbers,:10]))

### PCA

In [None]:
# One PCA per embedding type, keeping as many components as the title
# embeddings have dimensions.
n_components=title_embeddings_sub.shape[1]
title_pca = PCA(n_components=n_components)
text_pca = PCA(n_components=n_components)
code_pca = PCA(n_components=n_components)

In [None]:
# Fit each PCA and project the corresponding embeddings.
X_title_proj = title_pca.fit_transform(title_embeddings_sub)
X_text_proj = text_pca.fit_transform(text_embeddings_sub)
X_code_proj = code_pca.fit_transform(code_embeddings_sub)

In [None]:
# Scree plot for the title PCA: bars show the explained variance of each
# component (in %), the red line their running total.
scree = 100 * title_pca.explained_variance_ratio_
scree_cum = np.cumsum(scree)
x_list = range(1, n_components + 1)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum, c="red", marker='o')
plt.xlabel("Inertia rank")
plt.ylabel("Inertia percentage")
plt.title("Eigen value cumulative graph")
plt.show(block=False)

In [None]:
# Scree plot for the body-text PCA: bars show the explained variance of each
# component (in %), the red line their running total.
scree = 100 * text_pca.explained_variance_ratio_
scree_cum = np.cumsum(scree)
x_list = range(1, n_components + 1)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum, c="red", marker='o')
plt.xlabel("Inertia rank")
plt.ylabel("Inertia percentage")
plt.title("Eigen value cumulative graph")
plt.show(block=False)

In [None]:
# Scree plot for the body-code PCA: bars show the explained variance of each
# component (in %), the red line their running total.
scree = 100 * code_pca.explained_variance_ratio_
scree_cum = np.cumsum(scree)
x_list = range(1, n_components + 1)
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum, c="red", marker='o')
plt.xlabel("Inertia rank")
plt.ylabel("Inertia percentage")
plt.title("Eigen value cumulative graph")
plt.show(block=False)

In [None]:
# Weights of the first principal component of the code embeddings, one bar per
# embedding feature; x ticks are thinned to one every 50 features.
sns.barplot(x=np.arange(n_components),y=code_pca.components_[0,:])
ax=plt.gca()
ax.set_xticks(np.arange(n_components,step=50))
plt.xlabel("feature")
plt.ylabel("value")
plt.title("Component 0 for code embedding")

In [None]:
# Title projection coloured by pseudo label; x_y is presumably the pair of
# component indices to plot (verify against the helper).
x_y = (0,1)
helpers.plot.pca.display_factorial_planes(X_title_proj, x_y, clusters=pseudo_labels)

In [None]:
# Title projection, presumably on components 2 and 3 (verify against the helper).
x_y = (2,3)
helpers.plot.pca.display_factorial_planes(X_title_proj, x_y, clusters=pseudo_labels)

In [None]:
# Title projection, presumably on components 1 and 2 (verify against the helper).
x_y = (1,2)
helpers.plot.pca.display_factorial_planes(X_title_proj, x_y, clusters=pseudo_labels)

In [None]:
# Body-text projection, presumably on components 0 and 1 (verify against the helper).
x_y = (0,1)
helpers.plot.pca.display_factorial_planes(X_text_proj, x_y, clusters=pseudo_labels)

In [None]:
# Body-code projection, presumably on components 0 and 1 (verify against the helper).
x_y = (0,1)
helpers.plot.pca.display_factorial_planes(X_code_proj, x_y, clusters=pseudo_labels)

In [None]:
# Body-code projection, presumably on components 2 and 3 (verify against the helper).
x_y = (2,3)
helpers.plot.pca.display_factorial_planes(X_code_proj, x_y, clusters=pseudo_labels)

In [None]:
# Body-code projection, presumably on components 1 and 2 (verify against the helper).
x_y = (1,2)
helpers.plot.pca.display_factorial_planes(X_code_proj, x_y, clusters=pseudo_labels)

### t-SNE

In [None]:
# 2-D t-SNE of the sampled title embeddings (perplexity 40, fixed seed).
tsne = TSNE(n_components=2, verbose=1, perplexity=40.0, random_state=42)
z = tsne.fit_transform(title_embeddings_sub)

In [None]:
# Scatter plot of the current 2-D t-SNE coordinates in z, coloured by pseudo label.
df = pd.DataFrame({
    "hue": pseudo_labels,
    "comp-1": z[:, 0],
    "comp-2": z[:, 1],
})

scatter_axes = sns.scatterplot(x="comp-1", y="comp-2", hue="hue",
                               palette=sns.color_palette("hls", 11),
                               data=df)
scatter_axes.set(title="T-SNE projection")

In [None]:
# 2-D t-SNE of the sampled code embeddings (perplexity 40, fixed seed).
tsne = TSNE(n_components=2, verbose=1, perplexity=40.0, random_state=42)
z = tsne.fit_transform(code_embeddings_sub)

In [None]:
# Scatter plot of the current 2-D t-SNE coordinates in z, coloured by pseudo label.
df = pd.DataFrame({
    "hue": pseudo_labels,
    "comp-1": z[:, 0],
    "comp-2": z[:, 1],
})

scatter_axes = sns.scatterplot(x="comp-1", y="comp-2", hue="hue",
                               palette=sns.color_palette("hls", 11),
                               data=df)
scatter_axes.set(title="T-SNE projection")

In [None]:
# KL divergence reached by t-SNE on the title embeddings for 10 perplexity
# values between 5 and 50.
perplexity = np.linspace(5,50,10, dtype=int)
divergence = []

for i in perplexity:
    # Named tsne_model so the sentence-embedding `model` defined earlier is not
    # overwritten. The seed is fixed, as in the other t-SNE cells, so the curve
    # is reproducible. Only the fitted divergence is needed, hence fit().
    tsne_model = TSNE(n_components=2, init="pca", perplexity=i, verbose=1, random_state=42)
    tsne_model.fit(title_embeddings_sub)
    divergence.append(tsne_model.kl_divergence_)
fig = px.line(x=perplexity, y=divergence, markers=True)
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
fig.update_traces(line_color="red", line_width=1)
fig.show()

In [None]:
# S criterion per perplexity: 2 * KL divergence + log(n) * perplexity / n.
# n is read from the data that was embedded rather than from the global N,
# which is reassigned several times in this notebook.
n_points = len(title_embeddings_sub)
s_criterion = 2.*np.array(divergence) + np.log(n_points)*perplexity/n_points
plt.plot(perplexity, s_criterion, c="red", marker='o')
# Vertical marker at perplexity 40; the y limits are restored afterwards so the
# line does not rescale the axis.
ymin, ymax = plt.ylim()
plt.vlines(40, ymin=ymin, ymax=ymax)
plt.ylim(ymin, ymax )
plt.xlabel("Perplexity")
plt.ylabel("S criterion")
plt.show(block=False)

In [None]:
# KL divergence reached by t-SNE on the code embeddings for 10 perplexity
# values between 5 and 50.
perplexity = np.linspace(5,50,10, dtype=int)
divergence = []

for i in perplexity:
    # Named tsne_model so the sentence-embedding `model` defined earlier is not
    # overwritten. The seed is fixed, as in the other t-SNE cells, so the curve
    # is reproducible. Only the fitted divergence is needed, hence fit().
    tsne_model = TSNE(n_components=2, init="pca", perplexity=i, verbose=1, random_state=42)
    tsne_model.fit(code_embeddings_sub)
    divergence.append(tsne_model.kl_divergence_)
fig = px.line(x=perplexity, y=divergence, markers=True)
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
fig.update_traces(line_color="red", line_width=1)
fig.show()

In [None]:
# S criterion per perplexity: 2 * KL divergence + log(n) * perplexity / n.
# n is read from the data that was embedded rather than from the global N,
# which is reassigned several times in this notebook.
n_points = len(code_embeddings_sub)
s_criterion = 2.*np.array(divergence) + np.log(n_points)*perplexity/n_points
plt.plot(perplexity, s_criterion, c="red", marker='o')
# Vertical marker at perplexity 40; the y limits are restored afterwards so the
# line does not rescale the axis.
ymin, ymax = plt.ylim()
plt.vlines(40, ymin=ymin, ymax=ymax)
plt.ylim(ymin, ymax )
plt.xlabel("Perplexity")
plt.ylabel("S criterion")
plt.show(block=False)

In [None]:
# 2-D t-SNE of the sampled code embeddings with a small perplexity (5).
tsne = TSNE(n_components=2, verbose=1, perplexity=5, random_state=42)
z = tsne.fit_transform(code_embeddings_sub)

In [None]:
# Scatter plot of the current 2-D t-SNE coordinates in z, coloured by pseudo label.
df = pd.DataFrame({
    "hue": pseudo_labels,
    "comp-1": z[:, 0],
    "comp-2": z[:, 1],
})

scatter_axes = sns.scatterplot(x="comp-1", y="comp-2", hue="hue",
                               palette=sns.color_palette("hls", 11),
                               data=df)
scatter_axes.set(title="T-SNE projection")

In [None]:
# 2-D t-SNE of the sampled code embeddings with a very large perplexity (1000).
tsne = TSNE(n_components=2, verbose=1, perplexity=1000, random_state=42)
z = tsne.fit_transform(code_embeddings_sub)

In [None]:
# Scatter plot of the current 2-D t-SNE coordinates in z, coloured by pseudo label.
df = pd.DataFrame({
    "hue": pseudo_labels,
    "comp-1": z[:, 0],
    "comp-2": z[:, 1],
})

scatter_axes = sns.scatterplot(x="comp-1", y="comp-2", hue="hue",
                               palette=sns.color_palette("hls", 11),
                               data=df)
scatter_axes.set(title="T-SNE projection")

## Export

In [None]:
# Output file for the pickled export below.
export_name = "title_embedding_50k_500t.pkl"

In [None]:
# Pickle the full title embeddings together with the encoded tags and the tag
# names that give the meaning of each encoded column.
with open(export_name, "wb") as f:
    pickle.dump((title_embeddings, encoded_tags, popular_tags), f)