In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Catégorisez automatiquement des questions - Stack Overflow

#### Objectif: 
Développement d'un système de suggestion de tags via une API pour faciliter la recherche des utilisateurs sur le site Stack Overflow. Celui-ci prendra la forme d’un algorithme de machine learning qui assignera automatiquement plusieurs tags pertinents à une question.


Dans ce notebook:

- Récupération des données sur le site StackExchange
- Nettoyage du texte
- Vectorisation

### Sommaire

* [Import](#chapter1)
    * [Fichiers](#section_1_1)
    * [Librairies](#section_1_2)    
    * [Nos Données](#section_1_3)

* [Text Cleaning](#chapter2)
    * [Lower case](#section_2_1)
    * [Noise Removal](#section_2_2) 
    * [Punctuation](#section_2_3)

* [Preprocess data](#chapter3)
    * [Tokenization](#section_3_2)
    * [Remove stopwords](#section_3_3) 
    * [Lemmatization](#section_3_4)
    
* [Vectorization](#chapter4)

## Import <a class="anchor" id="chapter1"></a>
### Import des fichiers <a class="anchor" id="section_1_1"></a>

#### Récupération des données sur le site StackExchange

Remarque: On peut charger au maximum 50 000 entrées/ requête SQL
Période: 2020-01-01 à 2022-09-30

#### Code SQL utilisé
SELECT TOP 500000 CreationDate,Title, Body, Tags, Id, Score, ViewCount, FavoriteCount, AnswerCount
FROM Posts 
WHERE PostTypeId = 1 AND ViewCount > 10 AND FavoriteCount > 10
AND Score > 5 AND AnswerCount > 0 AND LEN(Tags) - LEN(REPLACE(Tags, '<','')) >= 5

### Import des librairies <a class="anchor" id="section_1_2"></a>

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import IPython.display

import seaborn as sns 
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import textblob 
from wordcloud import WordCloud

import requests 
import re

from collections import Counter

import nltk
# NLTK resources required by the cells below: tokenizer models, the English
# stop-word list, WordNet (plus its multilingual companion data) and the
# perceptron POS tagger that `nltk.pos_tag` relies on during lemmatization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import WordPunctTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import CountVectorizer


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 300)

#### Récuperation des fichiers de StackExchange

In [None]:
data = pd.read_csv('data.csv')

In [None]:
data.shape

In [None]:
data.head(1)

In [None]:
data.describe()

In [None]:
data["Tags"].value_counts(ascending=False).loc[lambda x : x>100]

In [None]:
data["Tags"].value_counts().to_frame()

In [None]:
data["Tags"].nunique()

## Text Cleaning <a class="anchor" id="chapter2"></a>

In [None]:
data.duplicated().sum()

In [None]:
data['CreationDate'] = pd.to_datetime(data['CreationDate'])

In [None]:
# Group the questions by year of creation and count the titles in each bin
year = data.groupby(pd.Grouper(key='CreationDate',
                                    freq='1Y')).agg({'Title': 'count'})

# Line plot of the yearly question count; the dashed red line marks the mean
# of the yearly counts and carries the only legend entry
fig = plt.figure(figsize=(15,6))
sns.lineplot(data=year, x=year.index, y='Title')
plt.axhline(year.Title.mean(), 
            color="r", linestyle='--',
            label="Mean of question per year : {:04d}"\
                   .format(int(year.Title.mean())))
plt.xlabel("Date of questions")
plt.ylabel("Number of questions")
plt.title("Evolution du nombre de postes de 2009 to 2022",
          fontsize=18, color="#641E16")
plt.legend()
# Save the figure with a transparent background (file name has no extension)
plt.savefig("Evolution du nombre de postes",transparent=True)
plt.show()

In [None]:
# Distribution of title lengths, measured in characters (`.str.len()`)

fig = plt.figure(figsize=(20, 12))
ax = sns.countplot(x=data.Title.str.len())
start, end = ax.get_xlim()
# One tick every 5 positions to keep the x axis readable
ax.xaxis.set_ticks(np.arange(0, end, 5))
# Dashed red line for the median title length.
# NOTE(review): the x position is `median - min`, which presumably converts a
# length into a bar position on the categorical axis; that only lines up if
# every length between min and median occurs in the data -- confirm.
plt.axvline(data.Title.str.len().median() - data.Title.str.len().min(),
            color="r", linestyle='--',
            label="Title Lenght median : "+str(data.Title.str.len().median()))
ax.set_xlabel("Lenght of title")
plt.title("Title lenght of Stackoverflow questions",
          fontsize=18, color="#641E16")
plt.legend()
plt.show()

In [None]:
# Character length of every question body, as a one-column DataFrame
X = pd.DataFrame(data.Body.str.len())

# Bin the lengths into 200 equal-width bins (ordinal bin indices) ...
from sklearn.preprocessing import KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=200,
                               encode='ordinal',
                               strategy='uniform')
body_lenght = discretizer.fit_transform(X)
# ... then map the bin indices back to the length scale and flatten the
# result into a Series so it can be fed to a count plot
body_lenght = discretizer.inverse_transform(body_lenght)
body_lenght = pd.Series(body_lenght.reshape(-1))


In [None]:
# Count plot of the discretized body lengths (one bar per length bin)
fig = plt.figure(figsize=(20, 12))
ax = sns.countplot(x=body_lenght)
start, end = ax.get_xlim()
# One tick every 25 positions to keep the x axis readable
ax.xaxis.set_ticks(np.arange(0, end, 25))
ax.set_xlabel("Lenght of Body (after discretization)")
plt.title("Body lenght of Stackoverflow questions",
          fontsize=18, color="#641E16")
# No plt.legend() here: nothing drawn in this figure carries a label, so the
# call only produced an empty legend and a "no artists with labels" message.
plt.show()

In [None]:
# Concatenate title and body into a single text field.
# - The separating space keeps the last word of the title from being glued to
#   the first word of the body when the body does not start with an HTML tag.
# - fillna('') keeps a row usable when one of the two fields is missing
#   (NaN + str would otherwise turn the whole text into NaN).
data['Text'] = data['Title'].fillna('') + ' ' + data['Body'].fillna('')
data.head(1)

In [None]:
# Character length of the combined text (Title + Body), as a one-column frame
X = pd.DataFrame(data.Text.str.len())

# Bin the lengths into 200 equal-width bins, then map the bin indices back to
# the length scale so the x axis stays expressed in characters
from sklearn.preprocessing import KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=200,
                               encode='ordinal',
                               strategy='uniform')
# Stored under its own name so the body-length series computed earlier is not
# silently replaced by text lengths.
text_length = discretizer.fit_transform(X)
text_length = discretizer.inverse_transform(text_length)
text_length = pd.Series(text_length.reshape(-1))

fig = plt.figure(figsize=(20, 12))
ax = sns.countplot(x=text_length)
start, end = ax.get_xlim()
ax.xaxis.set_ticks(np.arange(0, end, 25))
# Labels describe what is actually plotted: the Text column, not the Body
ax.set_xlabel("Length of Text (after discretization)")
plt.title("Text length of Stackoverflow questions",
          fontsize=18, color="#641E16")
# No plt.legend(): no artist in this figure has a label
plt.show()

## Nettoyage du texte

### Lower Case

In [None]:
# Vectorized lower-casing: unlike a per-row `x.lower()` lambda, `.str.lower()`
# propagates missing values instead of raising on non-string cells.
data['Text_lower'] = data['Text'].str.lower()

In [None]:
# Strip HTML markup (and <style>/<script> content) from a post
def remove_code(html):
    """Return the text content of an HTML fragment as a single string.

    The fragment is parsed with BeautifulSoup's ``html.parser``. Every
    ``<style>`` and ``<script>`` element is removed from the tree, then the
    remaining stripped strings are joined with single spaces.

    NOTE(review): despite the function name, ``<code>``/``<pre>`` elements
    are not removed here; only ``style`` and ``script`` are.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Drop the elements whose content is never part of the readable text
    for element in parsed(['style', 'script']):
        element.decompose()

    text_chunks = parsed.stripped_strings
    return ' '.join(text_chunks)

In [None]:
data['Text_BeautifulSoup']=data['Text_lower'].apply(remove_code)

In [None]:
data.head(1)

### Regex

In [None]:
# Regex-based cleaning: HTML tags, links, mentions, hashtags, digits, punctuation
def regex(text):
    """Reduce a raw post to space-separated ASCII letters-only words.

    Steps, in this order (the order matters):
      1. HTML tags are replaced by a space.
      2. Links (``http://``, ``https://``, ``www.``) are removed while their
         punctuation is still present, so the whole URL goes away and not
         just its scheme.
      3. ``@mentions`` and ``#hashtags`` are removed.
      4. Contraction suffixes (``'t``, ``'s``, ``'ll`` ...) are removed while
         the apostrophe is still present; a quoted word such as ``'hello'``
         is left alone because the apostrophe must follow a word character.
      5. Non-ASCII characters are dropped.
      6. Any token containing a digit is removed.
      7. Every remaining non-letter becomes a space and runs of whitespace
         are collapsed.

    Parameters
    ----------
    text : object
        Text to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing whitespace.
    """
    text = str(text)

    # Remove html tags
    text = re.sub(r'<[^>]*>', ' ', text)
    # Remove links (must run before punctuation is stripped)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Remove usernames "@"
    text = re.sub(r'@\S+', ' ', text)
    # Remove hashtags
    text = re.sub(r'#\S+', ' ', text)
    # Remove English contraction suffixes (must run before punctuation is stripped)
    text = re.sub(r"(?<=\w)'\w+", '', text)
    # Remove non-ASCII characters
    text = text.encode("ascii", "ignore").decode()
    # Remove tokens that contain digits
    text = re.sub(r'\w*\d+\w*', ' ', text)
    # Replace punctuation and any other non-letter with a space
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Collapse whitespace (including newlines) and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
data['Text_regex'] = data['Text_BeautifulSoup'].apply(regex)

In [None]:
data['Text_regex'][13690]

In [None]:
#def tag_clean(text):
 #   text=text.replace('<','')
  #  text=text.replace('>','')
    
   # return text

In [None]:
#data['Tags']=data['Tags'].apply(tag_clean)


## Preprocessing <a class="anchor" id="chapter3"></a>
### Spelling correction <a class="anchor" id="section_3_1"></a>

Nous supposons que, sur les forums et les blogs, il y a souvent des erreurs de frappe ou d'orthographe.

In [None]:
pip install -U textblob

In [None]:
from textblob import TextBlob

In [None]:
def corr(text):
  """Run TextBlob's spelling correction on ``text``.

  Returns whatever ``TextBlob(text).correct()`` produces.
  """
  return TextBlob(text).correct()

### Tokenization <a class="anchor" id="section_3_2"></a>

In [None]:
data['Text_tokenized']=data['Text_regex'].apply(word_tokenize)

### Stopwords <a class="anchor" id="section_3_3"></a>

In [None]:
def remove_english_stopwords_func(text):
    '''
    Removes English Stop Words (also capitalized) and returns the remaining
    words joined by single spaces.

    Args:
        text (list[str] | str): Tokens to filter. A plain string is accepted
            as well and is split on whitespace first.

    Returns:
        str: The kept tokens, in their original order and casing, joined by
        a single space.
    '''
    # Imported under a private alias so the helper keeps working even if the
    # notebook rebinds the global name `stopwords` to something else.
    from nltk.corpus import stopwords as _nltk_stopwords

    # Load the stop-word list once per call instead of once per token.
    english_stopwords = frozenset(_nltk_stopwords.words("english"))

    tokens = text.split() if isinstance(text, str) else text
    # check in lowercase
    return ' '.join(token for token in tokens
                    if token.lower() not in english_stopwords)

In [None]:
# Build the stop-word set once, under its own name. Rebinding `stopwords`
# itself would replace the NLTK corpus reader with a plain set, so re-running
# this cell (or calling `stopwords.words(...)` anywhere afterwards) would fail.
english_stopwords = set(stopwords.words('english'))
data['Text_stopwords'] = data['Text_tokenized'].apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords])

### Lemmatization <a class="anchor" id="section_3_4"></a>

In [None]:
def get_pos_tags(word):
    """Return the WordNet part-of-speech constant to use for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased
    first letter of the tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    first_letter = tag[0].upper()

    if first_letter == "J":   # adjective
        return wordnet.ADJ
    if first_letter == "N":   # noun
        return wordnet.NOUN
    if first_letter == "V":   # verb
        return wordnet.VERB
    if first_letter == "R":   # adverb
        return wordnet.ADV
    return wordnet.NOUN

def lemma_fct(list_words):
    """Lemmatize each word of ``list_words`` with a WordNet lemmatizer.

    Every word is lemmatized using the part of speech returned by
    ``get_pos_tags`` for that word. Returns the list of lemmas, in the same
    order as the input.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in list_words:
        lemmas.append(wn_lemmatizer.lemmatize(word, get_pos_tags(word)))
    return lemmas

In [None]:
data['Text_lemmatized']=data['Text_stopwords'].apply(lemma_fct)

In [None]:
def remove_single_char_func(text, threshold=1):
    """Drop the tokens of ``text`` that are not longer than ``threshold``.

    ``text`` is tokenized with ``word_tokenize``; tokens with more than
    ``threshold`` characters are kept and re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if len(token) > threshold:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
data['Cleaned_text'] = data['Text_lemmatized'].apply(lambda x: ' '.join(x))

In [None]:
data['Cleaned_text']= data['Cleaned_text'].apply(remove_single_char_func)

In [None]:
# Number of tokens in each cleaned text.
# Computed directly from `Cleaned_text`: the `length_cleaned_text` column is
# only created in a later cell, so reading it here raises a KeyError when the
# notebook is run top to bottom on a fresh kernel.
data['body_tokens_count'] = data['Cleaned_text'].apply(
    lambda cleaned: len(word_tokenize(str(cleaned))))
# Countplot of body lenght
fig = plt.figure(figsize=(20, 12))
ax = sns.countplot(x=data.body_tokens_count)
start, end = ax.get_xlim()
ax.xaxis.set_ticks(np.arange(0, end, 25))
# Dashed red line for the median token count
plot_median = data.body_tokens_count.median()
plt.axvline(plot_median - data.body_tokens_count.min(),
            color="r", linestyle='--',
            label="Body tokens Lenght median : "+str(plot_median))
ax.set_xlabel("Lenght of body tokens")
plt.title("Body tokens lenght of Stackoverflow questions after cleaning",
          fontsize=18, color="#641E16")
plt.legend()
plt.show()

In [None]:
data.body_tokens_count.median()

In [None]:
data.body_tokens_count.quantile([0.25,0.5,0.75])

In [None]:
data.body_tokens_count.quantile(0.95)

In [None]:
data.body_tokens_count.max()

In [None]:
data.shape

In [None]:
# Drop the longest 5 % of questions (token-count outliers) and keep the rest;
# the previous `>` comparison kept only those outliers and discarded 95 % of
# the data.
# `.copy()` makes the result an independent frame, so the column assignments
# in the following cells do not write into a slice of the unfiltered data.
data = data[data.body_tokens_count <= data.body_tokens_count.quantile(0.95)].copy()
data.shape

In [None]:
data['length_cleaned_text'] = data['Cleaned_text'].apply(lambda x : len(word_tokenize(str(x))))
max_length_cleaned_data = data['length_cleaned_text'].max()
print("max length Cleaned_text : ", max_length_cleaned_data)

In [None]:
# Filter data on body lenght
data = data[data.Body.str.len() < 4000]
data.shape

In [None]:
data.head(1)

In [None]:
data['Cleaned_text'] = data['Cleaned_text'].astype("string")

In [None]:
data['Cleaned_text'].dtype

In [None]:
data.head(0)

In [None]:
# Show the row(s) holding the longest cleaned text. The maximum is recomputed
# here instead of being hard-coded from one particular run, so the cell still
# finds a row after the data or the filters above change.
data[data['length_cleaned_text'] == data['length_cleaned_text'].max()]

In [None]:
data['Cleaned_text'][13690]

In [None]:
# Word counter
def word_count_func(text):
    """Return how many whitespace-separated words ``text`` contains."""
    words = text.split()
    return len(words)

### Les tags

In [None]:
data["Tags"].nunique()

In [None]:
# Turn "<tag1><tag2>" into "tag1,tag2,": drop every "<" and map ">" to ","
data['Tags'] = data['Tags'].str.translate(str.maketrans({'<': '', '>': ','}))

# Remove the trailing "," left by the last ">". `rstrip` (instead of slicing
# off the last character) keeps the cell idempotent: running it a second time
# no longer eats the final letter of the last tag.
data['Tags'] = data['Tags'].str.rstrip(',')
data['Tags'].head(3)

In [None]:
def count_split_tags(df, column, separator):
    """Count how often each separated word occurs in a text column.

    Every cell of ``df[column]`` is split on ``separator``; all resulting
    words are pooled and counted.

    Parameters
    ----------------------------------------
    df : Pandas Dataframe
        Dataframe to use.
    column : string
        Column of the dataframe to use
    separator : string
        Separator character for str.split.
    ----------------------------------------

    Returns
    ----------------------------------------
    Pandas Dataframe
        Indexed by "Tag", with a single "tag_count" column,
        sorted by decreasing count.
    ----------------------------------------
    """
    # Flatten the per-row lists of words into one long list
    pooled_words = [word
                    for row_words in df[column].str.split(separator)
                    for word in row_words]

    counts = (
        pd.DataFrame(pooled_words, columns=["Tag"])
        .groupby("Tag")
        .agg(tag_count=pd.NamedAgg(column="Tag", aggfunc="count"))
    )
    return counts.sort_values("tag_count", ascending=False)

In [None]:
tags_list = count_split_tags(df=data, column='Tags', separator=',')
print("Le jeu de données compte {} tags.".format(tags_list.shape[0]))

In [None]:
# Plot the results of splits
fig = plt.figure(figsize=(15, 8))
sns.barplot(data=tags_list.iloc[0:40, :],
            x=tags_list.iloc[0:40, :].index,
            y="tag_count", color="#f48023")
plt.xticks(rotation=90)
plt.title("40 most popular tags in Stackoverflow (2009 - 2020)",
          fontsize=18, color="#641E16")
plt.show()

In [None]:
# Plot word cloud with tags_list (frequencies)
fig = plt.figure(1, figsize=(17, 12))
ax = fig.add_subplot(1, 1, 1)
wordcloud = WordCloud(width=900, height=500,
                      background_color="black",
                      max_words=500, relative_scaling=1,
                      normalize_plurals=False)\
    .generate_from_frequencies(tags_list.to_dict()['tag_count'])

ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.title("Word Cloud of 500 best Tags on StackOverflow (2009 - 2020)\n",
          fontsize=18, color="#641E16")
plt.show()

Il est intéressant de voir si les tags populaires ont évolué au cours du temps.

In [None]:
# Subplots parameters
years = {0: 2009, 1: 2012, 2: 2018, 3: 2022}
colors = {0: "#ff5858", 1: "#dba0db",
          2: "#72d9f0", 3: "#a5cf27"}
subplots = 4
cols = 2
rows = subplots // cols
rows += subplots % cols
position = range(1, subplots + 1)

# Plot popular tags for each year
fig = plt.figure(1, figsize=(20, 16))
for k in range(subplots):
    subset = data[data["CreationDate"].dt.year == years[k]]
    temp_list = count_split_tags(df=subset, column='Tags', separator=',')
    ax = fig.add_subplot(rows, cols, position[k])
    sns.barplot(data=temp_list.iloc[0:20, :],
            x=temp_list.iloc[0:20, :].index,
            y="tag_count", color=colors[k])
    plt.xticks(rotation=90)
    ax.set_title("20 most popular tags for {}".format(years[k]),
                 fontsize=18, color="#fcc642")

fig.tight_layout()
plt.show()

In [None]:
# Nombre de Tags par question :

# Create a list of Tags and count the number
data['Tags_list'] = data['Tags'].str.split(',')
data['Tags_count'] = data['Tags_list'].apply(lambda x: len(x))

# Plot the result
fig = plt.figure(figsize=(12, 8))
ax = sns.countplot(x=data.Tags_count, color="#f48023")
ax.set_xlabel("Tags")
plt.title("Number of tags used per question",
          fontsize=18, color="#fcc642")
plt.show()

Comme nous avons plus de 27 000 tags et que les algorithmes NLP sont assez lents, nous allons filtrer nos données : pour chaque question, nous ne conservons que les tags qui font partie des 50 tags les plus fréquents, puis nous supprimons les questions qui n'en contiennent aucun.

In [None]:
def filter_tag(x, top_list):
    """Keep only the tags of ``x`` that also appear in ``top_list``.

    Order and duplicates of ``x`` are preserved; ``x`` itself is not modified.

    Parameters
    ----------------------------------------
    x : list
        List of tags to test.
    top_list : list
        Tags that are allowed to remain.
    ----------------------------------------

    Returns
    ----------------------------------------
    list
        New list with the tags of ``x`` found in ``top_list``.
    ----------------------------------------
    """
    return [tag for tag in x if tag in top_list]

In [None]:
top_tags = list(tags_list.iloc[0:50].index)
data['Tags_list'] = data['Tags_list']\
                    .apply(lambda x: filter_tag(x, top_tags))
data['number_of_tags'] = data['Tags_list'].apply(lambda x : len(x))
data = data[data.number_of_tags > 0]
print("New size of dataset : {} questions.".format(data.shape[0]))

In [None]:
data.head(1)

In [None]:
def tag_clean(text):
    """Return ``text`` with every square bracket ('[' and ']') removed.

    NOTE(review): a second ``tag_clean`` defined later in this notebook
    replaces this definition once its cell has run.
    """
    for bracket in ('[', ']'):
        text = text.replace(bracket, '')
    return text

In [None]:
data['top_tags'] =data['Tags_list'].apply(lambda x:', '.join(x))

In [None]:
data.head(1)

In [None]:
data['Cleaned_text'][12]

In [None]:
data = data.drop(['CreationDate',
                  'Title',
                  'Id',
                  'Score',
                  'ViewCount',
                  'FavoriteCount',
                  'AnswerCount',
                  'Text_lower',
                  'Text_BeautifulSoup',
                  'Text_regex',
                 'Text_tokenized',
                 'Text_stopwords',
                 'Text_lemmatized',
                 'Tags_list',
                  'Tags_count',
                  'number_of_tags'], 
                  axis=1)

In [None]:
# Export to CSV
data.to_csv("processed_data.csv")

### Wordcloud visualization

In [None]:
# Instantiate a new wordcloud.
wordcloud_precleaning = WordCloud(random_state = 8,
        normalize_plurals = False,
        width = 600, height= 300,
        max_words = 300,
        stopwords = [])

text_brut = ' '.join(data['Text'])
# Apply the wordcloud to the text.
wordcloud_precleaning.generate(text_brut)

In [None]:
# Instantiate a new wordcloud.
wordcloud_cleaned_text = WordCloud(random_state = 8,
        normalize_plurals = False,
        width = 600, height= 300,
        max_words = 300,
        stopwords = [])

text_cleaned = ' '.join(data['Cleaned_text'])
# Apply the wordcloud to the text.
wordcloud_cleaned_text.generate(text_cleaned)

In [None]:
# Show the two word clouds stacked on one figure: raw text on top, cleaned
# text below. The axes come straight from `plt.subplots`, so no stray empty
# figure is created and no second subplot grid is laid over the first one.
fig, axes = plt.subplots(2, 1, figsize=(20, 10))

for panel, cloud, panel_title in zip(
        axes,
        (wordcloud_precleaning, wordcloud_cleaned_text),
        ("Avant nettoyage", "Aprés nettoyage")):
    panel.imshow(cloud, interpolation='bilinear')
    panel.set_title(panel_title)
    panel.axis("off")

plt.show()

In [None]:
def tag_clean(text):
    """Remove every '<' from ``text`` and turn every '>' into a space.

    NOTE(review): this redefines the bracket-stripping ``tag_clean`` declared
    earlier in the notebook.
    """
    return text.replace('<', '').replace('>', ' ')

In [None]:
wordcloud_tag=WordCloud()
tags=' '.join(data['Tags'].apply(tag_clean))

wordcloud_tag.generate(tags)

In [None]:
plt.imshow(wordcloud_tag, interpolation='bilinear')
plt.axis("off")

##  Vectorization <a class="anchor" id="chapter4"></a>

In [None]:
# `Cleaned_text` already holds one space-separated string per question.
# Joining it again with ' ' would iterate over its characters and put a space
# between every letter, leaving only one-character "words" for the vectorizer.
data['Text_toVectorize'] = data['Cleaned_text']

In [None]:
data.head(2)

In [None]:
# Bag-of-words representation of the cleaned questions
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(data['Text_toVectorize'])

# Display the sparse matrix summary instead of calling `.todense()`: a dense
# documents x vocabulary array can exhaust memory and floods the cell output.
X