In [1]:
import pandas as pd
pd.options.display.max_columns=200
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.pipeline import make_pipeline
import bs4 as bs
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from wordcloud import WordCloud

In [18]:
data = pd.read_csv('data/QueryResults.csv')

In [19]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount
0,how do I calculate a rolling idxmax,<p>consider the <code>pd.Series</code> <code>s...,<python><pandas><numpy><dataframe><series>,40101130,9,7584,0.0,6
1,Better techniques for trimming leading zeros i...,"<p>I've been using <a href=""https://stackoverf...",<sql><sql-server><sql-server-2005><tsql><string>,662383,197,388401,0.0,19
2,"""No X11 DISPLAY variable"" - what does it mean?",<p>I am trying to install a Java application o...,<java><linux><variables><x11><headless>,662421,122,622820,0.0,12
3,Object-Oriented Callbacks for C++?,<p>Is there some library that allows me to eas...,<c++><oop><callback><pointer-to-member><eiffel>,3520133,9,10228,0.0,6
4,Why doesn't .NET find the OpenSSL.NET dll?,"<p><strong>EDIT</strong> (the whole question, ...",<c#><.net><dll><dllimport><dllnotfoundexception>,1396164,9,18554,0.0,12


In [20]:
data.set_index('Id', inplace=True)

In [21]:
data['Text'] = data['Title'] + ' ' + data['Body']

In [22]:
data.drop(['Score','ViewCount','FavoriteCount','AnswerCount','Title','Body'], axis=1, inplace=True)

In [24]:
#delete balises html with beautiful soup
def clean_html(text):
    soup = bs.BeautifulSoup(text, "html.parser")
    return soup.get_text()

data['Text'] = data['Text'].apply(clean_html)

In [25]:
data.head()

Unnamed: 0_level_0,Tags,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
40101130,<python><pandas><numpy><dataframe><series>,how do I calculate a rolling idxmax consider t...
662383,<sql><sql-server><sql-server-2005><tsql><string>,Better techniques for trimming leading zeros i...
662421,<java><linux><variables><x11><headless>,"""No X11 DISPLAY variable"" - what does it mean?..."
3520133,<c++><oop><callback><pointer-to-member><eiffel>,Object-Oriented Callbacks for C++? Is there so...
1396164,<c#><.net><dll><dllimport><dllnotfoundexception>,Why doesn't .NET find the OpenSSL.NET dll? EDI...


In [26]:
def tokenizer_fct(sentence) :
    # print(sentence)
    sentence_clean = sentence.replace('-', ' ').replace('+', ' ').replace('/', ' ').replace('#', ' ')
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

# Stop words
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', '.', ':', '?', '(', ')', "'", '"', '!', ';', '``', "''", '...', '’', '“', '”']

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    filtered_w2 = [w for w in filtered_w if len(w) > 2]
    return filtered_w2

# lower case et alpha
def lower_start_fct(list_words) :
    lw = [w.lower() for w in list_words if (not w.startswith("@")) 
                                       and (not w.startswith("#"))
                                       and (not w.startswith("http"))]
    return lw

# Lemmatizer (base d'un mot)
def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w

# Fonction de préparation du texte pour le bag of words avec lemmatization
def transform_bow_lem_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lem_w)
    return transf_desc_text

In [27]:
data['Text'] = data['Text'].apply(transform_bow_lem_fct)

In [28]:
data.head()

Unnamed: 0_level_0,Tags,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
40101130,<python><pandas><numpy><dataframe><series>,calculate rolling idxmax consider pd.series im...
662383,<sql><sql-server><sql-server-2005><tsql><string>,better technique trimming leading zero sql ser...
662421,<java><linux><variables><x11><headless>,x11 display variable mean trying install java ...
3520133,<c++><oop><callback><pointer-to-member><eiffel>,object oriented callback library allows easily...
1396164,<c#><.net><dll><dllimport><dllnotfoundexception>,why n't .net find openssl.net dll edit whole q...


In [29]:
import re

# Suppression des chevrons et extraction des tags
data['Tags'] = data['Tags'].apply(lambda x: re.findall(r'<([^>]+)>', x))

data.head()

Unnamed: 0_level_0,Tags,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
40101130,"[python, pandas, numpy, dataframe, series]",calculate rolling idxmax consider pd.series im...
662383,"[sql, sql-server, sql-server-2005, tsql, string]",better technique trimming leading zero sql ser...
662421,"[java, linux, variables, x11, headless]",x11 display variable mean trying install java ...
3520133,"[c++, oop, callback, pointer-to-member, eiffel]",object oriented callback library allows easily...
1396164,"[c#, .net, dll, dllimport, dllnotfoundexception]",why n't .net find openssl.net dll edit whole q...


In [31]:
data['Tags']

Id
40101130           [python, pandas, numpy, dataframe, series]
662383       [sql, sql-server, sql-server-2005, tsql, string]
662421                [java, linux, variables, x11, headless]
3520133       [c++, oop, callback, pointer-to-member, eiffel]
1396164      [c#, .net, dll, dllimport, dllnotfoundexception]
                                  ...                        
3915827               [c#, .net, clr, function-pointers, cil]
71419635               [ios, swift, webrtc, ios15, videocall]
53180817    [graphql, apollo, react-apollo, apollo-client,...
53180854    [node.js, websocket, socket.io, apollo-server,...
Name: Tags, Length: 50000, dtype: object

In [33]:
from collections import Counter

# Compter les tags
tags_counter = Counter()
for tags in data['Tags']:
    tags_counter.update(tags)

# Nombre de tags différents
print("Nombre de tags différents :", len(tags_counter))

# Obtenir les 100 tags les plus fréquents
top_100_tags = tags_counter.most_common(100)

Nombre de tags différents : 18315


In [35]:
top_100_tags_set = set(tag for tag, _ in top_100_tags)

# Fonction pour ne garder que les tags du top 100
def keep_top_100_tags(tags):
    return [tag for tag in tags if tag in top_100_tags_set]

# Appliquer la fonction 
data['Tags'] = data['Tags'].apply(keep_top_100_tags)

# Filtrer les données pour ne garder que les questions ayant au moins un tag dans le top 100
data = data[data['Tags'].apply(lambda x: len(x) > 0)].reset_index(drop=True)

# Afficher le nombre de questions restantes
print("Nombre de questions restantes :", len(data))

Nombre de questions restantes : 46749


In [36]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
tags = mlb.fit_transform(data['Tags'])

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Appliquer les meilleurs paramètres pour TfidfVectorizer
tfidf = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')

#Transformer le texte en vecteurs
text = tfidf.fit_transform(data['Text'])

In [46]:
text.shape

(46749, 269477)

In [47]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Appliquer la régression logistique avec OneVsRestClassifier
clf = OneVsRestClassifier(LogisticRegression(C=10, penalty='l1', solver='liblinear'))
clf.fit(text, tags)

In [50]:
#Sauvegarder le modèle
import pickle

pickle.dump(clf, open('model/model.pkl','wb'))