# Importing Librairies

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import nltk

# Importing data

In [2]:
data=pd.read_csv('QueryResults .csv')

In [3]:
len(data) # the number of extracted questions

27182

In [4]:
data.head() # preview data

Unnamed: 0,Id,Body,Tags
0,4,<p>I want to use a track-bar to change a form'...,<c#><floating-point><type-conversion><double><...
1,6,<p>I have an absolutely positioned <code>div</...,<html><css><css3><internet-explorer-7>
2,7,<p>An explicit cast to double like this isn't ...,
3,9,<p>Given a <code>DateTime</code> representing ...,<c#><.net><datetime>
4,11,<p>Given a specific <code>DateTime</code> valu...,<c#><datetime><time><datediff><relative-time-s...


# Text cleaning

In [5]:
from collections import defaultdict

In [6]:
cor_avant=defaultdict(list)
for i in data.index:
    cor_avant[data.iloc[i,0]]=nltk.word_tokenize(data.iloc[i,1])
frequency={}
for key,val in cor_avant.items():
    frequency[key]=nltk.FreqDist(val)
counter_av=nltk.Counter()
for key,val in cor_avant.items():
    counter_av+=frequency[key]
nb_mot_initial=len(counter_av)

In [7]:
tokenizer=nltk.RegexpTokenizer(r'\w+') 

In [8]:
corpora=defaultdict(list) # world list used for each id

In [9]:
for i in data.index:
    corpora[data.iloc[i,0]]=tokenizer.tokenize(data.iloc[i,1].lower())

In [10]:
# illustration in the first question of our data set.
texte=data.iloc[0,1]
texte

"<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code>.</p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n"

In [11]:
# the first texte after tokenization
tock_texte=corpora[4]
tock_texte=(',').join(tock_texte)
tock_texte

'p,i,want,to,use,a,track,bar,to,change,a,form,s,opacity,p,p,this,is,my,code,p,pre,code,decimal,trans,trackbar1,value,5000,this,opacity,trans,code,pre,p,when,i,build,the,application,it,gives,the,following,error,p,blockquote,p,cannot,implicitly,convert,type,code,decimal,code,to,code,double,code,p,blockquote,p,i,tried,using,code,trans,code,and,code,double,code,but,then,the,control,doesn,t,work,this,code,worked,fine,in,a,past,vb,net,project,p'

In [12]:
from nltk.stem.snowball import EnglishStemmer
stemmer=EnglishStemmer()

In [13]:
#Stemming worlds.
for key,val in corpora.items():
    corpora[key]=[stemmer.stem(w) for w in val]

In [14]:
# the first texte after the Stemming operation.
stemm_texte=corpora[4]
stemm_texte=(',').join(stemm_texte)
stemm_texte

'p,i,want,to,use,a,track,bar,to,chang,a,form,s,opac,p,p,this,is,my,code,p,pre,code,decim,tran,trackbar1,valu,5000,this,opac,tran,code,pre,p,when,i,build,the,applic,it,give,the,follow,error,p,blockquot,p,cannot,implicit,convert,type,code,decim,code,to,code,doubl,code,p,blockquot,p,i,tri,use,code,tran,code,and,code,doubl,code,but,then,the,control,doesn,t,work,this,code,work,fine,in,a,past,vb,net,project,p'

In [15]:
#bag of world
freq=dict()
for key,val in corpora.items():
    freq[key]=nltk.FreqDist(val)

In [16]:
#stop of words 
counter=nltk.Counter()
for k,v in corpora.items():
    counter+=freq[k]

In [17]:
#We will classify the 100 most used words as stop of word.
most_freq=[a[0] for a in counter.most_common(100)]

In [18]:
#We will create a set which cumulate the 100 most used words and the stop words list belong to the library NLTK.
sw=set() 
sw.update(most_freq)
sw.update(nltk.corpus.stopwords.words('english'))

In [19]:
#delete stop of words from our corpus.
for k,v in corpora.items():
    corpora[k]=[w for w in v if w not in list(sw) ]

In [20]:
# The first texte after deleting stop of words.
stop_texte=corpora[4]
stop_texte=(',').join(stop_texte)
stop_texte

'track,bar,chang,form,opac,decim,tran,trackbar1,valu,5000,opac,tran,build,give,follow,error,blockquot,cannot,implicit,convert,type,decim,doubl,blockquot,tri,tran,doubl,control,fine,past,vb,project'

In [21]:
#Recomputing the statistics of bag of words after deleting the stop of words.
for k,v in corpora.items():
    freq[k]=nltk.FreqDist(v)

In [22]:
#illustration of the bag of words for the choiced texte.
bag_texte=freq[4]
bag_texte

FreqDist({'tran': 3, 'opac': 2, 'decim': 2, 'blockquot': 2, 'doubl': 2, 'track': 1, 'bar': 1, 'chang': 1, 'form': 1, 'trackbar1': 1, ...})

In [23]:
#Comparaison to the number of terms used before and after cleaning.
count_ap=nltk.Counter()
for k,v in freq.items():
    count_ap+=freq[k]
nb_mot_apres=len(count_ap)

In [24]:
comparatives=pd.DataFrame({'Avant nettoyage ':[nb_mot_initial],'Apres nettoyage ':[nb_mot_apres]},index=['Vocabulaires'])
comparatives

Unnamed: 0,Avant nettoyage,Apres nettoyage
Vocabulaires,116108,53966


# Data based on bags of words

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
cor_val=list(corpora.values())
cor_user=[]
for elt in cor_val:
    cor_user.append(" ".join(elt))

In [27]:
# Function allow to compute the frequency of used words per document.
sk=CountVectorizer(max_df=0.95,min_df=2,lowercase=False,stop_words='english').fit(cor_user)

In [28]:
cor=sk.transform(cor_user)

In [29]:
cor=cor.toarray()

# Data based on TF-IDF

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
#Function allow to compute the tf-idf for each used word per document.
Tfidf=TfidfVectorizer(max_df=0.95,min_df=2,lowercase=False,stop_words='english').fit(cor_user)

In [32]:
cor_tfidf=Tfidf.transform(cor_user)

# Target data

In [33]:
#define the list index where Tags values  are missing.
ind_missing=np.where(data['Tags'].isna())

In [37]:
#Delete all rows from defined dataframe where rows Tags are missing.
cor=cor[~ind_missing[0],:]


In [39]:
#Delete all rows from defined dataframe where rows Tags are missing.
cor_tfidf=cor_tfidf[~ind_missing[0],:]

In [40]:
 tags=data['Tags'].dropna()

In [41]:
cor_tag=[]
for elt in tags:
    texte=str()
    for ind in elt:
        texte+=ind
        if ind=='>':
            texte+=" "
    cor_tag.append(texte)    

In [42]:
mk=CountVectorizer(lowercase=False)

In [43]:
cor_target=mk.fit_transform(cor_tag)

In [44]:
cor_target=cor_target.toarray()

In [45]:
nb_q=len(cor_target)
app_mot=[]
for i in range(cor_target.shape[1]):
    app_mot.append(cor_target[:,i].sum()/nb_q)

In [46]:
index=np.argsort(app_mot)


In [47]:
selected_target=[100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,2425]
question=[]
for i in selected_target:
    a=len(np.where(cor_target[:,index[-i-1:-1]].any(axis=1))[0])
    question.append(a)

In [48]:
question=np.array(question)/cor_target.shape[0]
question

array([0.81576552, 0.89204442, 0.926998  , 0.94301839, 0.95357728,
       0.95940288, 0.96431822, 0.96868742, 0.97232842, 0.97542327,
       0.97687967, 0.97778991, 0.97924631, 0.98015656, 0.98106681,
       0.98798471])

In [49]:
nb_tags=100 # we choice the number tags which allow to maintain 80% of question .

In [50]:
cor_target.shape[1]# total number of tags 

2425

In [51]:
tar=cor_target[:,index[-101:-1]] # we save only the choiced target

In [52]:
tar=tar[np.where(tar.any(axis=1))[0],:] # we save only the question which have tagets.

In [53]:
tar.shape

(4481, 100)

In [54]:
col=[]
for a in index[-101:-1]:
    col.append(mk.get_feature_names()[a])

In [55]:
target=pd.DataFrame(tar,columns=col)

In [56]:
df_tfidf=cor_tfidf[np.where(tar.any(axis=1))[0],:]

In [57]:
df_tfidf=pd.DataFrame(df_tfidf.toarray(),columns=Tfidf.get_feature_names())

In [58]:
cor=cor[np.where(tar.any(axis=1))[0],:]

In [59]:
df=pd.DataFrame(cor,columns=sk.get_feature_names())

# Exporting datas

In [60]:
df.to_csv('bags_words.csv')
df_tfidf.to_csv('tfidf.csv')
target.to_csv('target.csv')

# Exporting objects

In [61]:
import pickle
import os

In [62]:
path=os.path.abspath(os.path.dirname('__file__'))
chemin=os.path.join(path,'utiles')

In [63]:
with open(chemin,'wb') as fichier:
    mon_pickler=pickle.Pickler(fichier)
    mon_pickler.dump(most_freq)
    mon_pickler.dump(sk)