# Importing Librairies

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import nltk
from sklearn.decomposition import LatentDirichletAllocation
from math import ceil
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

# Importing datas

In [2]:
data_bagwords=pd.read_csv('bags_words.csv')
data_targets=pd.read_csv('target.csv')
data_tfidf=pd.read_csv('tfidf.csv')
del(data_bagwords['Unnamed: 0'])
del(data_targets['Unnamed: 0'])
del(data_tfidf['Unnamed: 0'])

# Unsupervised Method

In [3]:
training=data_bagwords.values

In [5]:
#Choice the best hyperparametrs of the LDA method.
params={'n_components':[10,50,80,150,250]}
lda=LatentDirichletAllocation()
model=GridSearchCV(lda,params)
model.fit(training)



GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_components': [10, 50, 80, 150, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [6]:
#Best Model
best_lda_model=model.best_estimator_
#Best parameters
print("Best Model's Params : {}".format(model.best_params_))
#Log Likelihood Score
print("Best LOg likelihood score : {}".format(model.best_score_))
#Perplexity
print("Model perplexity : {}".format(best_lda_model.perplexity(training)))

Best Model's Params : {'n_components': 10}
Best LOg likelihood score : -544761.9040907667
Model perplexity : 2435.0213701508733


In [7]:
latent=best_lda_model.transform(training) # Topics distribution for data_bagwords.

In [8]:
# for each observation , we will choice the three moste representative topics.
tags=pd.DataFrame(columns=['topics']) # the dataframe of returned tags for each question.

In [9]:
#compute the representative topics for each sample.
tags['topics']=[latent[i,:].argsort()[-3:] for i in range(latent.shape[0])] 

In [10]:
#Frequency used word for each topic per each sample.
frequency_word=best_lda_model.components_

In [11]:
# topics illustration. 
def display_topics(component,features_names,no_top_words):
    """This function, allow to illustrate the topics by their most used words.
    
       Args:
       
       component(array): frequency of using of each words by each topics
       features_names(list): list of used words
       no_top_words(int): the number of most used words to consider for each topic.
       
       
    
    """
    for toop_idx,topc in enumerate(component):
        print ("Topic {} :".format(toop_idx))
        print (" ".join([features_names[i] for i in topc.argsort()[-no_top_words-1:-1] ]))
no_top_words=10
display_topics(frequency_word,data_bagwords.columns,no_top_words)

Topic 0 :
format wiki wikipedia blog number thread librari msdn microsoft en
Topic 1 :
request site session mail content header password email secur user
Topic 2 :
insert return end array queri index select list databas sql
Topic 3 :
wcf currentpag stackoverflow pattern os https regex python rubi book
Topic 4 :
user know chang thing control question think page report project
Topic 5 :
js jqueri javascript text id element tag div amp event
Topic 6 :
2gb testantlr blobout cc myapp getupperbound orig applet callback matrix
Topic 7 :
properti count null type void method valu static return public
Topic 8 :
acm directcast jdk lang stub javas union row sup sun
Topic 9 :
visual plugin eclips commit dim color subvers repositori merg branch


In [12]:
def find_tags(topics,no_words):
    """This function, allows to determinate the most used words for each topics.Theses words will returned by 
       the function as tags.
       
       Args:
       topics(list):the index of topics
       no_words(int):the number of most used words per topic.That the function should capture.
       
       Returns:
       
       words(str): the captured words or tags.
       """
    words=str()
    for i in topics:
        ind=frequency_word[i,:].argsort()[-no_words:]
        for j in ind:
             
             words+=data_bagwords.columns[j]
             words+=' '
           
    return words

In [13]:
no_words=2

In [14]:
tags['tag']=[find_tags(topics,no_words) for topics in tags['topics']]

# Unsupervised method performance

<font color=red> <font size=4> ** True tags**</font>

Hereunder , we will mesure the performance of the unsupervised method by the ratio between the true labels detected and the total number of the returned labels.

In [15]:
nb_tags=np.size(np.where(data_targets.values==1)[0])# the total number of true tags that should be predicted.

In [16]:
data_targets.columns[np.where(data_targets.iloc[1,:].values==1)[0]]

Index(['browser', 'user', 'html'], dtype='object')

In [17]:
score=0
for i in range(len(tags)):
    li=tags.iloc[i,1].split(' ')
    de=data_targets.columns[np.where(data_targets.iloc[i,:].values==1)[0]]
    
    for elt in li :
        if elt in de:
            score+=1

In [18]:
score=score/nb_tags*100

In [19]:
print('The performance of the unsupervised method on base to true returned tags equal to : {}%'.format(score))

The performance of the unsupervised method on base to true returned tags equal to : 2.5081788440567068%


<font color =red> <font size=4> ** Pertinent Tags **</font>

**Here,we will evaluate the performance of the unsupervised method by the following approach:  
    we consider each returned target as pertinent if he belong to the tags mentionned in the initial data.So we mesure
    the performance of the method by the number of pertinents tags in compare with the total returned tags.**

In [20]:
#We will compute hereunder the number of pertinent tags returned by the unsupervised method.
sc=0

for elt in tags['tag']:
    li=[]
    mots=elt.split(' ')
    for mt in mots:
        li.append(mt[:-1])
    for l in li:
        if l in data_targets.columns:
            sc+=1

In [21]:
nb=6*len(tags)
sc=(sc/nb_tags)*100 # score of the unsupervised  method.

In [22]:
print('The performance of the unsupervised method equal to:{}%'.format(sc))

The performance of the unsupervised method equal to:29.819459590451956%


# Supervised method

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

In [24]:
x=np.array(data_tfidf.values)
y=np.array(data_targets.values)

In [25]:
xtr,xts,ytr,yts=train_test_split(x,y,train_size=0.4)



In [26]:
estimator=MultiOutputClassifier(LogisticRegression())



In [27]:
estimator.fit(xtr,ytr)

MultiOutputClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
           n_jobs=1)

In [28]:
ypred=estimator.predict(xts)

In [29]:
y1=ypred.flatten()
y2=yts.flatten()

In [30]:
nb_total=np.size(np.where(y2==1)[0])


In [31]:
y_score=y1+y2

In [32]:
score=np.size(np.where(y_score==2)[0])

In [33]:
score=(score/nb_total)*100

In [34]:
print('The performance of the supervised method equal to:{} %'.format(score))

The performance of the supervised method equal to:6.734816596512328 %


# Export useful object

In [35]:
features=data_targets.columns

In [36]:
import os
path=os.path.abspath(os.path.dirname('__file__'))
chemin=os.path.join(path,'objets')

In [37]:
features=data_bagwords.columns
import pickle
with open (chemin,'wb') as fichier:
     mon_pickler=pickle.Pickler(fichier)
     mon_pickler.dump(best_lda_model)
     mon_pickler.dump(frequency_word)
     mon_pickler.dump(features)

In [38]:
texte="<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code>.</p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n"

In [39]:
import pickle
fi=os.path.join(path,'utiles')
with open(fi,'rb') as fichier :
    depickler=pickle.Unpickler(fichier)
    mt=depickler.load()
    tf=depickler.load()