## Imports

In [1]:
import nltk

from nltk import pos_tag
from nltk import sent_tokenize
from nltk import WordNetLemmatizer
from nltk import wordpunct_tokenize

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw

from nltk.tokenize import word_tokenize


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('average_perceptron_tagger')

stopwords = sw.words('english')


[nltk_data] Downloading package stopwords to /home/belr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/belr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/belr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading average_perceptron_tagger: Package
[nltk_data]     'average_perceptron_tagger' not found in index


## MongoDB

In [2]:
from pymongo import MongoClient

# uri = "mongodb://%s:%s@%s" % ('root', 'belr', 'localhost')
uri = "mongodb://root:belr@localhost:27017"
client = MongoClient(uri)

db = client["DiscordSmartbotDB"]
collection = db['Quest_Rep']
collection.index_information()


{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'Id_1': {'v': 2, 'key': [('Id', 1)]},
 'PostTypeId_1': {'v': 2, 'key': [('PostTypeId', 1)]},
 'Topic_1': {'v': 2, 'key': [('Topic', 1)]},
 'Score_1': {'v': 2, 'key': [('Score', 1)]},
 'ParentId_1': {'v': 2, 'key': [('ParentId', 1)]},
 'AnswerCount_1': {'v': 2, 'key': [('AnswerCount', 1)]},
 'text_search': {'v': 2,
  'key': [('_fts', 'text'), ('_ftsx', 1)],
  'weights': SON([('Body', 1), ('Title', 2)]),
  'default_language': 'english',
  'language_override': 'language',
  'textIndexVersion': 3}}

In [3]:
topics= collection.distinct('Topic')
topics


['astronomy', 'datascience', 'earthscience', 'engineering', 'general', 'space']

In [4]:
nb_topics = len(topics)
nb_topics


6

In [5]:
from sklearn.preprocessing import LabelEncoder

# topics = ['astronomy', 'bicycles', 'earthscience', 'engineering',
#           'general', 'space', 'stellar']
label_encoder = LabelEncoder()
label_encoder.fit(topics)


## Datasets (corpus & target) generation

- Query MongoDB to create a list of questions (corpus)
- Select n questions for each topic => max_by_topic (if topic exists ...)
- Select all available questions in the topic (if not)


In [6]:
corpus = []
target = []
topic_max_questions = 5000

for topic in topics:

    query = {"Topic": topic, "PostTypeId": "1"}

    if collection.count_documents(query) >= topic_max_questions:

        questions = collection.find(query).sort([('Score', -1)]).limit(topic_max_questions)

        corpus.extend([question.get("Title") for question in questions])
        target.extend([topic] * topic_max_questions)

    else:
        length = collection.count_documents(query)

        questions = collection.find(query).sort([('Score', -1)]).limit(length)

        corpus.extend([question.get("Title") for question in questions])
        target.extend([topic] * length)


In [7]:
corpus = np.array(corpus)
target = np.array(target)


In [8]:
# Topics size
for topic in topics:
    query = {"Topic": topic, "PostTypeId": "1"}
    topic_size = collection.count_documents(query)
    print(topic, "\n\t\tsize =", topic_size, "\n")


astronomy 
		size = 9995 

datascience 
		size = 26279 

earthscience 
		size = 5393 

engineering 
		size = 10324 

general 
		size = 2619 

space 
		size = 14295 



In [9]:
# encode targets (str) to numerical (int)
y = label_encoder.transform(target)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(corpus)
X.shape


(27619, 17721)

## Test some models

In [11]:
from sklearn.model_selection import ShuffleSplit

def run_pipes(pipes, splits, corpus, target, test_size=0.2, seed=42):  
    res = defaultdict(list)
    spliter = ShuffleSplit(n_splits=splits, test_size=test_size, random_state=seed)

    for idx_train, idx_test in spliter.split(corpus, target):

        for pipe in pipes:

            # Name of the model
            name = "-".join([x[0] for x in pipe.steps])
            
            # Split datasets
            X_train = corpus[idx_train]
            X_test = corpus[idx_test]
            y_train = target[idx_train]
            y_test = target[idx_test]
            
            # Train
            start = time()
            pipe.fit(X_train, y_train)
            fit_time = time() - start
            
            # Test & save results
            y = pipe.predict(X_test)
            res[name].append([
                fit_time,
                f1_score(y_test, y, average='micro'),
                f1_score(y_test, y, average='macro'),
                f1_score(y_test, y, average='weighted'),           
            ])
    return res


In [12]:
import pandas as pd

def print_table(res):
    # Compute mean & std
    final = {}
    for model in res:
        arr = np.array(res[model])
        final[model] = {
            "time (s)" : arr[:, 0].mean().round(2),
            "f1_av_micro": [arr[:,1].mean().round(3), arr[:,1].std().round(3)],
            "f1_av_macro": [arr[:,2].mean().round(3), arr[:,2].std().round(3)],
            "f1_av_weighted": [arr[:,3].mean().round(3), arr[:,3].std().round(3)],
        }

    df = pd.DataFrame.from_dict(final, orient="index").round(3)
    return df


In [13]:
from time import time
from sklearn.svm import LinearSVC
from collections import defaultdict
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import (SGDClassifier, LogisticRegression,
                                  LogisticRegressionCV)

# penalty => ‘l2’, ‘l1’, ‘elasticnet’
# loss => ‘hinge’, ‘log’, ‘modified_huber’, ‘squared_hinge’, ‘perceptron’
# regression loss => ‘squared_loss’, ‘huber’, ‘epsilon_insensitive’, or ‘squared_epsilon_insensitive’
pipe0 = Pipeline([
    ("ngram_stop", CountVectorizer(
        stop_words=stopwords, ngram_range=(1,2), min_df=2
    )),
    ('tfidf', TfidfTransformer()),
    ('sgd-weight_bal', SGDClassifier(
        max_iter=2000, class_weight='balanced',
        penalty='elasticnet', loss='modified_huber'
    )),
])

# penalty{‘l1’, ‘l2’} default='l2'
# loss{‘hinge’, ‘squared_hinge’} default='square_hinge'
# MUST => penalty='l1', loss = 'squared_hinge', dual=False
pipe1 = Pipeline([
    ("ngram_stop", CountVectorizer(
        stop_words=stopwords, ngram_range=(1,2), min_df=2
    )),
    ('tfidf', TfidfTransformer()),
    ('lin_svm-weight_bal', LinearSVC(
        class_weight='balanced', penalty='l1', loss='squared_hinge', dual=False
    )),
])

# penalty{‘l1’, ‘l2’, ‘elasticnet’, ‘none’}, dualbool, default=False
# solver{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’
pipe2 = Pipeline([
    ("ngram_stop", CountVectorizer(
        stop_words=stopwords, ngram_range=(1,2), min_df=2
    )),
    ('tfidf', TfidfTransformer()),
    ('lgr-weight_bal', LogisticRegression(
        max_iter=2000, penalty='l1',
        solver='liblinear', class_weight='balanced'
    )),
])

pipe3 = Pipeline([
    ("ngram_stop", CountVectorizer(
        stop_words=stopwords, ngram_range=(1,2), min_df=2
    )),
    ('tfidf', TfidfTransformer()),
    ('dtc-weight_bal', DecisionTreeClassifier(class_weight='balanced')),
])

pipe4 = Pipeline([
    ("ngram_stop", CountVectorizer(
        stop_words=stopwords, ngram_range=(1,2), min_df=2
    )),
    ('tfidf', TfidfTransformer()),
    ('compl_nb-weight_bal', ComplementNB()),
])

res = run_pipes(
    [pipe0, pipe1, pipe2, pipe3, pipe4], splits=10, corpus=corpus, target=y
)
print_table(res)




Unnamed: 0,time (s),f1_av_micro,f1_av_macro,f1_av_weighted
ngram_stop-tfidf-sgd-weight_bal,2.19,"[0.857, 0.005]","[0.863, 0.005]","[0.856, 0.005]"
ngram_stop-tfidf-lin_svm-weight_bal,4.05,"[0.851, 0.006]","[0.856, 0.006]","[0.85, 0.006]"
ngram_stop-tfidf-lgr-weight_bal,345.15,"[0.816, 0.004]","[0.822, 0.004]","[0.816, 0.004]"
ngram_stop-tfidf-dtc-weight_bal,20.85,"[0.719, 0.006]","[0.723, 0.006]","[0.716, 0.006]"
ngram_stop-tfidf-compl_nb-weight_bal,1.19,"[0.848, 0.003]","[0.853, 0.004]","[0.848, 0.003]"


In [14]:
import pickle

filename = "topic_classifier.pickle"
# Save
pickle.dump(pipe0, open(filename, 'wb'))


In [None]:
filename = "topic_classifier.pickle"
model = pickle.load(open(filename, 'rb'))


In [None]:
y_pred=model.predict(corpus)
score = model.score(corpus, y)

In [None]:
pred = model.predict(["What is your name ?"])
print(label_encoder.inverse_transform(pred))

In [None]:
liste_topic = list(label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6]))
liste_topic

In [None]:
pred_proba = model.predict_proba(["usb C"])

liste_proba = list(pred_proba[0])
liste_proba

In [None]:
liste_topic = list(label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6]))
pred_proba = model.predict_proba(["usb C"])
liste_proba = list(pred_proba[0])
rg = len(liste_proba)
for i in range(7):
    print(f'topic {i+1} :',liste_topic[np.argmax(liste_proba)])
    idx = np.argmax(liste_proba)
    print(idx)
    liste_proba.pop(idx)
    liste_topic.pop(idx)
    

In [None]:
dict_proba = {liste_topic[i]: liste_proba[i] for i in range(len(liste_proba))}

In [None]:
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt

cm = metrics.confusion_matrix(y, y_pred)

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.yticks(np.arange(len(topics))+0.5, tuple(label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6])), rotation=0)
plt.ylabel('Actual label');
plt.xticks(np.arange(len(topics))+0.5, tuple(label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6])), rotation=60)
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0} %'.format(round(score,3)*100)
plt.title(all_sample_title, size = 15);
print(label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6]))

## Grid search

In [None]:
# from pipelinehelper import PipelineHelper
#
#pipe5 = Pipeline([
#    ('rbg2grey', RGB2GrayTransformer()),
#    ('hog', HogTransformer(pixels_per_cell=(14, 14), cells_per_block=(2, 2), 
#                           orientations=9, block_norm='L2-Hys')
#    ),
#    ('scaler', PipelineHelper([
#        ('std', StandardScaler()),
#        ('max', MaxAbsScaler()),
#    ])),
#    ('classifier', PipelineHelper([
#        ('sgd', SGDClassifier(random_state=42, max_iter=1000, tol=1e-3)),
#        ('svm', SVC()),
#        ('gnb', GaussianNB()),
#        ('cnb', ComplementNB()),
#    ])),
#])

pipe5 = Pipeline([
    ("vect", CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sgd', SGDClassifier()),
     ])

In [None]:
#params = {
#    'hog__orientations': [8, 9],
#    'hog__cells_per_block': [(2, 2), (3, 3), (4, 4)],
#    'hog__pixels_per_cell': [(8, 8), (10, 10), (12, 12)],
#    'scaler__selected_model': pipe.named_steps['scaler'].generate({
#        'std__with_mean': [True, False],
#        'std__with_std': [True, False],
#        'max__copy': [True],
#    }),
#    'classifier__selected_model': pipe.named_steps['classifier'].generate({
#        'sgd__loss': ['hinge', 'log', 'modified_huber', 'perceptron', 'squared_loss', 'huber'],
#        'svm__C': [0.1, 1.0],
#        'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
#    })
#}

params ={
    'vect__stop_words': [None, 'english'], ## stopvords
    'vect__ngram_range': [(1,2), (1,3)], ## ngrams
    'vect__min_df': [2],
    #'tfidf_use_idf': [True, False],
    'sgd__alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0], ## learning rate
    'sgd__max_iter':[1000], ## nb of epochs
    'sgd__average': ['micro', 'macro', 'weighted'],
    'sgd__class_weight': ['micro', 'macro', 'balanced'], ## macro
    'sgd__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 
                  'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'], ## model choice SVM, logistic reg...
    'sgd__penalty':['l2', 'l1', 'elasticnet']
}

## affichage des paramètre 

SGDClassifier().get_params().keys()

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

params ={
    'vect__stop_words': ['english'], ## stopvords
    'vect__ngram_range': [(1,2)], ## ngrams
    'vect__min_df': [2],
    #'tfidf_use_idf': [True, False],
    'sgd__alpha': [ 1e-4, 1e-3], ## learning rate
    'sgd__max_iter':[1000], ## nb of epochs
    'sgd__average': ['micro', 'macro', 'weighted'],
    'sgd__class_weight': ['micro', 'macro', 'balanced'], ## macro
    'sgd__loss': ['hinge', 'modified_huber','squared_epsilon_insensitive'], ## model choice SVM, logistic reg...
    'sgd__penalty':['elasticnet']
}

#f1_scorer = make_scorer(f1_score , average='macro')# Ne marche pas, utiliser onevsrestClassif ou onevsone
gs = GridSearchCV(pipe5, 
                  params, 
                  cv=3,
                  n_jobs=-1,
                  scoring='accuracy',
                  verbose=1,
                  return_train_score=True)
 
gs_res = gs.fit(corpus, y)

In [None]:
print('meilleur score obtenu :', gs_res.best_score_)
print('meilleur estimateur :', gs_res.best_estimator_)
print('meilleur paramètres :', gs_res.best_params_)