## Étape 5 : Okapi BM25

In [117]:
import sklearn,scipy,rank_bm25
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split as train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import export_graphviz
import pydotplus

from scipy import sparse
from rank_bm25 import BM25Okapi,BM25L,BM25Plus

import numpy as np

# Build a BM25 matrix laid out as (documents x features) so that it has the
# same shape as the other vectorizations.
def bm25_fit_transform(features, corpus, k1=1.8, b=0.75):  # corpus: tokenized documents
    """Score every feature against every document with Okapi BM25.

    Args:
        features: sized sequence of terms; each one is used as a
            single-token query.
        corpus: tokenized corpus (a list of token lists) indexed by
            ``BM25Okapi``.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.

    Returns:
        A 2-D NumPy array with one row per document and one column per
        feature. With no features the result has shape ``(len(corpus), 0)``.
    """
    # Nothing to score: return an empty matrix that still has one row per
    # document instead of failing on the first (missing) score vector.
    if len(features) == 0:
        return np.empty((len(corpus), 0))

    bm25 = BM25Okapi(corpus, k1=k1, b=b)
    # get_scores() yields one score per document for the given query, i.e.
    # one column of the final matrix per feature.
    per_feature = [bm25.get_scores([term]) for term in features]
    # Stacking along axis 1 puts documents on rows and features on columns.
    return np.stack(per_feature, axis=1)


class BM25(object):
    """BM25 scorer built on top of scikit-learn's ``TfidfVectorizer``.

    The wrapped vectorizer supplies the vocabulary, the raw term counts and
    the IDF weights. ``b`` weights the document-length normalisation and
    ``k1`` appears in both the numerator and the denominator of the score
    (see ``transform``).
    """

    def __init__(self, b=0.75, k1=1.6):
        # norm=None / smooth_idf=False: keep the IDF unnormalised and
        # unsmoothed, so transform() can turn it back into log(n / df).
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the vocabulary and IDF to documents X, and record their average length.

        Sets ``self.avdl``. Nothing is returned.
        """
        self.vectorizer.fit(X)
        # Going through super() skips TfidfVectorizer.transform and calls the
        # parent class's transform instead (raw counts, see the
        # "apply CountVectorizer" step in transform()).
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Row sums are the per-document counted-term totals; their mean is
        # the average document length used for length normalisation.
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """Calculate BM25 between query q and documents X.

        Only the vocabulary terms present in ``q`` contribute. Returns a flat
        array with one summed score per row of X. ``fit`` must have been
        called before, because ``self.avdl`` is read here.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer (parent-class transform): raw term counts
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        # per-document length, flattened to 1-D
        len_X = X.sum(1).A1
        # the query is vectorized as a one-document batch; unpack its only row
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        # q.indices (column ids of the query's terms) is read below
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        # tf + k1 * (1 - b + b * |d| / avdl); the length term is a column
        # vector broadcast over the selected query-term columns
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        # idf * tf * (k1 + 1)
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        # sum the per-term contributions of each document and flatten
        return (numer / denom).sum(1).A1

### Ouvrir les tweets et les tweets de test

In [9]:
import os

def ouvrir(dossier):
    """Collect every line of every file found under ``dossier``.

    The directory is walked recursively; each file is read as UTF-8 and each
    line is stripped of surrounding whitespace before being stored.

    Returns:
        A flat list of the stripped lines, in the order they were read.
    """
    lines = []
    for root, _subdirs, filenames in os.walk(dossier):
        for filename in filenames:
            full_path = os.path.join(root, filename)
            with open(full_path, 'r', encoding="utf-8") as handle:
                # Iterating the handle yields the same lines as repeated
                # readline() calls until the empty string.
                lines.extend(raw.strip() for raw in handle)
    return lines

# Read the training tweets: one folder per class.
mixed, negatif, objectif, positif = (
    ouvrir(nom_dossier)
    for nom_dossier in ("mixed", "negative", "objective", "positive")
)

# Sanity check: number of tweets loaded for each class.
for classe in (mixed, negatif, objectif, positif):
    print(len(classe))

import json

# The gold test set is a JSON object whose "objective", "positive" and
# "negative" entries are lists of records; only each record's 'content'
# field is kept.
# Decode explicitly as UTF-8 (like ouvrir does) instead of relying on the
# platform default encoding, which can corrupt accented characters.
with open("task1-testGold.csv.json", 'r', encoding="utf-8") as load_f:
    load_dict = json.load(load_f)

objectif_test = [x['content'] for x in load_dict["objective"]]
pos_test = [x['content'] for x in load_dict["positive"]]
neg_test = [x['content'] for x in load_dict["negative"]]

# Sanity check: number of test tweets per class.
print(len(objectif_test))
print(len(pos_test))
print(len(neg_test))

501
1268
1643
494
411
123
318


### Prétraitement / BaseLine

In [12]:
import nltk
from nltk.tokenize import word_tokenize

def tokeniser(l):
    """Tokenize each text of ``l`` with NLTK's ``word_tokenize`` (French).

    Returns:
        A list holding, for every input text, the result of
        ``word_tokenize(text, language="french")``.
    """
    return [word_tokenize(texte, language="french") for texte in l]

# Tokenize the training classes...
mixed_tokeniser = tokeniser(mixed)
negatif_tokeniser = tokeniser(negatif)
objectif_tokeniser = tokeniser(objectif)
positif_tokeniser = tokeniser(positif)
# ...and the gold test classes.
pos_test_tokeniser = tokeniser(pos_test)
neg_test_tokeniser = tokeniser(neg_test)
obj_test_tokeniser = tokeniser(objectif_test)

# Sanity check: tokenization must keep one entry per tweet.
for jeu_tokenise in (
    mixed_tokeniser,
    negatif_tokeniser,
    objectif_tokeniser,
    positif_tokeniser,
    pos_test_tokeniser,
    neg_test_tokeniser,
    obj_test_tokeniser,
):
    print(len(jeu_tokenise))

501
1268
1643
494
123
318
411


### Extraire des features pour l'entraînement

In [70]:
# Training tweets come first and test tweets last, so the matrix built from
# this corpus can later be split into train/test by row position.
Corpus = positif+negatif+objectif+pos_test+neg_test+objectif_test
Corpus_tokens = positif_tokeniser+negatif_tokeniser+objectif_tokeniser+pos_test_tokeniser+neg_test_tokeniser+obj_test_tokeniser

# NOTE(review): stop_words_fr is not defined in this section; presumably it
# comes from an earlier cell. Confirm before running this cell on its own.
TFIDF_Vect = TfidfVectorizer(encoding = "utf-8",lowercase = True,stop_words=stop_words_fr,norm=None, smooth_idf=False)
X1 = TFIDF_Vect.fit_transform(Corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back to the old name on
# older releases. The result is kept as a plain list in both cases.
if hasattr(TFIDF_Vect, "get_feature_names_out"):
    features = TFIDF_Vect.get_feature_names_out().tolist()
else:
    features = TFIDF_Vect.get_feature_names()
print(len(features))

13637


### Vectorisation avec Okapi BM25

In [74]:
# Fit the BM25 helper on the whole corpus (training tweets followed by test tweets).
bm25 = BM25()
bm25.fit(Corpus)
# Spot check on a single vocabulary entry: show the term, then score it
# against every document of the corpus.
print(features[3910])
l = bm25.transform(features[3910],Corpus)
# Number of scores returned for that term.
print(len(l))

contrôle
4257


In [76]:
# One BM25 score vector per vocabulary term, each scored against the whole corpus.
columns = [bm25.transform(terme, Corpus) for terme in features]
print(len(columns))
# Turn the list of score vectors into a single NumPy array.
columns = np.asarray(columns)
print(columns.shape)

13637


In [80]:
# columns holds one row per term; the classifiers need one row per document.
# Transpose inside NumPy rather than copying tens of millions of cells one by
# one in Python, and make the result contiguous so it is laid out like a
# freshly built array.
X1 = np.ascontiguousarray(np.asarray(columns).T)
print(X1.shape)

(4257, 13637)


In [85]:
# Corpus was assembled with the training tweets first, so the first n_train
# rows of X1 are the training set and the remaining rows are the test set.
n_train = len(positif) + len(negatif) + len(objectif)
X1_train, X1_test = X1[:n_train], X1[n_train:]

# Labels follow the same class order as the corpus: positif, negatif, objectif.
Y1_train = np.asarray(
    ["positif"] * len(positif)
    + ["negatif"] * len(negatif)
    + ["objectif"] * len(objectif)
)
Y1_test = np.asarray(
    ["positif"] * len(pos_test)
    + ["negatif"] * len(neg_test)
    + ["objectif"] * len(objectif_test)
)
print(X1_train.shape, Y1_train.shape)
print(X1_test.shape, Y1_test.shape)

(3405, 13637) (3405,)
(852, 13637) (852,)


### Test Final

##### J48

In [105]:
# Train a decision tree on the BM25 features (fit() returns the estimator itself).
DT = DecisionTreeClassifier().fit(X1_train, Y1_train)

# Score of the fitted tree on the held-out test tweets.
Okapi_BM25_DecisionTree = DT.score(X1_test, Y1_test)

##### Cross-Validation avec J48

In [88]:
# 10-fold cross-validated accuracy of the decision tree over all rows of X1.
# NOTE(review): Y is not defined in this section. It presumably holds one
# label per row of X1, in the same order as Corpus; confirm, otherwise use
# the concatenation of Y1_train and Y1_test.
Scores = cross_val_score(DT,X1,Y,cv = 10, scoring = 'accuracy')
# Mean accuracy over the folds.
Okapi_BM25_DecisionTree_CV = Scores.mean()

##### Visualisation en arbre en PDF

In [119]:
# export_graphviz is imported directly at the top of the notebook, whereas no
# `tree` module is imported in this section, so call the function by its own
# name.
# class_names must be given in the classifier's own class order
# (DT.classes_, i.e. sorted labels). A hand-written order that differs from
# it mislabels the nodes of the rendered tree.
dot_data = export_graphviz(DT, out_file=None,
                           feature_names=features,
                           class_names=[str(c) for c in DT.classes_],
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Render the tree to a PDF file in the working directory.
graph.write_pdf("Okapi BM 25 J48 Arbre.pdf")

True

##### Random Forest

In [115]:
# Train a random forest on the BM25 features (fit() returns the estimator itself).
RF = RandomForestClassifier().fit(X1_train, Y1_train)

# Score of the fitted forest on the held-out test tweets.
Okapi_BM25_RandomForest = RF.score(X1_test, Y1_test)



##### Cross-Validation avec Random Forest 

In [97]:
# 10-fold cross-validated accuracy of the random forest over all rows of X1.
# NOTE(review): Y is not defined in this section. It presumably holds one
# label per row of X1, in the same order as Corpus; confirm, otherwise use
# the concatenation of Y1_train and Y1_test.
Scores = cross_val_score(RF,X1,Y,cv = 10, scoring = 'accuracy')
# Mean accuracy over the folds.
Okapi_BM25_RandomForest_CV = Scores.mean()

##### Montrer le résultat final

In [116]:
import pandas as pd

print("Test final : ")
# One column per classifier; first row is the held-out test score, second row
# the cross-validation mean. The bare expression is what the notebook displays.
resultats = {
    "J48": [Okapi_BM25_DecisionTree, Okapi_BM25_DecisionTree_CV],
    "Random Forest": [Okapi_BM25_RandomForest, Okapi_BM25_RandomForest_CV],
}
pd.DataFrame(resultats, index=["Test Final", "Cross Validation"])

Test final : 


Unnamed: 0,J48,Random Forest
Test Final,0.66784,0.710094
Cross Validation,0.68383,0.722322
