## Etape 1 Prétraitement/Baseline

### Ouvrir les tweets

In [21]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize

import os

def ouvrir(dossier):
    données = []
    g = os.walk(dossier)
    for path,dir_list,file_list in g:  
        for file_name in file_list:  
            with open(os.path.join(path, file_name),'r',encoding="utf-8") as f:
                g1 = f.readline()
                while g1!="":
                    données.append(g1.strip())
                    g1 = f.readline()
    return données

mixed = ouvrir("mixed")
negatif = ouvrir("negative")
objectif = ouvrir("objective")
positif = ouvrir("positive")

print(len(mixed))
print(len(negatif))
print(len(objectif))
print(len(positif))

501
1268
1643
494


### Tokenisation

In [22]:
import nltk
from nltk.tokenize import word_tokenize

def tokeniser(l):
    données = []
    for x in l:
        données.append(word_tokenize(x,language="french"))
    return données

mixed_tokeniser = tokeniser(mixed)
negatif_tokeniser = tokeniser(negatif)
objectif_tokeniser = tokeniser(objectif)
positif_tokeniser = tokeniser(positif)

    
print(len(mixed_tokeniser))
print(len(negatif_tokeniser))
print(len(objectif_tokeniser))
print(len(positif_tokeniser))

501
1268
1643
494


### Stop Word fr

In [23]:
stop_words_fr = []
with open("../../../ZijianNLP/stop_words_fr.txt",'r',encoding="utf-8") as f:
    g = f.readline()
    while g!="":
        stop_words_fr.append(g.strip())
        g = f.readline()
        
print(stop_words_fr[:10])

['alors', 'au', 'aucuns', 'aussi', 'autre', 'avant', 'avec', 'avoir', 'bon', 'car']


## Etape 2 Cross Validation sur TRAIN

In [24]:
def vectorisation(l,stop_words_list):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(encoding = "utf-8",strip_accents='unicode',lowercase = True,stop_words=stop_words_list)
    return vectorizer.fit_transform(l).toarray()

#### Préparer les données pour la vectorisation

In [25]:
negatif = [" ".join(x) for x in negatif_tokeniser]
positif = [" ".join(x) for x in positif_tokeniser]
objectif = [" ".join(x) for x in objectif_tokeniser]
mixed = [" ".join(x) for x in mixed_tokeniser]
print(negatif[0])

Que reste-t-il des anaphores de `` Moi , président de la république '' ... ? http : //fb.me/7But7qGNb


#### Vectorisation par CountVector

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(encoding = "utf-8",lowercase = True,stop_words=stop_words_fr)
neg = vectorizer.fit_transform(negatif)
pos = vectorizer.fit_transform(positif)
obj = vectorizer.fit_transform(objectif)
mix = vectorizer.fit_transform(mixed)
neg_pos = vectorizer.fit_transform(negatif+positif)
T = positif+negatif+objectif
pos_neg_obj = vectorizer.fit_transform(T)

import numpy as np
positif_array = pos.toarray()
negatif_array = neg.toarray()
objectif_array = obj.toarray()
commentaires = pos_neg_obj.toarray()
print(positif_array.shape)
print(negatif_array.shape)
print(objectif_array.shape)
print(commentaires.shape)

X = commentaires
print(X.shape)
Y = ["positif" for x in range(pos.shape[0])]+["negatif" for x in range(neg.shape[0])]+["objectif" for x in range(obj.shape[0])]
Y = np.asarray(Y)
print(Y.shape)

(494, 2225)
(1268, 5976)
(1643, 7156)
(3405, 11781)
(3405, 11781)
(3405,)


#### Visualisation des features après CountVectorizer

In [11]:
import pandas as pd
print("Un extrait des features")
pd.DataFrame({"Quelques Features":vectorizer.get_feature_names()[4000:4020],"Scores for the 200ist tweet":X[200][4000:4020]},index = ["feature {}".format(i) for i in range(1,21)])

Un extrait des features


Unnamed: 0,Quelques Features,Scores for the 200ist tweet
feature 1,dorigine,0
feature 2,dormir,0
feature 3,doru,0
feature 4,dorée,0
feature 5,dose,0
feature 6,dossier,0
feature 7,douanier,0
feature 8,double,0
feature 9,doublé,0
feature 10,doubs,0


### Pour la Cross-Validation

##### Cross-Validation J48

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split as train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

X = commentaires
print(X.shape)
Y = ["positif" for x in range(pos.shape[0])]+["negatif" for x in range(neg.shape[0])]+["objectif" for x in range(obj.shape[0])]
Y = np.asarray(Y)
print(Y.shape)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state = 4)

clf = DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

clf_y_pred = clf.predict(X_test)
treeScore = accuracy_score(Y_test,clf_y_pred)

print("Decision Tree : ",treeScore)

(3405, 11781)
(3405,)
Decision Tree :  0.6713615023474179


In [28]:
scores = cross_val_score(clf,X,Y,cv = 10, scoring = 'accuracy')
print(scores)
print(scores.mean())
Decision_Tree_CountVector = scores.mean()

[0.70175439 0.71345029 0.66666667 0.73607038 0.66764706 0.64705882
 0.67647059 0.70294118 0.72566372 0.7020649 ]
0.6939787986889057


##### Visualisation en PDF

In [453]:
dot_data = tree.export_graphviz(clf,out_file=None,
                                feature_names=vectorizer.get_feature_names(),
                                class_names=['positif','negatif','objectif'],
                                filled=True,rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("Countvectorizer Arbre.pdf")

True

##### Cross-Validation KNN

In [137]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier()

knn.fit(X_train,Y_train)

print(knn.score(X_test,Y_test))

0.5481220657276995


In [161]:
score = cross_val_score(knn,X,Y,cv = 5, scoring = 'accuracy')
print(score)
print(score.mean())

[0.61143695 0.62316716 0.60557185 0.59558824 0.59351988]
0.6058568141105946


In [164]:
KNN_CountVector = score.mean()

##### Cross-Validation par Naive Bayes

In [425]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

mnb = MultinomialNB(alpha=0.01).fit(X_train,Y_train)
mnb_y_pred = mnb.predict(X_test)
mnbScore = accuracy_score(Y_test,mnb_y_pred)
score = cross_val_score(mnb,X,Y,cv = 10,scoring = "accuracy")

NB_CountVector = score.mean()
print("Naïve Bayes : ",NB_CountVector)

Naïve Bayes :  0.6848481069094847


##### Cross-Validation Random Forest

In [458]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()
RF = RF.fit(X_train,Y_train)

RFScore = RF.score(X_test,Y_test)

print(RFScore)
score = cross_val_score(RF,X,Y,cv = 10,scoring = "accuracy")

RF_CountVector = score.mean()
print("Random Forest : ",RF_CountVector)



0.6889671361502347
Random Forest :  0.7165786163426002


### Conclusion avec la cross-validation

In [459]:
import pandas as pd

print("CountVectorizer : ")
pd.DataFrame({"KNN":KNN_CountVector,"Naïve Bayes":NB_CountVector,"Decision Tree":Decision_Tree_CountVector,"Random_Forest":RF_CountVector},index=["CountVectorizer"])

CountVectorizer : 


Unnamed: 0,KNN,Naïve Bayes,Decision Tree,Random_Forest
CountVectorizer,0.605857,0.684848,0.691948,0.716579


## Etape 3. Cross Valid sur TEST 

-- Voir s'il existe du sur-apprentissage évident. 0%? 100%?

#### Ouvrir le test de tweet en json

In [12]:
import json
with open("task1-testGold.csv.json",'r') as load_f:
     load_dict = json.load(load_f)

objectif_test = []
pos_test = []
neg_test = []
for x in load_dict["objective"]:
    objectif_test.append(x['content'])
for x in load_dict["positive"]:
    pos_test.append(x['content'])
for x in load_dict["negative"]:
    neg_test.append(x['content'])

In [16]:
from sklearn import tree
from sklearn.model_selection import train_test_split as train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


vectorizer = CountVectorizer(encoding = "utf-8",lowercase = True,stop_words=stop_words_fr)
X = vectorizer.fit_transform(pos_test+neg_test+objectif_test)
Y = ["positif" for x in range(len(pos_test))]+["negatif" for x in range(len(neg_test))]+["objectif" for x in range(len(objectif_test))]
X = X.toarray()
Y = np.asarray(Y)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state = 4)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

clf_y_pred = clf.predict(X_test)
treeScore = accuracy_score(Y_test,clf_y_pred)

score = cross_val_score(clf,X,Y,cv = 10, scoring = 'accuracy')
print(score)
print(score.mean())
CountVector_Test_DecisionTree = score.mean()

knn = KNeighborsClassifier()
knn.fit(X_train,Y_train)
print(knn.score(X_test,Y_test))

score = cross_val_score(knn,X,Y,cv = 10, scoring = 'accuracy')
print(score)
print(score.mean())
CountVector_Test_KNN = score.mean()

mnb = MultinomialNB(alpha=0.01).fit(X_train,Y_train)
mnb_y_pred = mnb.predict(X_test)
mnbScore = accuracy_score(Y_test,mnb_y_pred)

print(mnbScore)

score = cross_val_score(mnb,X,Y,cv = 10,scoring = "accuracy")

CountVector_Test_NB = score.mean()
print(score.mean())

RF = RandomForestClassifier()
RF = RF.fit(X_train,Y_train)

score = cross_val_score(RF,X,Y,cv = 10,scoring = "accuracy")

RF_CountVectorize_test = score.mean()

[0.56321839 0.62790698 0.62790698 0.47058824 0.62352941 0.64705882
 0.65882353 0.63529412 0.60714286 0.29761905]
0.5759088366701932
0.460093896713615
[0.37931034 0.58139535 0.48837209 0.38823529 0.52941176 0.57647059
 0.57647059 0.58823529 0.53571429 0.20238095]
0.4845996554195054
0.5774647887323944
0.48593335489769274




### Pas de grande surprise pour les données de tests

In [18]:
print("Cross-Validation pour les données de test pour éviter le sur-apprentissage: ")
pd.DataFrame({"KNN":[CountVector_Test_KNN],"Naïve_Bayes":[CountVector_Test_NB],"J48":[CountVector_Test_DecisionTree],"Random Forest":[RF_CountVectorize_test]},index = ["CountVector"])

Cross-Validation pour les données de test pour éviter le sur-apprentissage: 


Unnamed: 0,KNN,Naïve_Bayes,J48,Random Forest
CountVector,0.4846,0.485933,0.575909,0.608009


## Etape 4. Vectorisation avec TFIDF et Implimentation

Rather than just counting, we can use the TF-IDF score of a word to rank it's importance

TFIDF score of a word, w, is

tf(w) * idf(w)

Where 

tf(w) = (Number of times the word appears in a document) / (Total number of words in the document)

idf(w) = log(Number of documents / Number of documents that contain word w)

### Utiliser Sklearn.features_extraction Tfidf

In [427]:
print(positif[0])
print(negatif[0])
print(objectif[0])

Ouf je craignais être le plus vieux candidat à une élection sous la 5éme République ... mais @ alainjuppe remporte la palme ! Bravo ! # NB2017
Que reste-t-il des anaphores de `` Moi , président de la république '' ... ? http : //fb.me/7But7qGNb
# Russie - # Poutine : `` Le projet de grande # Eurasie est bien évidemment ouvert aux pays de l ’ # Europe '' http : //fb.me/62oFModNn


In [460]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(encoding = "utf-8",lowercase = True,stop_words=stop_words_fr)

X = tfidf_vect.fit_transform(positif+negatif+objectif)

#X = tfidf_vect.transform(positif+negatif+objectif)
X = X.toarray()
print(X.shape)

Y = ["positif" for x in range(len(positif))]+["négatif" for x in range(len(negatif))]+["objectif" for x in range(len(objectif))]
Y = np.asarray(Y)
print(Y.shape)

(3405, 11781)
(3405,)


### Visualiser quelques features après TFIDF

In [443]:
pd.DataFrame({"Quelques Features":vectorizer.get_feature_names()[4000:4020],"Scores for the 200ist tweet":X[200][4000:4020]},index = ["feature {}".format(i) for i in range(1,21)])

Unnamed: 0,Quelques Features,Scores for the 200ist tweet
feature 1,dorigine,0.0
feature 2,dormir,0.0
feature 3,doru,0.0
feature 4,dorée,0.0
feature 5,dose,0.0
feature 6,dossier,0.0
feature 7,douanier,0.0
feature 8,double,0.0
feature 9,doublé,0.0
feature 10,doubs,0.0


### Entraîner par des modèles

#####  Decision Tree

In [457]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state = 4)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

clf_y_pred = clf.predict(X_test)
treeScore = accuracy_score(Y_test,clf_y_pred)

print("IFIDF Decision Tree : ",treeScore)

# Cross-Validation
score = cross_val_score(clf,X,Y,cv = 10, scoring = 'accuracy')
print(score)
print(score.mean())
TFIDF_DecisionTree = score.mean()

IFIDF Decision Tree :  0.6772300469483568
[0.71052632 0.72222222 0.65789474 0.73020528 0.65       0.64705882
 0.67352941 0.72058824 0.72271386 0.69616519]
0.693090408008161


#### Visualisation du résultat de DécisionTree

In [256]:
from sklearn.tree import export_graphviz
import pydotplus

In [445]:
dot_data = tree.export_graphviz(clf,out_file=None,
                                feature_names=tfidf_vect.get_feature_names(),
                                class_names=['positif','negatif','objectif'],
                                filled=True,rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("TFIDF DecisionTree Arbre.pdf")

True

#### KNN

In [220]:
knn = KNeighborsClassifier()

knn.fit(X_train,Y_train)

print(knn.score(X_test,Y_test))

0.6678403755868545


In [225]:
score = cross_val_score(knn,X,Y,cv = 10, scoring = 'accuracy')
print(score)
print(score.mean())
TFIDF_KNN = score.mean()

[0.70760234 0.65497076 0.67251462 0.69501466 0.65294118 0.69411765
 0.66176471 0.7        0.7079646  0.68141593]
0.6828306442440061


##### Naïve Bayes

In [297]:
mnb = MultinomialNB(alpha=0.01).fit(X_train,Y_train)
mnb_y_pred = mnb.predict(X_test)
mnbScore = accuracy_score(Y_test,mnb_y_pred)

print(mnbScore)

score = cross_val_score(mnb,X,Y,cv = 10,scoring = "accuracy")

TFIDF_NB = score.mean()
print("Naïve Bayes : ",TFIDF_NB)

0.653755868544601
Naïve Bayes :  0.6860374748725467


##### Random Forest

In [463]:
RF = RandomForestClassifier()
RF = RF.fit(X_train,Y_train)

RFScore = RF.score(X_test,Y_test)

print(RFScore)
score = cross_val_score(RF,X,Y,cv = 10,scoring = "accuracy")

RF_TFIDF = score.mean()
print("Random Forest : ",RF_TFIDF)



0.710093896713615
Random Forest :  0.7200951154220886


###  Résultats Obtenus Avec la cross-validation

In [464]:
print("Cross-Validation pour l'apprentissage: ")
pd.DataFrame({"KNN":[KNN_CountVector,TFIDF_KNN],"Naïve Bayes":[NB_CountVector,TFIDF_NB],"J48":[Decision_Tree_CountVector,TFIDF_DecisionTree],"Random Forest":[RF_CountVector,RF_TFIDF]},index=["CountVectorizer","TFIDF"])

Cross-Validation pour l'apprentissage: 


Unnamed: 0,KNN,Naïve Bayes,Decision Tree,Random Forest
CountVectorizer,0.605857,0.684848,0.691948,0.716579
TFIDF,0.682831,0.686037,0.69309,0.720095


### Donc, nous utilisons Decision Tree et Random Forest pour un test avec TFIDF qui se montre un peu meilleur que les autres.

### le score final pour le CountVectoriser, TFIDF

### Nous choisissons J48 et Random Forest

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
print(len(positif)," ",len(pos_test))
print(len(negatif)," ",len(neg_test))
print(len(objectif)," ",len(objectif_test))

Total = positif+negatif+objectif+pos_test+neg_test+objectif_test
Target = ["positif" for x in range(len(positif))]+["negatif" for x in range(len(negatif))]+["objectif" for x in range(len(objectif))]+["positif" for x in range(len(pos_test))]+["negatif" for x in range(len(neg_test))]+["objectif" for x in range(len(objectif_test))]
print(len(Total))
print(len(Target))

vectorizer = CountVectorizer(encoding = "utf-8",lowercase = True,stop_words=stop_words_fr)

X = vectorizer.fit_transform(Total)
X_train = X[:(len(positif)+len(negatif)+len(objectif))]
X_test = X[(len(positif)+len(negatif)+len(objectif)):]

Y_train = Target[:3405]
Y_test = Target[3405:]

Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

# J48
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

clf_y_pred = clf.predict(X_test)
treeScore = accuracy_score(Y_test,clf_y_pred)

Count_vect_J48 = treeScore

print(Count_vect_J48)

RF = RandomForestClassifier()
RF = RF.fit(X_train,Y_train)

Count_vect_RF = RF.score(X_test,Y_test)

tfidf_vect = TfidfVectorizer(encoding = "utf-8",lowercase = True,stop_words=stop_words_fr)

X = tfidf_vect.fit_transform(Total)
X_train = X[:(len(positif)+len(negatif)+len(objectif))]
X_test = X[(len(positif)+len(negatif)+len(objectif)):]

Y_train = Target[:3405]
Y_test = Target[3405:]

Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

clf_y_pred = clf.predict(X_test)
treeScore = accuracy_score(Y_test,clf_y_pred)

TFIDF_J48= treeScore

print(TFIDF_J48)

RF = RandomForestClassifier()
RF = RF.fit(X_train,Y_train)

TFIDF_RF = RF.score(X_test,Y_test)

494   123
1268   318
1643   411
4257
4257
(3405, 13638) (3405,)
(852, 13638) (852,)
0.6549295774647887




(3405, 13638) (3405,)
(852, 13638) (852,)
0.6866197183098591




### Le résultat global pour CountVect et TFIDF

In [32]:
pd.DataFrame({"J48":[Count_vect_J48,Count_vect_RF],"Random Forest":[TFIDF_J48,TFIDF_RF]},index=["CountVectore","TFIDF"])

Unnamed: 0,J48,Random Forest
CountVectore,0.65493,0.68662
TFIDF,0.70892,0.705399


## Etape 5. Okapi BM 25 

Dans le deuxième jupyter