# HMIN232M - Automatic fact-checking

```
Sabri BENBRAHIM - 21604014
Bénédicte DAYNAC - 21605192
Yann DUFRESNE - 20055179
Llivia LANGEVIN - 21604582
```

### Imports globaux préalables

In [None]:
import nltk as nltk

# pour colab
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# pour colab
!pip install contractions

import contractions

import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import scipy
import unicodedata
from time import time

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.utils import resample


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from scipy.stats import randint

#remove warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

#python version 2 or 3
import sys
if sys.version_info[0] < 3:
    print("python2")
else:
    print("python3")
    
#python architecture 32 or 64 bits
import platform
print(platform.architecture()[0])

#run garbage collector
import gc
gc.collect()

#free memory size
import psutil
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

### Fonctions élémentaires récupérées

In [None]:
#remove html (clean)
from bs4 import BeautifulSoup
def strip_html(text):
    """Return the text content of ``text`` with its HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and
    only the textual content of the resulting tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# replace contractions (clean)
import contractions
def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# replace numbers to words (token)
import inflect
def numbers_to_words(tokens):
    """Return a new list where every all-digit token is spelled out.

    Tokens for which ``str.isdigit()`` is false are copied unchanged; the
    others are converted with ``inflect``'s ``number_to_words``.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(tok) if tok.isdigit() else tok
        for tok in tokens
    ]

print("ready")

## Importation du jeu de données

In [None]:
# Input file (tab-separated); the trailing comment lists the available datasets.
dataFile="data-truefalse.csv" # data-truefalse-1.csv data-truefalse-2.csv data-truefalse-3.csv data-truefalse.csv data-mixture.csv

# Column holding the claim text (features) and column holding the label (target).
XColumnName= "claimReview_claimReviewed"
yColumnName= "true_false_mixture"

print("Chargement CSV: ",dataFile)
dfOrigin=pd.read_csv(dataFile, sep='\t')

# ! required for the mixture dataset, keep commented out for truefalse !
# (recodes label -1 as 1)
#dfOrigin.loc[dfOrigin[yColumnName] == -1, yColumnName] = 1

# `display` is provided by the notebook environment.
display(dfOrigin.head())
print("dfOrigin taille:",dfOrigin.shape,'\n')

# Report free memory after loading.
gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

## Nettoyage et prétraitements

### sur les claims

#### suppression des caractères non ASCII, du HTML et des formes anglaises contractées

In [None]:
def preclean_data(df):
    """Clean every non-null entry of the Series ``df`` in place.

    Each non-null value is, in this order:
      1. NFKD-normalized and reduced to its ASCII characters,
      2. stripped of HTML markup (``strip_html``),
      3. expanded with ``replace_contractions``.

    Null entries are left untouched. The same (mutated) Series is returned.
    """
    for label in df.index.values:
        raw = df.at[label]
        if pd.isnull(raw):
            continue
        # Drop every character that has no ASCII representation after
        # compatibility decomposition (accents, typographic quotes, ...).
        ascii_text = (
            unicodedata.normalize('NFKD', raw)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
        )
        df.at[label] = replace_contractions(strip_html(ascii_text))

    return df

In [None]:
# Work on a copy of the claim column: preclean_data modifies its argument in place.
claimsClean=dfOrigin[XColumnName].copy()

print("Avant:")
print(claimsClean.head(),'\n')

claimsClean = preclean_data(claimsClean)

print("Apres Clean:")
print(claimsClean.head(),'\n')
print("claimsClean taille:",claimsClean.shape,'\n')

# Report free memory after the cleaning step.
gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

### sur les tokens

#### passage en minuscules, conversion des chiffres en lettres, suppression des caractères spéciaux

In [None]:
def token_data(df):
    """Tokenize every entry of the Series ``df`` into a list of words.

    For each non-null text, every sentence is word-tokenized, lower-cased,
    its all-digit tokens are spelled out with ``numbers_to_words`` and only
    purely alphabetic tokens are kept. Null entries become an empty list.

    Spelled-out numbers may contain hyphens, commas or spaces
    ("twenty-one", "two thousand and twenty"). They are split into their
    individual words before the alphabetic filter; otherwise the filter
    would discard the whole number right after it was converted.

    Args:
        df: Series of texts (or nulls).

    Returns:
        A new object-dtype Series holding one list of tokens per entry.
    """
    df = df.astype('object')
    for label in df.index.values:
        text = df.at[label]
        claim_tokens = []
        if not pd.isnull(text):
            for sentence in sent_tokenize(text):
                # lower case
                lowered = [t.lower() for t in word_tokenize(sentence)]
                # digits -> words (one output token per input token)
                spelled = numbers_to_words(lowered)
                for original, converted in zip(lowered, spelled):
                    if original.isdigit():
                        # split multi-word numbers into their single words
                        pieces = converted.replace('-', ' ').replace(',', ' ').split()
                    else:
                        pieces = [converted]
                    # drop anything that is not purely alphabetic
                    claim_tokens.extend(p for p in pieces if p.isalpha())
        df.at[label] = claim_tokens

    return df

In [None]:
# Tokenize a copy of the cleaned claims; claimsClean itself is kept as is.
claimsTokens=claimsClean.copy()

print("Avant:")
print(claimsTokens.head(),'\n')

claimsTokens = token_data(claimsTokens)

print("Apres Token:")
print(claimsTokens.head(),'\n')
print("claimsTokens taille:",claimsTokens.shape,'\n')

# Report free memory after tokenization.
gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

## Merge des tokens

In [None]:
def merge_tokens(df):
    """Replace each token list of the Series ``df`` by a single string.

    Tokens are joined with one space; an empty list becomes an empty
    string. The Series is modified in place and returned.

    Args:
        df: Series whose entries are lists of string tokens.

    Returns:
        The same Series, now holding one space-separated string per entry.
    """
    for label in df.index.values:
        # str.join avoids rebuilding the string once per token.
        df.at[label] = " ".join(df.at[label]).strip()

    return df

In [None]:
# Merge a copy of the token lists back into strings (merge_tokens works in place).
claimsMergeInit=claimsTokens.copy()

print("Avant:")
print(claimsMergeInit.head(),'\n')

claimsMergeInit = merge_tokens(claimsMergeInit)

print("Apres Merge:")
print(claimsMergeInit.head(),'\n')
print("claimsMerge taille:",claimsMergeInit.shape,'\n')

# Report free memory after the merge.
gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

## Resultat sur un premier classifieur

In [None]:
def do_classifier(dfComplet, column, dfEval, trace):
    """Fit a calibrated LinearSVC on TF-IDF features and report its scores.

    The documents are vectorized with ``TfidfVectorizer(min_df=2)``, split
    into a training set and a held-out test set, and a
    ``CalibratedClassifierCV(LinearSVC())`` is fitted on the training set.
    Accuracy, macro F1, the confusion matrix, the classification report and
    a ROC curve are printed/plotted for the test set.

    Args:
        dfComplet: DataFrame providing the label column.
        column: Name of the label column in ``dfComplet``.
        dfEval: Text documents to vectorize, one per row of ``dfComplet``.
        trace: When true, also print a vocabulary sample, the array shapes
            and the free memory after each step.

    Returns:
        The macro-averaged F1-score measured on the test set.
    """
    t0 = time()

    # Vectorization
    vectorizerT = TfidfVectorizer(min_df=2)
    vectorT = vectorizerT.fit_transform(dfEval)

    gc.collect()
    if trace:
        print("Vocabulary:")
        limit = 50  # only show the first entries of the vocabulary
        for i, key in enumerate(vectorizerT.vocabulary_):
            if i >= limit:
                break
            print(key, end=', ')
        print("\nTfidVector taille: ",vectorT.shape)
        print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

    # Learning data (dense matrix)
    X = vectorT.toarray()
    y = dfComplet[column].copy()
    gc.collect()
    if trace:
        print ("X taille: ",X.shape)
        print ("y taille: ",y.shape)
        print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

    # Hold out `validation_size` of the rows for evaluation and train on the
    # rest. The previous code passed validation_size as train_size, i.e. it
    # trained on 25% of the data and evaluated on 75%.
    validation_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=1 - validation_size,
                                                        random_state=20,
                                                        test_size=validation_size)
    gc.collect()
    if trace:
        print ("X_train taille: ",X_train.shape)
        print ("X_test taille: ",X_test.shape)
        print ("y_train taille: ",y_train.shape)
        print ("y_test taille: ",y_test.shape)
        print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

    # Default classifier: LinearSVC wrapped in CalibratedClassifierCV so that
    # predict_proba is available for the ROC curve.
    clfN = "LinearSVC"
    print(clfN)
    clf = CalibratedClassifierCV(LinearSVC())

    # Fit and evaluate
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, fscore, support = score(y_test, y_pred, average='macro')

    print("Accuracy: %.3f%%" % (accuracy * 100.0))
    print("F1-score moyen: %.3f%%\n" % (fscore * 100.0))
    print ('Matrice de confusion:\n', confusion_matrix(y_test, y_pred),'\n')
    print (classification_report(y_test, y_pred))

    # ROC curve: constant "no model" baseline against the classifier.
    # NOTE(review): only column 1 of predict_proba is used, which presumably
    # assumes a binary target - confirm for other label sets.
    ns_probs = [0] * len(y_test)
    lr_probs = clf.predict_proba(X_test)[:, 1]
    ns_auc = roc_auc_score(y_test, ns_probs)
    lr_auc = roc_auc_score(y_test, lr_probs)
    ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

    print('Sans modèle : ROC AUC =%.3f' % (ns_auc))
    print('Avec',clfN,' : ROC AUC =%.3f' % (lr_auc))
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Pas de Modele')
    plt.plot(lr_fpr, lr_tpr, marker='.', label=clfN)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

    print("Réalisé en %.1fs" % (time() - t0))

    # Release the large arrays before measuring the free memory.
    del vectorT, X, y, X_train, X_test, y_train, y_test
    print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

    return fscore

In [None]:
# First reference score: claims that were only cleaned, tokenized and merged
# (trace enabled for this first run).
scoreInit = do_classifier(dfOrigin, yColumnName, claimsMergeInit, True)

## Etude sur la suppression des Stop words

In [None]:
def stop_data(df, trace):
    """Remove English stop words from every token list of the Series ``df``.

    The Series is modified in place and returned. Entries equal to an empty
    list are left untouched. When ``trace`` is true a progress message is
    printed first.
    """
    if trace:
        print("remove stopwords")

    english_stops = set(stopwords.words('english'))
    for label in df.index.values:
        tokens = df.at[label]
        if tokens == []:
            continue
        df.at[label] = [tok for tok in tokens if tok not in english_stops]

    return df

In [None]:
# Remove stop words on a copy of the tokens (stop_data works in place).
claimsStop=claimsTokens.copy()

print("Avant:")
print(claimsStop.head(),'\n')

claimsStop = stop_data(claimsStop, True)

print("Apres Stop:")
print(claimsStop.head(),'\n')
print("claimsStop taille:",claimsStop.shape,'\n')

# Report free memory after stop-word removal.
gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

In [None]:
# Score the claims without stop words and keep the option only when the
# macro F1 is at least as good as the reference score.
claimsMergeStop = merge_tokens(claimsStop.copy())
scoreStop = do_classifier(dfOrigin, yColumnName,claimsMergeStop, False)

StopWords = bool(scoreStop >= scoreInit) # False True

## Etude sur la Lemmatisation

In [None]:
def lemm_data(df, trace):
    """Lemmatize, as verbs, every token list of the Series ``df``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with
    ``pos='v'``. The Series is modified in place and returned; entries equal
    to an empty list are left untouched. When ``trace`` is true a progress
    message is printed first.
    """
    if trace:
        print("lemmatize tokens")

    wordnet = WordNetLemmatizer()
    for label in df.index.values:
        tokens = df.at[label]
        if tokens == []:
            continue
        df.at[label] = [wordnet.lemmatize(tok, pos='v') for tok in tokens]

    return df

In [None]:
# Lemmatize a copy of the tokens (lemm_data works in place). Note that the
# copy is taken from claimsTokens, i.e. stop words are still present here.
claimsLem=claimsTokens.copy()

print("Avant:")
print(claimsLem.head(),'\n')

claimsLem = lemm_data(claimsLem, True)

print("Apres Lemm:")
print(claimsLem.head(),'\n')
print("claimsLem taille:",claimsLem.shape,'\n')

# Report free memory after lemmatization.
gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

In [None]:
# Score the lemmatized claims and keep the option only when the macro F1 is
# at least as good as the best score so far: the stop-word score when stop
# words were retained, the reference score otherwise.
claimsMergeLem = merge_tokens(claimsLem.copy())
scoreLem = do_classifier(dfOrigin, yColumnName, claimsMergeLem, False)

Lemmatiz = bool(scoreLem >= (scoreStop if StopWords else scoreInit)) # False True

In [None]:
# Full configurable cleaning, without the final merge
def fullclean_data(column, stop, lemm, trace):
    """Run the cleaning pipeline on ``column`` and return its token lists.

    ``preclean_data`` and ``token_data`` are always applied; ``stop_data``
    and ``lemm_data`` are applied when ``stop`` / ``lemm`` are true, with
    ``trace`` forwarded to them. ``column`` itself is modified by the
    pre-cleaning step, so callers pass a copy.
    """
    tokens = token_data(preclean_data(column))
    if stop:
        tokens = stop_data(tokens, trace)
    if lemm:
        tokens = lemm_data(tokens, trace)

    return tokens

## Etude sur l'ajout d'extras

### Creation des jeux de données avec ajouts de colonnes

In [None]:
def addtokens_2in1(df1,df2):
    """Append the tokens of ``df2`` to those of ``df1``, row by row.

    For every index label of ``df1`` the entry becomes a new list made of
    the tokens of ``df1`` followed by the tokens found under the same label
    in ``df2``. ``df1`` is modified in place and returned.
    """
    for label in df1.index.values:
        df1.at[label] = list(df1.at[label]) + list(df2.at[label])

    return df1

In [None]:
def addExtras(df, column, extraType, stop, lemm):
    """Build the token lists of ``column``, optionally enriched with extras.

    Args:
        df: Source DataFrame.
        column: Name of the main text column.
        extraType: "addnone" (main column only), "addauthor" (also append
            the author columns) or "addall" (author columns plus the other
            extra columns).
        stop: Forwarded to ``fullclean_data`` (stop-word removal).
        lemm: Forwarded to ``fullclean_data`` (lemmatization).

    Returns:
        A Series of token lists, one per row of ``df``.
    """
    authorColumns = ['creativeWork_author_name',
                     'extra_author_categories']
    otherColumns = ['extra_claimReview_claimReviewed_entity',
                    'extra_claimReview_claimReviewed_categories',
                    'extra_keywords_entity',
                    'extra_keywords_categories',
                    'extra_tags']  # 'extra_title' is deliberately left out
    withAuthor = extraType == "addauthor" or extraType == "addall"
    withOthers = extraType == "addall"

    # Clean the extra columns first (on copies, without trace output).
    authorExtras = []
    otherExtras = []
    if withAuthor:
        print("Wait...")
        print("Clean author extras")
        authorExtras = [fullclean_data(df[name].copy(), stop, lemm, False)
                        for name in authorColumns]
    if withOthers:
        print("Clean other extras")
        otherExtras = [fullclean_data(df[name].copy(), stop, lemm, False)
                       for name in otherColumns]
    if withAuthor:
        print("Extras cleaned")

    # Clean the main column, then append the extras in a fixed order.
    dfResult = fullclean_data(df[column].copy(), stop, lemm, True)

    if withAuthor:
        print("Add author extras")
        for extra in authorExtras:
            dfResult = addtokens_2in1(dfResult, extra)
    if withOthers:
        print("Add other extras")
        for extra in otherExtras:
            dfResult = addtokens_2in1(dfResult, extra)
    if withAuthor:
        print("Extras added")

    print(extraType,"done")

    return dfResult

In [None]:
# Default choice; it is revised further down once the three variants are scored.
includeExtra="addnone" # addnone addauthor addall

# Variant 1: claims only.
dfAjout1 = addExtras(dfOrigin, XColumnName, "addnone", StopWords, Lemmatiz)
print(dfAjout1.head())
print("dfAjout1 taille:",dfAjout1.shape,'\n')

print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

# Variant 2: claims + author columns.
dfAjout2 = addExtras(dfOrigin, XColumnName, "addauthor", StopWords, Lemmatiz)
print(dfAjout2.head())
print("dfAjout2 taille:",dfAjout2.shape,'\n')

print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

# Variant 3: claims + author columns + all other extra columns.
dfAjout3 = addExtras(dfOrigin, XColumnName, "addall", StopWords, Lemmatiz)
print(dfAjout3.head())
print("dfAjout3 taille:",dfAjout3.shape,'\n')

print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

### Evaluation des changements suite aux ajouts

In [None]:
# Score each variant with the reference classifier; the token lists are
# merged on a copy because merge_tokens works in place.
claimsMergeAjout1 = merge_tokens(dfAjout1.copy())
print("addnone")
scoreAjout1 = do_classifier(dfOrigin, yColumnName, claimsMergeAjout1, False)

claimsMergeAjout2 = merge_tokens(dfAjout2.copy())
print("addauthor")
scoreAjout2 = do_classifier(dfOrigin, yColumnName, claimsMergeAjout2, False)

claimsMergeAjout3 = merge_tokens(dfAjout3.copy())
print("addall")
scoreAjout3 = do_classifier(dfOrigin, yColumnName, claimsMergeAjout3, False)

In [None]:
def NextDf(dfO):
    """Return a copy of ``dfO`` without the author / extra columns.

    The columns are removed with the ``columns=`` keyword: passing the axis
    positionally (``drop(label, 1)``) is rejected by pandas 2.0 and later.

    Args:
        dfO: Source DataFrame; it is left unchanged.

    Returns:
        A new DataFrame without the eight extra columns.

    Raises:
        KeyError: If one of the extra columns is missing from ``dfO``.
    """
    extraColumns = [
        'creativeWork_author_name',
        'extra_author_categories',
        'extra_claimReview_claimReviewed_entity',
        'extra_claimReview_claimReviewed_categories',
        'extra_keywords_entity',
        'extra_keywords_categories',
        'extra_tags',
        'extra_title',
    ]

    return dfO.drop(columns=extraColumns)

In [None]:
# Record the extras option according to the score evolution: "addauthor"
# must beat "addnone", then "addall" must beat whichever option is retained.
if scoreAjout2 > scoreAjout1:
    includeExtra = "addauthor"

if scoreAjout3 > (scoreAjout2 if includeExtra == "addauthor" else scoreAjout1):
    includeExtra = "addall"

# New working dataframe: the extra columns are dropped and the claim column
# is replaced by the merged text of the retained variant.
dfNext = NextDf(dfOrigin)

if includeExtra == "addnone":
    finalColumn = claimsMergeAjout1.to_frame(name=XColumnName)
elif includeExtra == "addauthor":
    finalColumn = claimsMergeAjout2.to_frame(name=XColumnName)
elif includeExtra == "addall":
    finalColumn = claimsMergeAjout3.to_frame(name=XColumnName)

if includeExtra in ("addnone", "addauthor", "addall"):
    dfNext.update(finalColumn)

# The intermediate objects are no longer needed.
del dfOrigin, dfAjout1, dfAjout2, dfAjout3, claimsMergeAjout1, claimsMergeAjout2, claimsMergeAjout3

print("Apres Add:")
display(dfNext.head())
print("dfNext taille:",dfNext.shape,'\n')

gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

## Etude sur le rééquilibrage du rating

In [None]:
# Class distribution of the target before any resampling.
print("Avant resampling:")
print(dfNext[yColumnName].value_counts())

### Creation des jeux de données resamplés

In [None]:
def resampleDf(df, column, method):
    """Return a copy of ``df`` whose two classes are balanced by ``method``.

    Args:
        df: Source DataFrame; it is left unchanged.
        column: Name of the (binary) label column.
        method: One of
            "noresampling" - plain copy, original index kept;
            "downsampling" - majority class reduced with ``DataFrame.sample``;
            "upsampling"   - minority class enlarged with ``DataFrame.sample``;
            "downresampl"  - majority class reduced with sklearn ``resample``;
            "upresampl"    - minority class enlarged with sklearn ``resample``.

    Returns:
        The resampled DataFrame. For every method except "noresampling" the
        index is renumbered from 0.

    Raises:
        ValueError: If ``method`` is unknown, or if ``column`` does not hold
            exactly two distinct classes.
    """
    methods = ("noresampling", "downsampling", "upsampling", "downresampl", "upresampl")
    if method not in methods:
        raise ValueError("Unknown resampling method: %r" % (method,))

    if method == "noresampling":
        dfResult = df.copy()

    else:
        counts = df[column].value_counts()  # sorted by decreasing frequency
        if len(counts) != 2:
            raise ValueError("Resampling expects exactly 2 classes in %r, found %d"
                             % (column, len(counts)))
        (bigLabel, count_big), (smallLabel, count_small) = counts.items()

        dfBig = df[df[column] == bigLabel]
        dfSmall = df[df[column] == smallLabel]

        if method == "downsampling": # from dataframe
            dfResult = pd.concat([dfBig.sample(count_small, random_state=20), dfSmall])

        elif method == "upsampling": # from dataframe
            dfResult = pd.concat([dfBig, dfSmall.sample(count_big, replace=True, random_state=20)])

        elif method == "downresampl": # from scikit-learn
            # resample() draws WITH replacement by default; down-sampling
            # must not duplicate rows, hence replace=False.
            dfBigDown = resample(dfBig, replace=False, n_samples=count_small, random_state=20)
            dfResult = pd.concat([dfBigDown, dfSmall])

        else: # "upresampl", from scikit-learn
            dfSmallUp = resample(dfSmall, replace=True, n_samples=count_big, random_state=20)
            dfResult = pd.concat([dfBig, dfSmallUp])

        # Renumber the rows in memory. The previous CSV round-trip through
        # 'result.csv' overwrote that file and turned empty strings into NaN.
        dfResult = dfResult.reset_index(drop=True)

    print("Apres",method,":")
    print(dfResult[column].value_counts())

    return dfResult

In [None]:
# Default choice; it is revised further down once every method is scored.
methodResampl = "noresampling" # noresampling downsampling upsampling downresampl upresampl
methodes=["noresampling", "downsampling", "upsampling", "downresampl", "upresampl"]
dfResample = []

# Build one resampled dataframe per method, in the order of `methodes`.
for i, methode in enumerate(methodes):
    dfResample.append(resampleDf(dfNext, yColumnName, methode))
    print("dfResample",i,"taille:",dfResample[i].shape,"\n")
    print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")


### Evaluation des changements suite aux resamples

In [None]:
# Macro F1 of the reference classifier for each resampled dataframe,
# in the same order as `methodes`.
# NOTE(review): each frame is resampled BEFORE do_classifier performs its
# train/test split, so with the up-sampling methods duplicated rows can end
# up on both sides of the split and inflate the score - confirm whether the
# resampling should be applied to the training part only.
scores = []

for i in range(len(methodes)):
    print(methodes[i])
    scores.append(do_classifier(dfResample[i], yColumnName, dfResample[i][XColumnName].copy(), False))

In [None]:
# Retain the resampling method with the highest score; because of the `>=`
# comparison, the last method wins in case of a tie.
bestmean = 0
index = 0
for i, candidate in enumerate(scores):
    if candidate < bestmean:
        continue
    methodResampl = methodes[i]
    bestmean = candidate
    index = i

dfFinal = dfResample[index].copy()

# The intermediate dataframes are no longer needed.
del dfNext, dfResample

print("Apres Resample:")
display(dfFinal.head())
print("dfFinal taille:",dfFinal.shape,'\n')

gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)))

## Recherche des meilleurs paramètres des classifieurs

In [None]:
def log_results(fileName, stop, lemm, addExtra, reSample, clf, param, score, t):
    """Append one experiment record to ``log.txt`` in the working directory.

    The whole record is formatted before the file is opened, so a formatting
    error cannot leave a truncated entry, and the file is closed by a
    context manager even when writing fails. The log is written as UTF-8
    because the labels contain accented characters.

    Args:
        fileName: Name of the dataset file.
        stop: Whether stop-word removal was enabled (logged as True/False).
        lemm: Whether lemmatization was enabled (logged as True/False).
        addExtra: Extras option that was used.
        reSample: Resampling method that was used.
        clf: Classifier name.
        param: Classifier parameters.
        score: Score as a fraction; it is logged as a percentage.
        t: Elapsed time in seconds.
    """
    entry = "".join([
        "\nJeu de données: %s\n" % (fileName,),
        "Stop words: %s\n" % ("True" if stop else "False",),
        "Lemmatisation: %s\n" % ("True" if lemm else "False",),
        "Ajout d'extras: %s\n" % (addExtra,),
        "Méthode resample: %s\n" % (reSample,),
        "Classifieur: %s\n" % (clf,),
        "Paramètres: %s\n" % (param,),
        "Score: %.3f%%\n" % (score * 100.0),
        "Réalisé en %.1fs\n" % (t,),
    ])
    with open("log.txt", "a", encoding="utf-8") as fichier:
        fichier.write(entry)

In [None]:
def do_gridsearch(name, estimClf, gridParam, X, y):
    """Grid-search ``estimClf`` over ``gridParam`` and log the best result.

    The search uses 5-fold cross-validation scored on accuracy. The best
    parameters and score are appended to the log through ``log_results``
    (together with the notebook-level settings ``dataFile``, ``StopWords``,
    ``Lemmatiz``, ``includeExtra`` and ``methodResampl``) and printed.

    The former ``iid=True`` argument is no longer passed: it was removed
    from ``GridSearchCV`` in scikit-learn 0.24 and raises a TypeError there.

    Args:
        name: Display name of the classifier.
        estimClf: Estimator instance to tune.
        gridParam: Parameter grid handed to ``GridSearchCV``.
        X: Training features.
        y: Training labels.

    Returns:
        The best parameter combination found (``best_params_``).
    """
    print(name,": wait...")
    t0 = time()
    scoring = 'accuracy'

    gd_sr = GridSearchCV(estimator=estimClf,
                        param_grid=gridParam,
                        scoring=scoring,
                        cv=5,
                        #n_jobs=-1,
                        return_train_score=True)

    gd_sr.fit(X, y)

    # Measure once so that the log and the console report the same duration.
    elapsed = time() - t0

    log_results(dataFile, StopWords, Lemmatiz, includeExtra, methodResampl,
                  name, gd_sr.best_params_, gd_sr.best_score_, elapsed)

    print ("Meilleurs paramètres: %s" % (gd_sr.best_params_))
    print ("Meilleur score: %.3f%%" % (gd_sr.best_score_ * 100.0))
    print ("Réalisé en %.1fs\n" % (elapsed))
    print ("Meilleur estimateur",gd_sr.best_estimator_,'\n')

    gc.collect()
    print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

    return gd_sr.best_params_

In [None]:
# Classifiers to tune. Each job gives a display name ('clf'), an abbreviation
# used as plot label ('clfAbrev'), the estimator instance ('estimClf') and the
# parameter grid explored by do_gridsearch ('grid_param'); an empty grid
# evaluates the estimator with its default parameters only.
jobsTodo= [
    {
        'clf' : 'GaussianNB',
        'clfAbrev' : 'GNB',
        'estimClf' : GaussianNB(),
        'grid_param' : {}
    },
    {
        'clf' : 'MultinomialNB',
        'clfAbrev' : 'MNB',
        'estimClf' : MultinomialNB(),
        'grid_param' : {}
    },
    {
        'clf' : 'LinearSVC',
        'clfAbrev' : 'LSVC',
        'estimClf' : LinearSVC(),
        'grid_param' : {
            'C': [0.01, 0.1, 0.4, 0.5, 0.6, 1, 9, 10, 11, 100],
            'max_iter': [1000]
        }
    },
    {
        'clf' : 'LogisticRegression',
        'clfAbrev' : 'LR',
        'estimClf' : LogisticRegression(),
        'grid_param' : {
            'C' : [2,3,4,5,6,7,8],
            'max_iter': [500, 1000]
        },
    },
    {
        'clf' : 'DecisionTreeClassifier',
        'clfAbrev' : 'DTC',
        'estimClf' : DecisionTreeClassifier(),
        'grid_param' : {
            'max_depth': [7,8,9,10,11,12],
            'criterion': ['gini', 'entropy'],
            'min_samples_leaf': [2,3,4,5,6,7,8]
        }
    },
    {
        'clf' : 'RandomForestClassifier',
        'clfAbrev' : 'RFC',
        'estimClf' : RandomForestClassifier(),
        'grid_param' : {
            'criterion': ['entropy', 'gini'],
            'max_depth': [6, 9, 12],
            # 'auto' was dropped: for a classifier it is the same setting as
            # 'sqrt' (so a third of the grid was evaluated twice) and recent
            # scikit-learn releases reject it.
            'max_features': ['log2', 'sqrt'],
            'min_samples_leaf': [1, 5, 8],
            'min_samples_split': [2, 3, 5],
            'n_estimators': [6, 9, 12]
        }
    },
    {
        'clf' : 'SGDClassifier',
        'clfAbrev' : 'SGDC',
        'estimClf' : SGDClassifier(),
        'grid_param' : {
             # NOTE(review): recent scikit-learn releases name this loss
             # 'log_loss'; 'log' is kept for the version this notebook
             # targets - adapt when upgrading.
             'loss': ['log','hinge'],
             'penalty': ['l1','l2']
        }
    }
]

In [None]:
print("Preparation aux classifieurs...")

# TF-IDF features (dense) and labels of the retained dataframe.
vectorizerT = TfidfVectorizer(min_df=2)
vectorT = vectorizerT.fit_transform(dfFinal[XColumnName].copy())

X=vectorT.toarray()
y=dfFinal[yColumnName].copy()

# NOTE(review): validation_size is passed as train_size, so the grid searches
# below only see 25% of the rows; X_test / y_test are never used in this cell.
# Possibly a deliberate sub-sample to shorten the search - confirm.
validation_size=0.25
testsize= 1-validation_size
X_train,X_test,y_train,y_test=train_test_split(X, 
                                               y, 
                                               train_size=validation_size, 
                                               random_state=20,
                                               test_size=testsize)

gc.collect()
print("freememory=%2.3f Go" %(psutil.virtual_memory().free/(1024 * 1024 * 1024)),"\n")

# Tune every job and remember (name, abbreviation, estimator, best parameters).
models = []
for j in jobsTodo:
    params = do_gridsearch(j['clf'], j['estimClf'], j['grid_param'], X_train, y_train)
    models.append((j['clf'], j['clfAbrev'], j['estimClf'], params))
    
# X and y are kept: the next cell cross-validates on the full data.
del vectorT, X_train, X_test, y_train, y_test

## Recherche du meilleur classifieur paramétré

In [None]:
# Cross-validate every tuned classifier on the full X / y.
# results: per-fold accuracies (for the box plot); abrevs: plot labels;
# scores: (name, estimator, parameters, mean accuracy) tuples. Note that
# `scores` rebinds the name used earlier for the resampling scores.
results = []
abrevs = []
scores = []
scoring = 'accuracy'
print("wait...\n")
for name,abrev,model,param in models:
    # Apply the best parameters found by the grid search.
    model.set_params(**param)
    kfold = KFold(n_splits=5, shuffle=True, random_state=3)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    abrevs.append(abrev)
    scores.append((name,model,param,cv_results.mean()))
    msg = "%s: %.3f%% (%.3f)" % (name, cv_results.mean()*100, cv_results.std())
    print(msg)
gc.collect()
print("\nfreememory=%2.3f Go" % (psutil.virtual_memory().free/(1024 * 1024 * 1024)))

In [None]:
# Box plot of the per-fold accuracies, one box per classifier.
fig = plt.figure()
fig.suptitle('Comparaison des classifieurs')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(abrevs)

# Retain the classifier with the highest mean accuracy (first one on a tie).
# clfMethod is only assigned inside the loop, i.e. once a mean above 0 is seen.
clfName = ""
bestParam = ""
bestmean = 0
for name,model,param,mean in scores:
    if mean > bestmean:
        clfName = name
        clfMethod = model
        bestParam = param
        bestmean = mean

msg = "Meilleur résultat: %s(%s) score:%.3f%%" %(clfName, bestParam, bestmean*100)
print(msg)

## Evaluation finale avec l'ensemble des meilleurs choix retenus

In [None]:
def perform_final():
    """Re-run the whole pipeline with the retained settings and report it.

    The function reads the notebook-level settings ``dataFile``,
    ``XColumnName``, ``yColumnName``, ``StopWords``, ``Lemmatiz``,
    ``includeExtra``, ``methodResampl``, ``clfName``, ``clfMethod`` and
    ``bestParam``. It reloads the dataset, cleans it, adds the extras,
    resamples it, vectorizes it with TF-IDF and evaluates the classifier
    with 5-fold cross-validated predictions.

    The value written to the log is the accuracy of those predictions. The
    previous code logged ``cross_val_predict(...).mean()``, i.e. the mean of
    the predicted labels, as the score.
    """
    t0 = time()
    dfOrigin = pd.read_csv(dataFile, sep='\t')

    # ! required for the mixture dataset, keep commented out for truefalse !
    #dfOrigin.loc[dfOrigin[yColumnName] == -1, yColumnName] = 1

    # Text preparation with the retained options.
    dfAjout = addExtras(dfOrigin, XColumnName, includeExtra, StopWords, Lemmatiz)
    claimsMerge = merge_tokens(dfAjout.copy())
    dfNext = NextDf(dfOrigin)
    finalColumn = pd.DataFrame(claimsMerge)
    finalColumn.columns = [XColumnName]
    dfNext.update(finalColumn)
    dfFinal = resampleDf(dfNext, yColumnName, methodResampl)

    # Features and labels.
    vectorizerT = TfidfVectorizer(min_df=2)
    vectorT = vectorizerT.fit_transform(dfFinal[XColumnName].copy())
    X = vectorT.toarray()
    y = dfFinal[yColumnName].copy()

    # Out-of-fold predictions of the retained classifier.
    kfold = KFold(n_splits=5, shuffle=True, random_state=3)
    clfMethod.set_params(**bestParam)
    y_pred = cross_val_predict(clfMethod, X, y, cv=kfold)

    accuracy = accuracy_score(y, y_pred)
    precision, recall, fscore, support = score(y, y_pred, average='macro')

    log_results(dataFile, StopWords, Lemmatiz, includeExtra, methodResampl,
                  clfName, bestParam, accuracy, time() - t0)

    print("Accuracy: %.3f%%" % (accuracy * 100.0))
    print("F1-score moyen: %.3f%%\n" % (fscore * 100.0))
    print ('Matrice de confusion:\n', confusion_matrix(y, y_pred),'\n')
    print (classification_report(y, y_pred))


In [None]:
# Print every retained setting in a form that can be pasted back into a cell
# (see the saved configurations at the end), then run the final evaluation.
print("dataFile= '%s'" % (dataFile)) # data-truefalse-1.csv data-truefalse-2.csv data-truefalse-3.csv data-truefalse.csv
print("XColumnName= '%s'" % (XColumnName))
print("yColumnName= '%s'" % (yColumnName))
print("StopWords=",StopWords) # False True
print("Lemmatiz=",Lemmatiz)  # False True
print("includeExtra= '%s'" % (includeExtra)) # addnone addauthor addall
print("methodResampl= '%s'" % (methodResampl)) # noresampling downsampling upsampling downresampl upresampl
print("clfName= '%s'" % (clfName))
print("clfMethod= ",clfMethod)
print("bestParam= ",bestParam,"\n")
perform_final()

### Save des datas

In [None]:
# Save the retained (cleaned and resampled) dataframe as a tab-separated file.
dfFinal.to_csv('result.csv',sep='\t', index=False)
print("Save to CSV")

## Fin

In [None]:
# Always fails: presumably a deliberate stop so that a "run all" halts here
# and the saved-configuration cells below are only executed by hand.
assert False, "breakpoint"

#### Sauvegarde des resultats

In [None]:
# Saved configuration for the true/false dataset: sets the notebook-level
# settings read by perform_final, then re-runs the final evaluation.
dataFile= 'data-truefalse.csv'
XColumnName= 'claimReview_claimReviewed'
yColumnName= 'true_false_mixture'
StopWords= False
Lemmatiz= False
includeExtra= 'addauthor'
methodResampl= 'upresampl'
clfName= 'LogisticRegression'
clfMethod= LogisticRegression()
bestParam= {'C': 8, 'max_iter': 500}
perform_final() # to re-run
# Score recorded for this configuration:
#Score= 84.014% (0.004)

In [None]:
# Saved configuration for the mixture dataset: sets the notebook-level
# settings read by perform_final, then re-runs the final evaluation.
# NOTE(review): perform_final keeps the "-1 -> 1" label recoding commented
# out although its comment marks it as required for the mixture dataset -
# confirm it is enabled before re-running this cell.
dataFile= 'data-mixture.csv'
XColumnName= 'claimReview_claimReviewed'
yColumnName= 'true_false_mixture'
StopWords= False
Lemmatiz= False
includeExtra= 'addauthor'
methodResampl= 'upresampl'
clfName= 'LogisticRegression'
clfMethod= LogisticRegression()
bestParam=  {'C': 4, 'max_iter': 500}
perform_final() # to re-run
# Score recorded for this configuration:
#Score= 77.407% (0.002)