In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
import re
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Empty corpus table with one column per piece of information:
# file name, author code and raw text.
col = [
    "File_Name",
    "Auteur",
    "Texte",
]
df = pd.DataFrame(data=None, columns=col)
print(df)

In [None]:
# Directory that holds the training texts, relative to the working directory.
# os.path.join picks the separator of the current OS (the previous hard-coded
# "\\" only worked on Windows). The trailing "" keeps a trailing separator so
# that existing `chemin + file_name` concatenations keep working.
chemin = os.path.join(os.getcwd(), "training_data", "")

In [None]:
# Read every file of the training directory into memory.
datas = []       # raw content of each file
file_name = []   # matching file names, in the same order as `datas`

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the later train/test split) reproducible across machines.
for file in sorted(os.listdir(chemin)):
    file_dir = os.path.join(chemin, file)
    if not os.path.isfile(file_dir):
        # Skip sub-directories and other non-file entries.
        continue
    # Explicit encoding instead of the platform default; the `with` block
    # closes the file, so no manual close() is needed.
    # NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
    with open(file_dir, 'r', encoding='utf-8') as f:
        datas.append(f.read())
    file_name.append(file)

# Show a short summary rather than dumping the whole corpus in the output.
print(len(datas), "files loaded")
print(file_name)
df = pd.DataFrame({"Texte": [], "File_Name": []})


In [None]:
# Copy the lists built while reading the files into the corpus table,
# one column at a time (texts first, then the file names).
for column_name, column_values in (("Texte", datas), ("File_Name", file_name)):
    df[column_name] = column_values
print(df)


In [None]:
# Author code = first run of three capital letters found in the file name.
# (The previously defined `pattern` string was never used and has been dropped.)
author_code = re.compile(r'[A-Z]{3}')

search = []
for values in df["File_Name"]:
    match = author_code.search(values)
    # A file name without a three-letter code yields None instead of crashing
    # on `.group()`; such rows end up with a missing author.
    search.append(match.group() if match else None)
df['Auteur'] = search


In [None]:
# Lightest pretrained model, sufficient here. TODO: check whether the md or lg
# model gives a better processing-time / accuracy ratio.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Tokenise, lemmatise and POS-tag every text in a single spaCy pass.
# TODO (from the original note): do not forget to handle the stop words.
tokens = []
lemma = []
pos = []
stop_words = []
ent = []

# `n_threads` was dropped: it is deprecated since spaCy 2.1 and no longer
# accepted by spaCy 3. `astype(str)` replaces the 'unicode' dtype alias.
for doc in nlp.pipe(df['Texte'].astype(str).values, batch_size=50):
    # Doc.is_parsed is deprecated in spaCy 3 in favour of has_annotation("DEP");
    # support both so the cell runs on either major version.
    if hasattr(doc, "has_annotation"):
        parsed = doc.has_annotation("DEP")
    else:
        parsed = doc.is_parsed

    if parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
        stop_words.append([n.text for n in doc if not n.is_stop])
        ent.append([e.label_ for e in doc.ents])
    else:
        # Add placeholders on failure so every list keeps one entry per row.
        # `tokens` was previously skipped here, which made the column
        # assignment below fail with a length mismatch.
        tokens.append(None)
        lemma.append(None)
        pos.append(None)
        stop_words.append(None)
        ent.append(None)

df['Tokens'] = tokens
df['Tokens_NoStopW'] = stop_words
df['lemma'] = lemma
df['PartOfSpeech'] = pos
df['ent'] = ent

DEBUT DE L'EXPLORATION

In [None]:
# Group the corpus by author code and preview the per-author summary.
auteur = df.groupby(by="Auteur")
auteur.describe().head(n=5)


Comptage des occurrences des mots :

In [None]:
# Build one cleaned string per document from its lemmas.
# The previous iterrows() loop assigned to a per-row copy, so the column was
# never actually filled. Rows without a lemma list get an empty string.
df['clean_text'] = df['lemma'].apply(
    lambda lemmas: ' '.join(lemmas) if isinstance(lemmas, (list, tuple)) else ''
)

# Remove the "-PRON-" token and every non-word character (whitespace
# included). The old pattern "[PRON\s\W]" was a character class and therefore
# deleted every capital P, R, O and N from the text.
# NOTE(review): "-PRON-" is presumably the pronoun lemma placeholder emitted by
# spaCy 2 - confirm against the installed spaCy version.
df['clean_text'] = df['clean_text'].str.replace(r"-PRON-|\W", " ", regex=True)
# Collapse runs of spaces and trim both ends.
df['clean_text'] = df['clean_text'].str.replace(r" +", " ", regex=True).str.strip()

# Numeric label for each author code (unknown codes become NaN).
df['Auteur_number'] = df['Auteur'].map({'EAP': 1, 'HPL': 2, 'MWS': 3})


In [None]:
# Narrow the table to the numeric label, the cleaned text and the entity labels.
df_finale = df.loc[:, ['Auteur_number', 'clean_text', 'ent']]

In [None]:
# Author codes, in the same order as their numeric labels 1, 2, 3.
auteurs = ['EAP', 'HPL', 'MWS']

# Features are the raw texts; the target is the numeric author label.
X = df['Texte']
y = df['Auteur_number']
print(X)
print(y)

# Hold out 30 % of the documents for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [None]:
# Turn the texts into TF-IDF vectors over word 1- to 3-grams.
# The vocabulary and IDF weights are learned from the training split only:
# fitting on train + test leaks information about the test documents.
# Boolean options are passed as real booleans; recent scikit-learn versions
# validate parameter types and reject the integer 1.
tfv = TfidfVectorizer(min_df=3, analyzer='word', ngram_range=(1, 3),
                      use_idf=True, smooth_idf=True, sublinear_tf=True)
X_train_tfv = tfv.fit_transform(X_train)
X_test_tfv = tfv.transform(X_test)

# Same n-gram range, but with raw occurrence counts instead of TF-IDF weights.
ctv = CountVectorizer(analyzer='word', ngram_range=(1, 3))
X_train_ctv = ctv.fit_transform(X_train)
X_test_ctv = ctv.transform(X_test)



In [None]:
# Multinomial Naive Bayes on the n-gram counts.
nb = MultinomialNB()
nb.fit(X_train_ctv, y_train)
y_pred_NB = nb.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_NB))
print(classification_report(y_test, y_pred_NB, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(nb, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Linear support vector classifier on the n-gram counts.
svc = LinearSVC()
svc.fit(X_train_ctv, y_train)
y_pred_svc = svc.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(svc, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Gradient boosting (10 trees of depth 6, fixed seed) on the n-gram counts.
gb = GradientBoostingClassifier(max_depth=6, n_estimators=10, random_state=2)
gb.fit(X_train_ctv, y_train)
y_pred_boost = gb.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_boost))
print(classification_report(y_test, y_pred_boost, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(gb, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# 1-nearest-neighbour classifier on the n-gram counts.
kn = KNeighborsClassifier(n_neighbors=1)
kn.fit(X_train_ctv, y_train)
y_pred_kn = kn.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_kn))
print(classification_report(y_test, y_pred_kn, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(kn, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Linear classifier trained with stochastic gradient descent (SGDClassifier
# defaults). The seed is fixed because SGD shuffles the data, so results would
# otherwise change from run to run.
sv = SGDClassifier(random_state=0)
sv.fit(X_train_ctv, y_train)
y_pred_svm = sv.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(sv, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Bernoulli Naive Bayes on the n-gram counts.
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_ctv, y_train)
y_pred_bnb = bnb.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_bnb))
print(classification_report(y_test, y_pred_bnb, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(bnb, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.naive_bayes import CategoricalNB
# NOTE(review): CategoricalNB is imported but the estimator actually fitted is
# an SGDClassifier, making this cell a repeat of the SGD experiment - confirm
# which model was intended. The variable names are kept for compatibility.
# The seed is fixed because SGD shuffles the data between epochs.
cnb = SGDClassifier(random_state=0)
cnb.fit(X_train_ctv, y_train)
y_pred_cnb = cnb.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_cnb))
print(classification_report(y_test, y_pred_cnb, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(cnb, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Random forest (1000 trees) on the n-gram counts.

from sklearn.ensemble import RandomForestClassifier
# min_samples_leaf is given as a float, i.e. a fraction of the training samples.
# The seed is fixed so that the bootstrap/feature sampling is reproducible.
rf = RandomForestClassifier(n_estimators=1000, min_samples_leaf=0.0001, random_state=0)
rf.fit(X_train_ctv, y_train)
y_pred_rfo = rf.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_rfo))
print(classification_report(y_test, y_pred_rfo, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(rf, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Logistic regression (default settings) on the n-gram counts.
lr = LogisticRegression()
lr.fit(X_train_ctv, y_train)
y_pred_lrg = lr.predict(X_test_ctv)
# scikit-learn metrics take (y_true, y_pred), in that order.
print('accuracy %s' % accuracy_score(y_test, y_pred_lrg))
print(classification_report(y_test, y_pred_lrg, target_names=auteurs))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(lr, X_train_ctv, y_train, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
import xgboost as xgb

# XGBoost expects class labels 0..n_classes-1 (recent versions raise on 1..3),
# so the author ids are shifted down by one for fitting and shifted back up
# for reporting.
y_train_xgb = y_train - 1

# `n_jobs` replaces the deprecated `nthread` argument.
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, n_jobs=10, learning_rate=0.1)
clf.fit(X_train_ctv.tocsc(), y_train_xgb)

# predict() already returns class labels; only the +1 shift is needed to get
# back to the original 1..3 author ids.
y_pred = clf.predict(X_test_ctv) + 1
predictions = [int(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Cross-validate on the training split: the held-out test set must not be used
# to fit models, otherwise it no longer measures generalisation.
scores = cross_val_score(clf, X_train_ctv, y_train_xgb, cv=10)
print("Cross Val Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))