# Supervised Machine Learning Project
## CUSSON Thomas - MARMORET Axel

In [None]:
# All the needed imports
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
import random
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
import scikitplot as skplt

# The different algorithms
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Some important variables
# Module-level list that the final "Compare the results" cells sort and display.
# NOTE(review): no visible cell appends anything to it — confirm where the
# per-model results are supposed to be collected.
listResults = []

## Load the data

In [None]:
# Every review is stored as a (text, label) tuple.
# Label convention used throughout the notebook: 0 = positive, 1 = negative.
reviewList = []

mydir = "../Datasets/petit/txt_sentoken/" # Could need to change the path

# Load the positive examples, then the negative ones, with one shared loop
for subdir, label in (("pos/", 0), ("neg/", 1)):
    class_dir = mydir + subdir
    for txt in listdir(class_dir):
        path = join(class_dir, txt)
        if isfile(path):
            # The context manager closes the file handle as soon as it is read
            with open(path, "r") as review:
                # Flatten the review onto a single line
                reviews = " ".join(review.readlines()).replace("\n", " ")
            reviewList.append((reviews, label))
        

## Pre-format the data

In [None]:
# Shuffle the positive and negative examples so that the positional split
# below mixes both labels. No seed is set, so the split differs on every run.
random.shuffle(reviewList)

# Use first 70% for training, the remaining 30% for testing
train_size = int(0.7 * len(reviewList))
train_set, test_set = reviewList[:train_size], reviewList[train_size:]

# Separate the (text, label) pairs into parallel lists of texts and labels
train_set_unlabeled = [text for text, _ in train_set]
train_labels = [label for _, label in train_set]
test_set_unlabeled = [text for text, _ in test_set]
test_labels = [label for _, label in test_set]

# Machine Learning algorithms

## Naive Bayes

### Build a pipeline with some parameters

In [None]:
# Naive Bayes pipeline: word counts -> TF-IDF weighting -> multinomial NB
pipeline_nb = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', min_df=15, analyzer="word")),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
])

# Note: min_df used to be part of the grid search in order to find its best value.
# The search returned 16 over range(1, 51, 5) and 15 over range(13, 19).
# Searching it made the computation time explode, so it is now fixed at 15
# to keep the run time reasonable.

# Hyper-parameter values explored by the grid search
parameters_nb = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__use_idf=(True, False),
    nb__alpha=(1e-1, 1e-2),
)

# 3-fold cross-validated search over the grid above, using every available core
GridSearchNaiveBayse = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=parameters_nb,
    n_jobs=-1,
    cv=3,
)

### Run the pipeline !

In [None]:
# Run the grid search on the training reviews; the value returned by fit()
# is re-bound to the same name.
GridSearchNaiveBayse = GridSearchNaiveBayse.fit(train_set_unlabeled, train_labels)

### Analysis of the results : searching for the best parameters

In [None]:
# Report the selected value of every hyper-parameter that was searched
for param_name in sorted(parameters_nb):
    best_value = GridSearchNaiveBayse.best_params_[param_name]
    print("{}: {!r}".format(param_name, best_value))

# Final classifier step of the best pipeline
best_nb_step = GridSearchNaiveBayse.best_estimator_.get_params()["nb"]
print("Best Estimator : " + str(best_nb_step))

# Cross-validation score reached by the best parameter combination
print("Best Score on those conditions : " + str(GridSearchNaiveBayse.best_score_))

In [None]:
# Best pipeline selected by the Naive Bayes grid search
naive = GridSearchNaiveBayse.best_estimator_

# Predicted labels for the held-out test reviews
test_bayse_prediction = naive.predict(test_set_unlabeled)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed recall_score returns the precision
# and precision_score returns the recall.
# Recall/precision are computed for the default pos_label=1, which in this
# notebook's encoding is the "negative review" class.
print("Accuracy : " + str(metrics.accuracy_score(test_labels, test_bayse_prediction)))
print("Recall : " + str(metrics.recall_score(test_labels, test_bayse_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_bayse_prediction)))

# Row-normalised confusion matrix (true labels first, predictions second)
skplt.metrics.plot_confusion_matrix(test_labels, test_bayse_prediction, normalize=True)
plt.show()
# If this crashes, install matplotlib (https://scikit-plot.readthedocs.io/en/stable/Quickstart.html)

In [None]:
# roc_auc_score takes (y_true, y_score). The score should be a ranking value,
# so use the predicted probability of label 1 (second column, labels being
# 0 and 1) instead of the hard 0/1 predictions.
test_bayse_scores = naive.predict_proba(test_set_unlabeled)[:, 1]
print("ROC_Curve : " + str(metrics.roc_auc_score(test_labels, test_bayse_scores)))

#plt.hist([((metrics.roc_auc_score(test_labels, test_bayse_scores))), ((metrics.recall_score(test_labels, test_bayse_prediction)))])
#plt.ylabel('some numbers')
#plt.show()

In [None]:
# Fitted steps of the best Naive Bayes pipeline
vectorizer = naive.named_steps["vect"]
nb_model = naive.named_steps["nb"]

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Per-class accumulated feature values seen during fitting.
# Row 0 is label 0 (positive reviews), row 1 is label 1 (negative reviews).
# The features go through TfidfTransformer first, so these are summed weights
# rather than raw occurrence counts.
pos_word_count = nb_model.feature_count_[0, :]
neg_word_count = nb_model.feature_count_[1, :]

words_frequencies = pd.DataFrame(
    {"word": words, "positive": pos_word_count, "negative": neg_word_count}
).set_index("word")
words_frequencies.head()

# Words carrying the most weight in positive reviews first
words_frequencies.sort_values('positive', ascending=False)

## SVM

### Same method as for Naive Bayes

In [None]:
# SVM pipeline: word counts -> TF-IDF weighting -> support vector classifier
pipeline_svm = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', min_df=25, analyzer="word")),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC()),
])

# Hyper-parameter values explored by the grid search.
# NOTE(review): gamma has no effect with the linear kernel, so the linear
# combinations are fitted once per gamma value for nothing — consider
# splitting the grid per kernel.
parameters_svm = dict(
    tfidf__use_idf=(True, False),
    vect__ngram_range=[(1, 1), (1, 2)],
    svc__kernel=['linear', 'rbf'],
    svc__gamma=[0.1, 0.01],
    svc__C=[1, 10, 100],
)

# Cross-validated search over the grid above, using every available core
GridSearchSVM = GridSearchCV(estimator=pipeline_svm, param_grid=parameters_svm, n_jobs=-1)

# Run the search on the training reviews
GridSearchSVM.fit(train_set_unlabeled, train_labels)

In [None]:
# Get the best parameters for the SVM
for param_name in sorted(parameters_svm.keys()):
    print("%s: %r" % (param_name, GridSearchSVM.best_params_[param_name]))

print(" Best Estimator : " + str(GridSearchSVM.best_estimator_.get_params()["svc"]))

# Best pipeline selected by the grid search, evaluated on the held-out reviews
svm = GridSearchSVM.best_estimator_

test_svm_prediction = svm.predict(test_set_unlabeled)

# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# reversed, recall_score returns the precision and precision_score the recall.
# Recall/precision are computed for the default pos_label=1, which in this
# notebook's encoding is the "negative review" class.
print("Accuracy : " + str(metrics.accuracy_score(test_labels, test_svm_prediction)))
print("Recall : " + str(metrics.recall_score(test_labels, test_svm_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_svm_prediction)))

# Row-normalised confusion matrix (true labels first, predictions second)
skplt.metrics.plot_confusion_matrix(test_labels, test_svm_prediction, normalize=True)
plt.show()

## Random Forest 

In [None]:
# Random forest pipeline: word counts -> TF-IDF weighting -> random forest
pipeline_rf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', min_df=25, analyzer="word")),
    ('tfidf', TfidfTransformer()),
    ('rf', RandomForestClassifier()),
])

# Hyper-parameter values explored by the grid search
parameters_rf = dict(
    tfidf__use_idf=(True, False),
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    rf__n_estimators=[100, 1000, 2000],
)

# Cross-validated search over the grid above, using every available core
GridSearchRF = GridSearchCV(estimator=pipeline_rf, param_grid=parameters_rf, n_jobs=-1)

# Run the search on the training reviews
GridSearchRF.fit(train_set_unlabeled, train_labels)

In [None]:
# Report the selected value of every random forest hyper-parameter searched
for param_name in sorted(parameters_rf):
    best_value = GridSearchRF.best_params_[param_name]
    print("{}: {!r}".format(param_name, best_value))

# Final classifier step of the best pipeline
best_rf_step = GridSearchRF.best_estimator_.get_params()["rf"]
print(" Best Estimator : " + str(best_rf_step))

In [None]:
# Best pipeline selected by the grid search, evaluated on the held-out reviews
rf = GridSearchRF.best_estimator_

test_rf_prediction = rf.predict(test_set_unlabeled)

# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# reversed, recall_score returns the precision and precision_score the recall.
# Recall/precision are computed for the default pos_label=1, which in this
# notebook's encoding is the "negative review" class.
print("Accuracy : " + str(metrics.accuracy_score(test_labels, test_rf_prediction)))
print("Recall : " + str(metrics.recall_score(test_labels, test_rf_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_rf_prediction)))

# Row-normalised confusion matrix (true labels first, predictions second)
skplt.metrics.plot_confusion_matrix(test_labels, test_rf_prediction, normalize=True)
plt.show()

## Gradient Boosting

In [None]:
# Gradient boosting pipeline: word counts -> TF-IDF weighting -> gradient boosting
pipeline_gb = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=25, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('gb', GradientBoostingClassifier()),
                    ])

# Hyper-parameter values explored by the grid search
parameters_gb = {'tfidf__use_idf': (True, False),
                 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'gb__learning_rate': [1e-1, 1e-2, 1e-3],
                 'gb__n_estimators': [100, 1000, 10000],
}

# The dangling "scoring=" keyword was a syntax error. It is dropped so that this
# search uses the default scoring, like the other grid searches in the notebook.
GridSearchGB = GridSearchCV(pipeline_gb, parameters_gb, n_jobs=-1)

# Run the search on the training reviews
GridSearchGB.fit(train_set_unlabeled, train_labels)

In [None]:
# Report the selected value of every gradient boosting hyper-parameter searched.
# This must read the gradient boosting grid and search object: the random
# forest ones have no "gb" step, so looking it up there raised a KeyError.
for param_name in sorted(parameters_gb.keys()):
    print("%s: %r" % (param_name, GridSearchGB.best_params_[param_name]))

print(" Best Estimator : " + str(GridSearchGB.best_estimator_.get_params()["gb"]))

In [None]:
# Best gradient boosting pipeline selected by the grid search,
# evaluated on the held-out reviews
gb = GridSearchGB.best_estimator_

test_gb_prediction = gb.predict(test_set_unlabeled)

# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# reversed, recall_score returns the precision and precision_score the recall.
# Recall/precision are computed for the default pos_label=1, which in this
# notebook's encoding is the "negative review" class.
print("Accuracy : " + str(metrics.accuracy_score(test_labels, test_gb_prediction)))
print("Recall : " + str(metrics.recall_score(test_labels, test_gb_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_gb_prediction)))

# Row-normalised confusion matrix (true labels first, predictions second)
skplt.metrics.plot_confusion_matrix(test_labels, test_gb_prediction, normalize=True)
plt.show()

# Compare the results

In [None]:
# Sort the collected results in place, in descending order.
# NOTE(review): no visible cell appends to listResults, so this appears to
# sort an empty list — confirm where the results should be collected.
listResults.sort(reverse=True)

In [None]:
# Bare expression: shows the (sorted) results list as the cell output
listResults

In [None]:
# Add precision and recall at the very least

# Confusion matrix
# NOTE(review): y_test, y_pred_class and y_pred_prob are not defined in any
# visible cell, so the two calls below would raise NameError as written —
# they look like placeholders; confirm which model's outputs are intended.
metrics.confusion_matrix(y_test, y_pred_class)

# ROC
metrics.roc_auc_score(y_test, y_pred_prob)

# See the teacher's section "Examining a model for further insight", e.g. tokenisation

# Plot the results (for example a histogram with all the scores)
