# Supervised Machine Learning Project
## CUSSON Thomas - MARMORET Axel

In [None]:
# All the needed imports
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
import random
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

# The different algorithms
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Shared accumulator: later cells append the best score of each tuned model to it
listResults = list()

## Load the data

In [None]:
# Build reviewList as a list of (review_text, label) tuples read from disk.
reviewList = []

mydir = "../Datasets/petit/txt_sentoken/" # Could need to change the path

# Positive reviews are loaded first, then negative ones.
# Labels follow the convention used by the rest of the notebook: pos -> 0, neg -> 1.
for subdir, label in (("pos/", 0), ("neg/", 1)):
    class_dir = mydir + subdir
    for txt in listdir(class_dir):
        path = join(class_dir, txt)
        # Skip sub-directories and anything else that is not a regular file
        if isfile(path):
            # The context manager closes the handle even if reading fails
            with open(path, "r") as review:
                # Flatten the review into a single line of text
                reviews = " ".join(review.readlines()).replace("\n", " ")
            reviewList.append((reviews, label))
        

## Pre-format the data

In [None]:
# Shuffle the positive and negative examples (in place, unseeded: the split
# differs from one run to the next)
random.shuffle(reviewList)

# Use first 70% for training, the remaining 30% for testing
train_size = int(0.7 * len(reviewList))
train_set, test_set = reviewList[:train_size], reviewList[train_size:]

# NOTE(review): vect_sw is not referenced by any other cell visible here; the
# pipelines below build their own CountVectorizer. Kept in case it is used elsewhere.
vect_sw = CountVectorizer(stop_words='english', min_df=25, analyzer = "word", ngram_range = (1, 1)) # Try different min_df and max_df values

# Separate the (text, label) pairs into parallel lists of texts and labels
train_set_unlabeled = [text for text, _ in train_set]
train_labels = [label for _, label in train_set]
test_set_unlabeled = [text for text, _ in test_set]
test_labels = [label for _, label in test_set]

# Machine Learning algorithms

## Dummy Classifier (as a comparison tool)

In [None]:
# Baseline: a classifier that ignores the text and draws labels at random
# according to the class distribution seen during training.
from sklearn.metrics import accuracy_score

dummy = DummyClassifier(strategy='stratified', random_state=None)

dummy = dummy.fit(train_set_unlabeled, train_labels)

pred = dummy.predict(test_set_unlabeled)

# Score the predictions that were actually made. DummyClassifier.score(X, y)
# expects the samples as its first argument and would draw a fresh set of random
# predictions, so passing `pred` to it does not measure `pred`.
print("Score : " + str(accuracy_score(test_labels, pred)))

## Naive Bayes

### Build a pipeline with some parameters

In [None]:
# Naive Bayes pipeline: English stop-word-filtered word counts -> tf(-idf)
# weighting -> multinomial Naive Bayes.
# (min_df=25 was tried on the vectorizer and is currently disabled.)
pipeline_nb = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer="word", stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
])

# Hyper-parameter grid explored by the search. Keys are '<step>__<parameter>'.
# A 'vect__min_df' search over range(14, 19) is left out for now; according to
# the original note, searching range(1, 51, 5) gave 16.
parameters_nb = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__use_idf=(True, False),
    nb__alpha=(1e-1, 1e-2),
)

# n_jobs=-1 is passed so the candidates can be evaluated in parallel
GridSearchNaiveBayse = GridSearchCV(estimator=pipeline_nb, param_grid=parameters_nb, n_jobs=-1)

### Run the pipeline!

In [None]:
# Run the grid search on the training reviews and keep the fitted search object
GridSearchNaiveBayse = GridSearchNaiveBayse.fit(X=train_set_unlabeled, y=train_labels)

# Predict the labels of the held-out reviews; as the last expression of the
# notebook cell, the predictions are shown as the cell output and not stored
GridSearchNaiveBayse.predict(X=test_set_unlabeled)

### Best accuracy score

In [None]:
# Best score found by the Naive Bayes grid search
nb_best_score = GridSearchNaiveBayse.best_score_
print("Best Score : " + str(nb_best_score))

# Record it for the final comparison between models
listResults.append(nb_best_score)

### Analysis of the results: searching for the best parameters

In [None]:
# Print the winning value of every searched parameter, in alphabetical order
nb_best_params = GridSearchNaiveBayse.best_params_
for param_name in sorted(parameters_nb):
    print("%s: %r" % (param_name, nb_best_params[param_name]))

# Show the 'nb' step of the best pipeline found
nb_best_step = GridSearchNaiveBayse.best_estimator_.get_params()["nb"]
print(" Best Estimator : " + str(nb_best_step))

## SVM

In [None]:
# SVM pipeline: English stop-word-filtered word counts (min_df=25) ->
# tf(-idf) weighting -> support vector classifier.
pipeline_svm = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer="word", stop_words='english', min_df=25)),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC()),
])

# Hyper-parameter grid explored by the search. Keys are '<step>__<parameter>'.
parameters_svm = dict(
    tfidf__use_idf=(True, False),
    vect__ngram_range=[(1, 1), (1, 2)],
    svc__kernel=['linear', 'rbf'],
    svc__gamma=[0.1, 0.01],
    svc__C=[1, 10, 100],
)

GridSearchSVM = GridSearchCV(estimator=pipeline_svm, param_grid=parameters_svm, n_jobs=-1)

# Run the grid search on the training reviews
GridSearchSVM.fit(train_set_unlabeled, train_labels)

# Predict the held-out reviews (the result is not stored)
GridSearchSVM.predict(test_set_unlabeled)

# Report the best score and record it for the final comparison
svm_best_score = GridSearchSVM.best_score_
print("Best Score : " + str(svm_best_score))
listResults.append(svm_best_score)

# Print the winning value of every searched parameter, in alphabetical order
svm_best_params = GridSearchSVM.best_params_
for param_name in sorted(parameters_svm):
    print("%s: %r" % (param_name, svm_best_params[param_name]))

# Show the 'svc' step of the best pipeline found
svm_best_step = GridSearchSVM.best_estimator_.get_params()["svc"]
print(" Best Estimator : " + str(svm_best_step))

## Random Forest 

In [None]:
# Random Forest pipeline: English stop-word-filtered word counts (min_df=25) ->
# tf(-idf) weighting -> random forest classifier.
rf_vectorizer = CountVectorizer(analyzer="word", min_df=25, stop_words='english')
pipeline_rf = Pipeline(steps=[
    ('vect', rf_vectorizer),
    ('tfidf', TfidfTransformer()),
    ('rf', RandomForestClassifier()),
])

# Hyper-parameter grid explored by the search. Keys are '<step>__<parameter>'.
parameters_rf = dict(
    tfidf__use_idf=(True, False),
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    rf__n_estimators=[100, 1000, 2000],
)

GridSearchRF = GridSearchCV(estimator=pipeline_rf, param_grid=parameters_rf, n_jobs=-1)

# Run the grid search on the training reviews
GridSearchRF.fit(train_set_unlabeled, train_labels)

# Predict the held-out reviews (the result is not stored)
GridSearchRF.predict(test_set_unlabeled)

# Report the best score and record it for the final comparison
rf_best_score = GridSearchRF.best_score_
print("Best Score : " + str(rf_best_score))
listResults.append(rf_best_score)

# Print the winning value of every searched parameter, in alphabetical order
rf_best_params = GridSearchRF.best_params_
for param_name in sorted(parameters_rf):
    print("%s: %r" % (param_name, rf_best_params[param_name]))

# Show the 'rf' step of the best pipeline found
rf_best_step = GridSearchRF.best_estimator_.get_params()["rf"]
print(" Best Estimator : " + str(rf_best_step))

## Gradient Boosting

In [None]:
# Gradient Boosting pipeline: English stop-word-filtered word counts (min_df=25)
# -> tf(-idf) weighting -> gradient boosting classifier.
pipeline_gb = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=25, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('gb', GradientBoostingClassifier()),
                    ])

# Hyper-parameter grid explored by the search. Keys are '<step>__<parameter>'.
parameters_gb = {'tfidf__use_idf': (True, False),
                 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'gb__learning_rate': [1e-1, 1e-2, 1e-3],
                 'gb__n_estimators': [100, 1000, 10000],
}

# The dangling `scoring=` was a syntax error; leaving it out keeps the default
# scoring, like the other grid searches in this notebook.
GridSearchGB = GridSearchCV(pipeline_gb, parameters_gb, n_jobs=-1)

GridSearchGB.fit(train_set_unlabeled, train_labels)

# Predict the held-out reviews (the result is not stored)
GridSearchGB.predict(test_set_unlabeled)

# Report the Gradient Boosting results (the original lines read the Random
# Forest search by mistake) and record the score like the other models do.
print("Best Score : " + str(GridSearchGB.best_score_))

listResults.append(GridSearchGB.best_score_)

# Print the winning value of every searched parameter, in alphabetical order
for param_name in sorted(parameters_gb.keys()):
    print("%s: %r" % (param_name, GridSearchGB.best_params_[param_name]))

print(" Best Estimator : " + str(GridSearchGB.best_estimator_.get_params()["gb"]))

# Compare the results

In [None]:
# Order the collected scores from best to worst, keeping the same list object
listResults[:] = sorted(listResults, reverse=True)

In [None]:
# Bare expression: in a notebook cell this shows the collected scores as the cell output
listResults

In [None]:
# Test-set evaluation and plots.
# The original cell referenced names that were never defined (metrics, y_test,
# y_pred_class, y_pred_prob, y, predictions, prediction) and used plt before
# importing it. It is wired here to the fitted Naive Bayes search and the test split.
# NOTE(review): Naive Bayes is used because its pipeline exposes predict_proba;
# swap in another fitted search to evaluate a different model (SVC() as
# configured above would not provide probabilities).
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn import metrics
from sklearn.metrics import recall_score

# TODO: add precision at a minimum (recall is printed at the end of this cell)

prediction = GridSearchNaiveBayse.predict(test_set_unlabeled)
# Column 1 holds the predicted probability of the class labelled 1
y_pred_prob = GridSearchNaiveBayse.predict_proba(test_set_unlabeled)[:, 1]

# Confusion matrix (printed, since a bare expression in the middle of a cell is not shown)
print(metrics.confusion_matrix(test_labels, prediction))

# ROC AUC
print(metrics.roc_auc_score(test_labels, y_pred_prob))

# TODO: see the instructor's section "Examining a model for further insight", e.g. tokenisation

# TODO: plot the results (for example a histogram with all the scores)

skplt.metrics.plot_confusion_matrix(test_labels, prediction, normalize=True)
plt.show()


plt.plot(listResults)
plt.ylabel('some numbers')
plt.show()

print(recall_score(test_labels, prediction, average='macro'))