# Supervised Machine Learning Project
## CUSSON Thomas - MARMORET Axel

In [None]:
# All the needed imports
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
import random
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.model_selection import KFold

# The different algorithms
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## Load the data

In [None]:
# Reviews are collected as (review_text, label) tuples:
# label 0 marks a positive review, label 1 a negative one.
reviewList = []

mydir = "../Datasets/petit/txt_sentoken/" # Could need to change the path

# Load the positive examples first, then the negative examples
for subdir, label in (("pos/", 0), ("neg/", 1)):
    class_dir = mydir + subdir
    # listdir() returns entries in arbitrary order; sorting them keeps the
    # seeded shuffle applied to reviewList reproducible across machines.
    for txt in sorted(listdir(class_dir)):
        path = join(class_dir, txt)
        if not isfile(path):
            continue
        # The context manager closes the file as soon as its text is read
        with open(path, "r") as review:
            # Flatten the file into one line of text
            reviews = " ".join(review.readlines()).replace("\n", " ")
        reviewList.append((reviews, label))
        

## Pre-format the data

In [None]:
# Mix the positive and negative reviews; the fixed seed makes the order repeatable
random.seed(42)
random.shuffle(reviewList)

# The first 70% of the shuffled reviews are used for training, the rest for testing
train_size = int(0.7 * len(reviewList))
train_set = reviewList[:train_size]
test_set = reviewList[train_size:]

# Separate every (text, label) pair into parallel lists of texts and labels
train_set_unlabeled = [text for text, _ in train_set]
train_labels = [label for _, label in train_set]
test_set_unlabeled = [text for text, _ in test_set]
test_labels = [label for _, label in test_set]

# Machine Learning algorithms

## Naive Bayes

### Test different parameters

In [None]:
# Pipeline of actions specific to the Naive Bayes algorithm:
# word counts -> TF-IDF transformer -> multinomial Naive Bayes
pipeline_nb = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=0.01, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('nb', MultinomialNB()),
                    ])

# NB: We ran our algorithm with min_df as a parameter in order to find its best value.
# It gave us 16 in range(1, 51, 5) and 15 in range(13, 19).
# This parameter exploded our computation time,
# so we decided to fix it at 1% of our training dataset size (14 here) in order to run our algorithms in decent time.

# GridSearch allows us to test several parameters
parameters_nb = {'vect__ngram_range': [(1, 1), (1, 2)],
                 'tfidf__use_idf': (True, False),
                 'nb__alpha': (1e-1, 1e-2),
}

# KFold only accepts a random_state together with shuffle=True (recent
# scikit-learn versions raise a ValueError otherwise). Without shuffling the
# folds are consecutive slices of the training data, so they are already
# deterministic and no random_state is needed.
# return_train_score=True makes "mean_train_score" available in cv_results_.
GridSearchNaiveBayse = GridSearchCV(pipeline_nb, parameters_nb, n_jobs=-1, cv=KFold(n_splits=5), return_train_score=True)

### Run the test pipeline !

In [None]:
# Run the cross-validated search over every parameter combination
GridSearchNaiveBayse = GridSearchNaiveBayse.fit(
    X=train_set_unlabeled,
    y=train_labels,
)

### Results from the different parameters

In [None]:
# cv_results_ gathers the scores recorded for each parameter combination tried
results_naive = GridSearchNaiveBayse.cv_results_

# Preview the first rows of the raw results table
pd.DataFrame(data=results_naive).head(n=5)

In [None]:
# Readable column name -> key of cv_results_ to display under that name
nb_columns = (
    ("rank", 'rank_test_score'),
    ("alpha", "param_nb__alpha"),
    ("tfidf", "param_tfidf__use_idf"),
    ("n_gram", "param_vect__ngram_range"),
    ("mean_test_score", "mean_test_score"),
    ("mean_train_score", "mean_train_score"),
)

# Keep only the columns of interest and index the table by rank
comparison_results = pd.DataFrame({column: results_naive[key] for column, key in nb_columns})
comparison_results = comparison_results.set_index("rank")

# Best-ranked parameter combinations first
comparison_results.sort_values('rank', ascending=True)

### Looking at the best parameters

In [None]:
# Print the winning value of each searched parameter, in alphabetical order
best_params_nb = GridSearchNaiveBayse.best_params_
for name in sorted(parameters_nb):
    print("%s: %r" % (name, best_params_nb[name]))

# The classifier step of the best pipeline
best_nb_step = GridSearchNaiveBayse.best_estimator_.get_params()["nb"]
print("Best Estimator : " + str(best_nb_step))

# Score obtained by the best parameter combination during the search
best_nb_score = GridSearchNaiveBayse.best_score_
print("Best Score on those conditions : " + str(best_nb_score))

### Keeping the best estimator according to the parameters

In [None]:
# Keep the pipeline selected by the grid search (its best_estimator_)
naive = GridSearchNaiveBayse.best_estimator_

### Test the prediction on our test set

In [None]:
# Predict the label of every held-out review with the selected pipeline
test_bayse_prediction = naive.predict(test_set_unlabeled)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but recall and precision trade places when the arguments are
# reversed, so the true labels always go first.
print("Accuracy : " + str(metrics.accuracy_score(test_labels, test_bayse_prediction)))

skplt.metrics.plot_confusion_matrix(test_labels, test_bayse_prediction, normalize=True)
plt.show()

# With the default pos_label=1, recall and precision describe label 1
# (the negative reviews, given how the data was labelled at load time).
print("Recall : " + str(metrics.recall_score(test_labels, test_bayse_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_bayse_prediction)))
# The AUC is computed from the hard 0/1 predictions, not from scores
print("Under ROC curve area : " + str(metrics.roc_auc_score(test_labels, test_bayse_prediction)))

## SVM

### We now use the same operating procedure

In [None]:
# Fitting
# Text pipeline: word counts -> TF-IDF transformer -> support vector classifier
pipeline_svm = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=0.01, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('svc', SVC()),
                    ])

# Parameter values explored by the grid search
parameters_svm = {'tfidf__use_idf': (True, False),
                  'vect__ngram_range': [(1, 1), (1, 2)],
                  'svc__kernel': ['linear','rbf'],
                  'svc__gamma': [0.1, 0.01],
                  'svc__C': [10, 100],
}

# KFold only accepts a random_state together with shuffle=True (recent
# scikit-learn versions raise a ValueError otherwise); unshuffled folds are
# deterministic anyway.
# return_train_score=True is required for "mean_train_score" (read below)
# to be present in cv_results_.
GridSearchSVM = GridSearchCV(pipeline_svm, parameters_svm, n_jobs=-1, cv=KFold(n_splits=5), return_train_score=True)

GridSearchSVM.fit(train_set_unlabeled, train_labels)

results_SVM = GridSearchSVM.cv_results_

# Summary table of the searched parameters and their scores, indexed by rank
comparison_results = pd.DataFrame({"rank":results_SVM['rank_test_score'],
                                  "kernel":results_SVM["param_svc__kernel"],
                                  "gamma":results_SVM["param_svc__gamma"],
                                  "C":results_SVM["param_svc__C"],
                                  "tfidf":results_SVM["param_tfidf__use_idf"],
                                  "n_gram":results_SVM["param_vect__ngram_range"],
                                  "mean_test_score":results_SVM["mean_test_score"],
                                  "mean_train_score":results_SVM["mean_train_score"]}).set_index("rank")

comparison_results.sort_values('rank', ascending=True)

In [None]:
# Best estimator
for param_name in sorted(parameters_svm.keys()):
    print("%s: %r" % (param_name, GridSearchSVM.best_params_[param_name]))

print("Best Estimator : " + str(GridSearchSVM.best_estimator_.get_params()["svc"]))

svm = GridSearchSVM.best_estimator_

# Prediction on test set
test_svm_prediction = svm.predict(test_set_unlabeled)

# Results
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# recall and precision trade places, so the true labels always go first.
print("\nAccuracy : " + str(metrics.accuracy_score(test_labels, test_svm_prediction)))

skplt.metrics.plot_confusion_matrix(test_labels, test_svm_prediction, normalize=True)
plt.show()

# With the default pos_label=1, these describe label 1 (the negative reviews)
print("Recall : " + str(metrics.recall_score(test_labels, test_svm_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_svm_prediction)))

# The AUC is computed from the hard 0/1 predictions, not from scores
print("Under ROC curve area : " + str(metrics.roc_auc_score(test_labels, test_svm_prediction)))

## Random Forest 

In [None]:
# Fitting
# Text pipeline: word counts -> TF-IDF transformer -> random forest classifier
pipeline_rf = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=0.01, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('rf', RandomForestClassifier()),
                    ])

# Parameter values explored by the grid search
parameters_rf = {'tfidf__use_idf': (True, False),
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'rf__n_estimators': [1000, 2000, 3000],
}

# KFold only accepts a random_state together with shuffle=True (recent
# scikit-learn versions raise a ValueError otherwise); unshuffled folds are
# deterministic anyway.
# return_train_score=True is required for "mean_train_score" (read below)
# to be present in cv_results_.
GridSearchRF = GridSearchCV(pipeline_rf, parameters_rf, n_jobs=-1, cv=KFold(n_splits=5), return_train_score=True)

GridSearchRF.fit(train_set_unlabeled, train_labels)

results_RF = GridSearchRF.cv_results_

# Summary table of the searched parameters and their scores, indexed by rank
comparison_results = pd.DataFrame({"rank":results_RF['rank_test_score'],
                                  "n_estimators":results_RF["param_rf__n_estimators"],
                                  "tfidf":results_RF["param_tfidf__use_idf"],
                                  "n_gram":results_RF["param_vect__ngram_range"],
                                  "mean_test_score":results_RF["mean_test_score"],
                                  "mean_train_score":results_RF["mean_train_score"]}).set_index("rank")

comparison_results.sort_values('rank', ascending=True)

In [None]:
# Best estimator
for param_name in sorted(parameters_rf.keys()):
    print("%s: %r" % (param_name, GridSearchRF.best_params_[param_name]))

print(" Best Estimator : " + str(GridSearchRF.best_estimator_.get_params()["rf"]))

rf = GridSearchRF.best_estimator_

# Prediction on test set
test_rf_prediction = rf.predict(test_set_unlabeled)

# Results
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# recall and precision trade places, so the true labels always go first.
print("\nAccuracy : " + str(metrics.accuracy_score(test_labels, test_rf_prediction)))

skplt.metrics.plot_confusion_matrix(test_labels, test_rf_prediction, normalize=True)
plt.show()

# With the default pos_label=1, these describe label 1 (the negative reviews)
print("Recall : " + str(metrics.recall_score(test_labels, test_rf_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_rf_prediction)))

# The AUC is computed from the hard 0/1 predictions, not from scores
print("Under ROC curve area : " + str(metrics.roc_auc_score(test_labels, test_rf_prediction)))

## K Nearest Neighbors

In [None]:
# Fitting
# Text pipeline: word counts -> TF-IDF transformer -> k-nearest-neighbours classifier
pipeline_knn = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=0.01, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('knn', KNeighborsClassifier()),
                    ])

# Parameter values explored by the grid search
parameters_knn = {'tfidf__use_idf': (True, False),
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'knn__p': [1,2],
                 'knn__n_neighbors': [25,50,100],
}

# KFold only accepts a random_state together with shuffle=True (recent
# scikit-learn versions raise a ValueError otherwise); unshuffled folds are
# deterministic anyway.
# return_train_score=True is required for "mean_train_score" (read below)
# to be present in cv_results_.
GridSearchKNN = GridSearchCV(pipeline_knn, parameters_knn, n_jobs=-1, cv=KFold(n_splits=5), return_train_score=True)

GridSearchKNN.fit(train_set_unlabeled, train_labels)

results_KNN = GridSearchKNN.cv_results_

# Summary table of the searched parameters and their scores, indexed by rank
comparison_results = pd.DataFrame({"rank":results_KNN['rank_test_score'],
                                  "p":results_KNN["param_knn__p"],
                                  "n_neighbors":results_KNN["param_knn__n_neighbors"],
                                  "tfidf":results_KNN["param_tfidf__use_idf"],
                                  "n_gram":results_KNN["param_vect__ngram_range"],
                                  "mean_test_score":results_KNN["mean_test_score"],
                                  "mean_train_score":results_KNN["mean_train_score"]}).set_index("rank")

comparison_results.sort_values('rank', ascending=True)

In [None]:
# Best estimator
for param_name in sorted(parameters_knn.keys()):
    print("%s: %r" % (param_name, GridSearchKNN.best_params_[param_name]))

print(" Best Estimator : " + str(GridSearchKNN.best_estimator_.get_params()["knn"]))

knn = GridSearchKNN.best_estimator_

# Prediction on test set
test_knn_prediction = knn.predict(test_set_unlabeled)

# Results
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# recall and precision trade places, so the true labels always go first.
print("Accuracy : " + str(metrics.accuracy_score(test_labels, test_knn_prediction)))

skplt.metrics.plot_confusion_matrix(test_labels, test_knn_prediction, normalize=True)
plt.show()

# With the default pos_label=1, these describe label 1 (the negative reviews)
print("Recall : " + str(metrics.recall_score(test_labels, test_knn_prediction)))
print("Precision : " + str(metrics.precision_score(test_labels, test_knn_prediction)))

# The AUC is computed from the hard 0/1 predictions, not from scores
print("Under ROC curve area : " + str(metrics.roc_auc_score(test_labels, test_knn_prediction)))

# Compare the results

In [None]:
# One bar per model, in the order the models were evaluated
labels = ['Naive Bayse', 'SVM', 'Random Forest', 'K Nearest Neighbors']
model_predictions = (
    test_bayse_prediction,
    test_svm_prediction,
    test_rf_prediction,
    test_knn_prediction,
)

# Bar positions 1..4 and the test-set accuracy of each model
x = list(range(1, len(labels) + 1))
y = [metrics.accuracy_score(prediction, test_labels) for prediction in model_predictions]

bar_width = 1.0

# Name each bar after its model, tilted so the labels do not overlap
plt.xticks(x, labels, rotation=45)
plt.margins(0.2)

plt.bar(x, y, bar_width, color='blue')
plt.show()

### Text Analysis

In [None]:
# Fitted steps of the selected Naive Bayes pipeline
nb_steps = naive.get_params()
vectorizer = nb_steps["vect"]
nb_model = nb_steps["nb"]

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Per-class feature totals accumulated by the classifier during fitting.
# Row 0 belongs to label 0 (positive reviews), row 1 to label 1 (negative).
# The features pass through the TF-IDF step first, so these are weighted
# totals rather than raw occurrence counts.
pos_word_count = nb_model.feature_count_[0, :]

neg_word_count = nb_model.feature_count_[1, :]

# Element-wise positive/negative ratio; a word with a zero negative total
# yields inf (NumPy emits a RuntimeWarning in that case).
ratio_pos_neg = pos_word_count / neg_word_count

words_frequencies = pd.DataFrame({"word":words, "positive":pos_word_count, "negative":neg_word_count, "ratio":ratio_pos_neg}).set_index("word")

# Words most characteristic of positive reviews first
words_frequencies.sort_values('ratio', ascending=False)