#IMPORTING LIBS

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import label_binarize

#READING DATASET

In [None]:
import pandas as pd

# Read the CSV (latin1-encoded, comma-separated) and discard its last column
# by position in a single chained expression.
file = 'dataset.csv'
data = pd.read_csv(file, delimiter=',', encoding='latin1').iloc[:, :-1]

# Pull the model inputs (tweet text) and targets (sentiment label) out as
# plain Python lists.
reviews, labels = (data[column].tolist() for column in ('text', 'sentiment'))

In [None]:
# Sanity check: list the column labels of the loaded frame.
print(data.columns)

Index(['id', 'sentiment', 'date', 'text', 'Unnamed: 4'], dtype='object')


In [None]:
# Bare last expression: the notebook renders the DataFrame as the cell output.
data

Unnamed: 0,id,sentiment,date,text,Unnamed: 4
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...,
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on...",
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...,
...,...,...,...,...,...
3881,623499108,not_relevant,Tue Dec 09 08:30:04 +0000 2014,#AAPL:Accel Partners Leads $50M Series C Fundi...,
3882,623499141,not_relevant,Tue Dec 09 11:52:37 +0000 2014,Counting down the minutes! Interest in full ti...,
3883,623499180,not_relevant,Tue Dec 09 14:13:17 +0000 2014,@JustinPulitzer Any comment on #aapl today?,
3884,623499194,not_relevant,Tue Dec 09 14:39:59 +0000 2014,Have been brave and taken out an #AAPL CFD as ...,


#SPLITTING TRAIN AND TEST

Splitting the data into train and test sets with an 80:20 ratio (`test_size=0.2`).

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

#USING VECTORIZATION

##BOW(RAW COUNT)

In [None]:
# Learn the vocabulary from the training reviews only, then encode both
# splits as raw term-count matrices using that vocabulary.
count = CountVectorizer().fit(X_train)
X_train_counts = count.transform(X_train)
X_test_counts = count.transform(X_test)

#BOW ON TF-IDF

In [None]:
# Fit vocabulary and IDF weights on the training reviews only, then encode
# both splits as TF-IDF matrices with the fitted vectorizer.
tfidf = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#N-GRAM MODEL

In [None]:
# Count unigrams, bigrams and trigrams; the vocabulary is learned from the
# training reviews only and reused to encode the test reviews.
ngram = CountVectorizer(ngram_range=(1, 3)).fit(X_train)
X_train_ngrams = ngram.transform(X_train)
X_test_ngrams = ngram.transform(X_test)

#EVALUATION MODELS

In [None]:
def evaluate(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on ``X_train`` / ``y_train``.
    X_train, X_test
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    y_train, y_test
        True labels for the training and test samples.

    Returns
    -------
    tuple
        ``(accuracy, precision_macro, precision_micro, recall_macro,
        recall_micro, f_score_macro, f_score_micro)``.
    """
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # zero_division=0 makes the handling of labels with an empty denominator
    # (e.g. a class that is never predicted) explicit. The score for such a
    # label is 0, exactly as with the default setting, but without an
    # UndefinedMetricWarning on every call.
    precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
    precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
    recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
    recall_micro = recall_score(y_test, y_pred, average='micro', zero_division=0)
    f_score_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f_score_micro = f1_score(y_test, y_pred, average='micro', zero_division=0)

    return accuracy, precision_macro, precision_micro, recall_macro, recall_micro, f_score_macro, f_score_micro


#CLASSIFIERS

In [None]:
# Estimators compared in this notebook. All use library defaults except where
# noted below.
naive_bayes = MultinomialNB()
# The default iteration budget (max_iter=100) is exhausted on these sparse text
# features, producing "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings and a
# model that has not converged; give the solver more iterations.
logistic_regression = LogisticRegression(max_iter=1000)
# Random forests are stochastic; fix the seed (same value as the train/test
# split) so the reported scores are reproducible across runs.
random_forest = RandomForestClassifier(random_state=42)
svm = SVC()
perceptron = Perceptron()

#EVALUATING THE CLASSIFIERS

In [None]:
# Estimators to benchmark, in evaluation order.
classifiers = [
    naive_bayes,
    logistic_regression,
    random_forest,
    svm,
    perceptron,
]

# Labels of the feature-extraction schemes each estimator is scored on.
vectorizers = [
    'Bag of Words (Raw Counts)',
    'Bag of Words (TfIDF)',
    'N-gram',
]

# Collects one tuple of scores per (classifier, vectorizer) pair.
results = list()

In [None]:
# Map every feature-extraction label to its (train, test) matrices. A lookup
# table replaces the original if / if / elif chain, where an unmatched label
# would have silently re-appended the metrics of the previous iteration.
feature_sets = {
    'Bag of Words (Raw Counts)': (X_train_counts, X_test_counts),
    'Bag of Words (TfIDF)': (X_train_tfidf, X_test_tfidf),
    'N-gram': (X_train_ngrams, X_test_ngrams),
}

# Start from an empty list so that re-running this cell does not append
# duplicate rows to the results of a previous run.
results.clear()

for classifier in classifiers:
    for vectorizer in vectorizers:
        if vectorizer not in feature_sets:
            raise ValueError(f"No feature matrices registered for vectorizer {vectorizer!r}")
        X_train_features, X_test_features = feature_sets[vectorizer]

        # evaluate() returns (accuracy, precision_macro, precision_micro,
        # recall_macro, recall_micro, f_score_macro, f_score_micro).
        metrics = evaluate(classifier, X_train_features, y_train, X_test_features, y_test)

        # One row per pair: estimator class name, feature label, then the metrics.
        results.append((classifier.__class__.__name__, vectorizer, *metrics))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i =

#TABLE RESULTS

In [None]:
# Dump every collected result tuple, one per line, unformatted.
for row in results:
    print(row)

('MultinomialNB', 'Bag of Words (Raw Counts)', 0.7262210796915167, 0.5255959256536805, 0.7262210796915167, 0.4296735276203334, 0.7262210796915167, 0.425849906444206, 0.7262210796915167)
('MultinomialNB', 'Bag of Words (TfIDF)', 0.7519280205655527, 0.6324120221721697, 0.7519280205655527, 0.4230369806760474, 0.7519280205655527, 0.4225282163426493, 0.7519280205655527)
('MultinomialNB', 'N-gram', 0.7429305912596401, 0.522883474656831, 0.7429305912596401, 0.43472698273219973, 0.7429305912596401, 0.4337447809419109, 0.7429305912596401)
('LogisticRegression', 'Bag of Words (Raw Counts)', 0.7480719794344473, 0.5098811789002026, 0.7480719794344473, 0.4518672433592098, 0.7480719794344473, 0.46256840341085614, 0.7480719794344473)
('LogisticRegression', 'Bag of Words (TfIDF)', 0.7403598971722365, 0.5448218637255522, 0.7403598971722365, 0.43482324363853614, 0.7403598971722365, 0.44473003294416213, 0.7403598971722365)
('LogisticRegression', 'N-gram', 0.7390745501285347, 0.5335393700787402, 0.7390745

In [None]:
# Column layout as (title, width, format spec). Header, separator, rules and
# data rows are all generated from this single description so they cannot
# drift apart. The hand-written header used a 24-character
# "Feature Extraction" column while the rows used 25, which misaligned the table.
table_columns = [
    ("Classifier", 21, ""),
    ("Feature Extraction", 25, ""),
    ("Accuracy", 8, ".4f"),
    ("Precision (Macro)", 17, ".4f"),
    ("Precision (Micro)", 17, ".4f"),
    ("Recall (Macro)", 14, ".4f"),
    ("Recall (Micro)", 14, ".4f"),
    ("F-Score (Macro)", 15, ".4f"),
    ("F-Score (Micro)", 15, ".4f"),
]

table_header = "| " + " | ".join(f"{title:{width}}" for title, width, _ in table_columns) + " |"
# Each separator segment spans the cell width plus its two padding spaces.
table_separator = "|" + "|".join("-" * (width + 2) for _, width, _ in table_columns) + "|"
table_rule = "-" * len(table_header)

print("Multiclass Classification Results:")
print(table_rule)
print(table_header)
print(table_separator)
for result in results:
    # Strings are left-aligned and floats right-aligned by default, matching
    # the usual text/number table convention.
    cells = (
        f"{value:{width}{spec}}"
        for value, (_, width, spec) in zip(result, table_columns)
    )
    print("| " + " | ".join(cells) + " |")
print(table_rule)

Multiclass Classification Results:
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| Classifier            | Feature Extraction       | Accuracy | Precision (Macro) | Precision (Micro) | Recall (Macro) | Recall (Micro) | F-Score (Macro) | F-Score (Micro) |
|-----------------------|--------------------------|----------|-------------------|-------------------|----------------|----------------|-----------------|-----------------|
| MultinomialNB         | Bag of Words (Raw Counts) |   0.7262 |            0.5256 |            0.7262 |         0.4297 |         0.7262 |          0.4258 |          0.7262 |
| MultinomialNB         | Bag of Words (TfIDF)      |   0.7519 |            0.6324 |            0.7519 |         0.4230 |         0.7519 |          0.4225 |          0.7519 |
| MultinomialNB         | N-gram                    |   0.7429 |            0.5229 |         

#SUMMARY

Classifiers with higher accuracy, precision and F-score values perform better than the rest for a given vectorization.

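The major difference between micro and macro averaging is that micro averaging treats the dataset as a whole: it pools the true positives, false positives and false negatives of all classes and computes a single score, so frequent classes dominate. Macro averaging, on the other hand, computes the metric separately for each class and takes the unweighted mean, so every class counts equally regardless of its size. For single-label multiclass data, micro-averaged precision, recall and F-score all equal the accuracy, which is why those columns are identical in the table above.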

 - Precision: of the samples predicted as a given class, the fraction that truly belong to it.
 - Recall: of the samples that truly belong to a given class, the fraction that were correctly identified.
 - F-score: the harmonic mean of precision and recall; a higher value means a good balance between the two.