# Sentiment Analysis - Natural Language Processing

### Import necessary libraries

In [1]:
# Dataframe
import pandas as pd

# Array
import numpy as np

# Decompress the file
import gzip

# Visualizations
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.colors as colors
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS

# Datetime
from datetime import datetime

# text preprocessing
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata
# Toktok word tokenizer instance.
# NOTE(review): not referenced in the visible part of this notebook — confirm it is still needed.
tokenizer = ToktokTokenizer()
# Load the spaCy English pipeline.
# NOTE(review): the 'en' shortcut and the parse/tag/entity keyword flags look like an
# older spaCy API — confirm they are accepted by the installed spaCy version.
nlp = spacy.load('en', parse=True, tag=True, entity=True)

## Modeling
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier

## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'contractions'

### Reading the Cleaned Dataset

In [270]:
# Load the cleaned beauty-product reviews from the local CSV export
df2 = pd.read_csv(
    filepath_or_buffer='C:/users/eturk/Data_Science/Capstone_Project-Sentiment_Analysis/dataset/cleaned_review_beauty.csv'
)

In [3]:
# Show three randomly drawn rows as a quick sanity check of the loaded frame
df2.sample(n=3)

Unnamed: 0,customer,product,rating,review_text,pos_feedback,neg_feedback,rating_class,time,clean_text,tokens
15583,A7OCP4P0S4YO8,B00AE07BDU,3.0,Axe is a favorite My son has used every produc...,0,0,neutral,2013-02-26,axe favorite son use every product axe make lo...,"['axe', 'favorite', 'son', 'use', 'every', 'pr..."
14360,A1P2XYD265YE21,B00A0J084Y,5.0,"Axe Cooling Face Wash, Chilled Axe Cooling Fac...",1,1,good,2013-04-14,axe cool face wash chill axe cool face wash ch...,"['axe', 'cool', 'face', 'wash', 'chill', 'axe'..."
8942,A1QBOC76MIOJYP,B006RFZ8C2,5.0,Biore Blemish Fighting Ice Cleanser This is a ...,0,0,good,2013-05-15,biore blemish fighting ice cleanser nice clean...,"['biore', 'blemish', 'fighting', 'ice', 'clean..."


# Natural Language Processing

## 1. Feature Engineering and Selection

## 2. Data Preprocessing

**Separating Response Variable and Feature**

In [None]:
# Feature (X) is the cleaned review text; response (y) is the rating class
X, y = df2['clean_text'], df2['rating_class']

**Splitting Dataset into Train and Test Set**

In [None]:
# Splitting Dataset into train (75%) and test (25%) set.
# stratify=y keeps the rating_class proportions the same in both splits, so the
# minority classes are represented in the test set despite the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

In [None]:
# Report the shape of the training and test feature sets
shape_report = 'Train Set Shape\t\t:{}\nTest Set Shape\t\t:{}'
print(shape_report.format(X_train.shape, X_test.shape))

## 3. Selecting the Right Evaluation Metric

As emphasized above, the classes are imbalanced, so classifier performance must be evaluated with metrics that take the class distribution into account and pay more attention to the minority class. When the positive class is smaller and correctly detecting positive samples is our main focus (correct detection of negative examples being less important to the problem), we should use precision and recall. For our particular case, I will therefore use the F1 score, which is the harmonic mean of precision and recall, as my evaluation metric.

**Confusion Matrix Plot Function**

In [9]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.summer):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; cell [i, j] is drawn at row i (true label) and
        column j (predicted label).
    classes : sequence
        Tick labels, used for both axes in the order given.
    normalize : bool, default False
        If True, every row is divided by its row sum and the cells are
        printed with two decimals; otherwise the cells are printed as
        integers.
    title : str
        Title shown above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can show or save the plot.
    """
    # itertools is not imported at the top of the notebook, so import it here
    import itertools

    if normalize:
        # Row-normalise so each true-label row sums to 1. Rows without any
        # samples stay at 0 instead of turning into NaN through 0/0.
        row_totals = cm.sum(axis = 1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out = np.zeros(cm.shape, dtype = 'float'),
                       where = row_totals != 0)
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, fontsize = 20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize = 20)
    plt.yticks(tick_marks, classes, fontsize = 20)

    fmt = '.2f' if normalize else 'd'
    # Cells below half of the maximum get white text, the rest black text
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment = "center",
                 color = "white" if cm[i, j] < thresh else "black", fontsize = 40)

    plt.tight_layout()
    plt.ylabel('True Label', fontsize = 30)
    plt.xlabel('Predicted Label', fontsize = 30)

    return plt

def disp_confusion_matrix(y_pred, model_name):
    """
    Plot and print the test-set confusion matrix for one model's predictions.

    The matrix is computed against the notebook-level ``y_test``; it is drawn
    on a 10x10 figure with the tick labels 'Bad', 'Good' and 'Neutral', and
    the raw counts are printed afterwards.

    Parameters
    ----------
    y_pred : predicted labels for the test set
    model_name : str, prefix used in the plot title
    """
    matrix = confusion_matrix(y_test, y_pred)
    heading = model_name + " " + 'with CV Confusion Matrix'

    plt.figure(figsize=(10, 10))
    plot_confusion_matrix(matrix,
                          classes=['Bad','Good','Neutral'],
                          normalize=False,
                          title=heading)
    plt.show()

    print(matrix)

## 4. Modelling 

This is a supervised multi-class classification problem. We are trying to predict the ratings based on the reviews left by women who bought beauty products on Amazon's e-commerce platform. We used Python’s scikit-learn library to solve our problem. In this context, we implemented the Logistic Regression, Linear SVM, Random Forest, Gradient Boosting, XGBoost, Naive Bayes, CatBoost and TPOT algorithms.

Since the ratings of the reviews were not evenly distributed, as seen below, we decided to reduce the rating classes from 5 to 3 by merging Ratings 1 and 2 as ‘Bad’, keeping Rating 3 as ‘Neutral’ and merging Ratings 4 and 5 as ‘Good’, and then applied the algorithms. Finally, we reduced the rating classes to 2 to check whether our algorithms do better on a binary classification problem. To do so, we merged Ratings 1 and 2 as ‘Bad’ as before, but this time Rating 3 joined Ratings 4 and 5 as ‘Not Bad’, and we applied the same models afterwards. We applied Count Vectorizing and TF-IDF separately to figure out which one performs better. Thus we managed to apply all possible combinations to get the best precision scores for the classes.

## 4.1 Bag of Words (CountVectorizer)

Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.

In order to address this, scikit-learn provides utilities for the most common ways to extract numerical features from text content, namely:

- **tokenizing** strings and giving an integer id for each possible token, for instance by using white-spaces and punctuation as token separators.

- **counting** the occurrences of tokens in each document.

- **normalizing** and weighting with diminishing importance tokens that occur in the majority of samples / documents.

In this scheme, features and samples are defined as follows:

- each **individual token occurrence frequency** (normalized or not) is treated as a feature.

- the vector of all the token frequencies for a given **document** is considered a multivariate sample.
A corpus of documents can thus be represented by a matrix with one row per document and one column per token (e.g. word) occurring in the corpus.

We call **vectorization** the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the **Bag of Words** or “Bag of n-grams” representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

**"CountVectorizer"** implements both tokenization and occurrence counting in a single class. 

In [None]:
# Create the word vector with CountVectorizer.
# min_df and max_df are set explicitly to scikit-learn's defaults (1 and 1.0,
# i.e. no vocabulary pruning); raise min_df / lower max_df to drop very rare
# or near-ubiquitous tokens.
count_vect = CountVectorizer(ngram_range=(1,1), min_df=1, max_df=1.0)

# Learn the vocabulary on the training reviews only, then reuse it for the test reviews.
# toarray() converts the sparse count matrices to dense arrays, which the
# DataFrame preview further down relies on.
count_vect_train = count_vect.fit_transform(X_train).toarray()
count_vect_test = count_vect.transform(X_test).toarray()

In [None]:
# Print vocabulary length.
# vocabulary_ maps every token to its column index, so its size is the number
# of features; this avoids get_feature_names(), which newer scikit-learn
# releases no longer provide.
print('Vocabulary length :', len(count_vect.vocabulary_))
print('Longest word   :', max(count_vect.vocabulary_, key=len))

In [None]:
# Assign feature names of vector into a variable.
# Sorting the tokens by their column index reproduces the column order of the
# count matrix without calling get_feature_names(), which newer scikit-learn
# releases no longer provide.
vocab = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)

In [None]:
# Preview the first five rows of the training count matrix, one column per token
pd.DataFrame(data=count_vect_train, columns=vocab).head(n=5)

**Creating a function for models with CountVectorizer**

In [8]:
def modeling_countVec(Model):
    """
    Fit a classifier on the CountVectorizer features and print its weighted f1 score.

    Parameters
    ----------
    Model : estimator instance
        Unfitted classifier exposing ``fit`` and ``predict``.

    Notes
    -----
    Reads the notebook-level ``count_vect_train``, ``count_vect_test``,
    ``y_train`` and ``y_test``. The test-set predictions are stored in the
    global ``y_pred`` because the classification-report and confusion-matrix
    cells read them from there.
    """
    # f1_score is not imported at the top of the notebook, so import it here
    from sklearn.metrics import f1_score

    global y_pred

    # Instantiate the classifier: model
    model = Model

    # Fit on the bag-of-words matrix: X_train holds the raw review strings,
    # which the classifiers cannot consume directly
    model.fit(count_vect_train, y_train)

    # Predicting the Test set results from the matching test matrix
    y_pred = model.predict(count_vect_test)

    # Weighted average: per-class f1 weighted by the number of true samples per class
    score = f1_score(y_test, y_pred, average = 'weighted')

    # Printing evaluation metric (f1-score)
    print("f1 score: {}".format(score))

### 4.1.1 Logistic Regression with CountVectorizer

In [None]:
# Fit a class-balanced multinomial logistic regression through the shared
# modeling helper, which prints the weighted f1 score
modeling_countVec(
    Model=LogisticRegression(
        C=0.1,
        class_weight='balanced',
        multi_class='multinomial',
        solver='newton-cg',
        random_state=42,
        n_jobs=-1,
    )
)

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the logistic regression run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Plot and print the confusion matrix of the logistic regression predictions
disp_confusion_matrix(y_pred=y_pred, model_name="Logistic Regression")

### 4.1.2 Linear SVM with CountVectorizer

In [None]:
# Fit a linear support vector classifier through the shared modeling helper,
# which prints the weighted f1 score
modeling_countVec(Model=LinearSVC(random_state=42))

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the linear SVM run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Plot and print the confusion matrix of the linear SVM predictions
disp_confusion_matrix(y_pred=y_pred, model_name="Linear SVM")

### 4.1.3 Random Forest with CountVectorizer

In [None]:
# Call the modeling function for random forest classifier with countvectorizer and print f1 score.
# The forest-size parameter is spelled n_estimators; n_estimator raises a TypeError.
modeling_countVec(RandomForestClassifier(n_estimators = 200, random_state = 42))

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the random forest run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Plot and print the confusion matrix of the random forest predictions
disp_confusion_matrix(y_pred=y_pred, model_name="Random Forest")

### 4.1.4 Naive Bayes with CountVectorizer 

In [None]:
# Fit a multinomial naive Bayes classifier (default settings) through the
# shared modeling helper, which prints the weighted f1 score
modeling_countVec(Model=MultinomialNB())

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the naive Bayes run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Print confusion matrix for naive bayes with countVectorizer.
# The plot title must name this section's model, not Random Forest.
disp_confusion_matrix(y_pred, "Naive Bayes")

### 4.1.5 Kernel SVM with CountVectorizer

In [None]:
# Call the modeling function for kernel SVM with countvectorizer and print f1 score.
# SVC is not among the imports at the top of the notebook (only LinearSVC is),
# so it is imported here.
from sklearn.svm import SVC

modeling_countVec(SVC(kernel='rbf'))

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the kernel SVM run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Plot and print the confusion matrix of the kernel SVM predictions
disp_confusion_matrix(y_pred=y_pred, model_name="Kernel SVM")

### 4.1.6 Gradient Boosting with CountVectorizer

In [None]:
# Fit a gradient boosting classifier (default settings) through the shared
# modeling helper, which prints the weighted f1 score
modeling_countVec(Model=GradientBoostingClassifier())

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the gradient boosting run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Plot and print the confusion matrix of the gradient boosting predictions
disp_confusion_matrix(y_pred=y_pred, model_name="Gradient Boosting")

### 4.1.7 XGBoost with CountVectorizing

In [None]:
# Fit an XGBoost classifier (default settings) through the shared modeling
# helper, which prints the weighted f1 score
modeling_countVec(Model=XGBClassifier())

**Classification Report**

In [None]:
# Per-class precision, recall and f1 for the predictions currently held in the
# global y_pred (expected: the XGBoost run above)
print(classification_report(y_true=y_test, y_pred=y_pred))

**Confusion Matrix**

In [None]:
# Plot and print the confusion matrix of the XGBoost predictions
disp_confusion_matrix(y_pred=y_pred, model_name="XGBoost")

### 4.1.8 XGBoost with CountVectorizing

## 4.2 TF-IDF (TfidfVectorizer)