In [1]:
#@title Importing the libraries and loading the data
# Main libraries
import pandas as pd
pd.set_option("display.precision", 4)
import numpy as np
from collections import Counter

# Text processing libraries
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Importing the dataset
df = pd.read_csv('reviews.tsv', delimiter='\t',
                 quoting = 3) # 3 is for ignoring "" (double quotes)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#@title NLP object
class NLP(object):
  def __init__(self, data: pd.DataFrame):
    self.df = data
    self.y = self.df.iloc[:, -1].values
  
  def create_stopwords(self) -> list:
    """
    Creates English stopwords from nltk package,
    Excludes negative words for further sentiment analysis
    Returns a list of stopwords
    """
    self.all_stopwords = stopwords.words('english')
    negative_words = ['not', "don't", "aren't", "didn't", "hadn't", "hasn't", 
                  "haven't", "wasn't", "weren't"]
    for word in negative_words:
      self.all_stopwords.remove(word)
    return self.all_stopwords

  def process_review(self, index: int) -> list:
    """
    Takes the index to the DataFrame,
    first removes punctuations,
    and second lowers the letters.
    Returns a list including strings.
    """
    review = re.sub('[^a-zA-Z]', ' ', self.df.iloc[:, 0][index])
    review = review.lower()
    return review.split()

  def remove_special_words(self, reviews: list) -> list:
    """
    Takes the stemmed or lemmatized reviews,
    Identifies the words used once as a special word,
    Returns the reviews without the special words.
    """
    words = [word for s in reviews for word in s.split()]
    counted = Counter(words)
    special_words = [word for word in counted.keys() if counted[word] == 1]
    final_reviews = reviews[:]
    for i, sentence in enumerate(final_reviews):
      new_sentence = []
      for word in sentence.split():
        if word not in special_words:
          new_sentence.append(word)
      final_reviews[i] = ' '.join(new_sentence)
    return final_reviews

  def apply_stemmer(self) -> list:
    """
    Fetches the reviews from the DataFrame,
    (I) process the individual review via process_review(),
    (II) removes the stopwords taken from create_stopwords(),
    (III) removes the conjuctions of verbs (e.g. loved -> love) via Stemmer,
    (IV) removes the special_words via remove_special_words().
    Returns a list for Bag-of-words model
    """
    corpus, stop_words = [], self.create_stopwords()
    for i in range(self.df.shape[0]):
      # Step I
      review = self.process_review(i)
      # Step II and III
      stemmer = SnowballStemmer("english", ignore_stopwords=True)
      review = [stemmer.stem(word) for word in review
              if not word in set(stop_words)]
      corpus.append(' '.join(review))

    # Step IV
    review = self.remove_special_words(corpus)
    return corpus
  
  def apply_lemmatizer(self) -> list:
    """
    Fetches the reviews from the DataFrame,
    (I) process the individual review via process_review(),
    (II) removes the stopwords taken from create_stopwords(),
    (III) removes the conjuctions of verbs (e.g. loved -> love) via Lemmatizer,
    (IV) removes the special_words via remove_special_words().
    Returns a list for Bag-of-words model
    """
    corpus, stop_words = [], self.create_stopwords()
    for i in range(self.df.shape[0]):
      # Step I
      review = self.process_review(i)
      # Step II and III
      lemmatizer = WordNetLemmatizer()
      review = [lemmatizer.lemmatize(word, pos = 'v') for word in review
              if not word in set(stop_words)]
      corpus.append(' '.join(review))
    
    # Step IV
    review = self.remove_special_words(corpus)
    return corpus

  def bow(self) -> list:
    """
    Fetches the data and applies first stemmer and then lemmatizer
    to build the bag-of-word vector.
    Returns a list as [X_stemmer, X_lemmatizer]
    """
    reviews_stemmer = self.apply_stemmer()
    reviews_lemmatizer = self.apply_lemmatizer()
    dataset = []
    for r in [reviews_stemmer, reviews_lemmatizer]:
      cv = CountVectorizer()
      dataset.append(cv.fit_transform(r).toarray())
    return dataset

In [3]:
#@title Training the Classification models on Bag-of-words 

def list_classifiers():
    classifiers = {
        'Logistic Regression' : LogisticRegression(random_state = 42),
        'K-Nearest Neighbors' : KNeighborsClassifier(n_neighbors = 5, p = 2, 
                                                     metric = 'minkowski'),
        'Support Vector Machine' : SVC(kernel = 'linear', random_state = 42),
        'Kernel SVM' : SVC(kernel = 'rbf', random_state = 42),
        'Naive Bayes' : GaussianNB(),
        'Decision Tree' : DecisionTreeClassifier(criterion = 'entropy', 
                                                 random_state = 42),
        'Random Forest' : RandomForestClassifier(n_estimators=100, 
                                                 criterion='entropy',
                                                 random_state = 42),
    }
    return classifiers

# Get the classifiers
classifiers = list_classifiers()
# Define NLP() object with data and build Bag-of-words vector with bow()
model = NLP(df)
data = model.bow() 
# {classifier : [stemmer_score, lemmatizer_score]}
score = {name : [] for name in list(classifiers)} 
for name, classifier in classifiers.items():
  for X in data:
    X_train, X_test, y_train, y_test = train_test_split(X, model.y,
                                                    test_size = 0.25,
                                                    random_state = 42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    score[name].append(f1_score(y_test, y_pred))

In [4]:
#@title Results
indicies = list(score.keys())
result = pd.DataFrame(score.values(), columns = ['Stemmer', 'Lemmatizer'], index = indicies)
result.style

Unnamed: 0,Stemmer,Lemmatizer
Logistic Regression,0.7521,0.7845
K-Nearest Neighbors,0.6176,0.5829
Support Vector Machine,0.7382,0.7731
Kernel SVM,0.7299,0.7383
Naive Bayes,0.7266,0.7413
Decision Tree,0.7311,0.7013
Random Forest,0.7339,0.7215
