In [1]:
#import libraries
import sklearn
import numpy as np
import pandas as pd
import sklearn.metrics
import random
import nltk
import re
from nltk.corpus import sentiwordnet as swn
import seaborn as sns
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups


#Importing Data

###20 newsgroups

In [2]:
#category selection
categories = ['alt.atheism', 'soc.religion.christian']

#fetching data
# Same call for both splits; only the `subset` argument differs.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)


In [3]:
#putting data in dataframe
newsgroups_train_df = pd.DataFrame({'Text': newsgroups_train.data,
                                    'Target': newsgroups_train.target})

newsgroups_bulk_df = pd.DataFrame({'Text': newsgroups_test.data,
                                   'Target': newsgroups_test.target})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. ignore_index=True gives every row a unique label instead of
# repeating 0..n-1 for the train and test parts.
news_df = pd.concat([newsgroups_train_df, newsgroups_bulk_df], ignore_index=True)
## Target indexes newsgroups_train.target_names (category names in sorted order):
## 0 - atheist (alt.atheism), 1 - christian (soc.religion.christian)
news_df.head()

Unnamed: 0,Text,Target
0,From: nigel.allen@canrem.com (Nigel Allen)\nSu...,1
1,From: marshall@csugrad.cs.vt.edu (Kevin Marsha...,0
2,From: tedr@athena.cs.uga.edu (Ted Kalivoda)\nS...,1
3,From: keith@cco.caltech.edu (Keith Allan Schne...,0
4,From: mayne@ds3.scri.fsu.edu (Bill Mayne)\nSub...,1


###IMDB

In [4]:
# Read only the review text and its label from the IMDB CSV.
imdb_df = pd.read_csv(
    "/content/IMDB Dataset.csv",
    usecols=["review", "sentiment"],
    encoding='latin-1',
)
## 1 - positive, 0 - negative
imdb_df["sentiment"] = imdb_df["sentiment"].eq("positive").astype("int")
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#Data Cleaning/Transformation

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
from string import punctuation
import re

#cleaning data method
def clean(text_list):
  """Normalise an iterable of strings for downstream processing.

  Each text is lowercased and every run of non-word characters (anything
  other than letters, digits and underscore) is replaced by a single space.

  Args:
    text_list: iterable of str (a list or a pandas Series both work).

  Returns:
    list of str, one cleaned text per input element, in input order.
  """
  # Raw string: '\W' in a plain string literal is an invalid escape sequence,
  # which newer Python versions warn about. Compiled once, outside the loop.
  non_word = re.compile(r'\W+')
  return [non_word.sub(' ', text.lower()) for text in text_list]

###20 newsgroups

In [7]:
#cleaning
# Clean the raw posts, store them next to the originals, then spot-check one row.
news_clean_texts = clean(news_df["Text"].tolist())
news_df["Clean"] = news_clean_texts

news_df["Clean"].sample(n=1)


468    from wjhovi01 ulkyvx louisville edu subject re...
Name: Clean, dtype: object

###IMDB

In [8]:
#cleaning
# Clean the raw reviews, store them next to the originals, then spot-check one row.
imdb_clean_texts = clean(imdb_df["review"].tolist())
imdb_df["Clean"] = imdb_clean_texts

imdb_df["Clean"].sample(n=1)


29640    very straight not happy with the movie br br t...
Name: Clean, dtype: object

#Sentiment Assignment

In [9]:
import nltk
import ssl
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [10]:
# Fetch the NLTK resources this notebook relies on. Each call is a no-op download
# target by name; the last call is left as the cell's final expression, so its
# return value is what the cell displays.
# NOTE(review): newer NLTK releases may expect differently named tokenizer/tagger
# resources (e.g. 'punkt_tab') - confirm against the installed NLTK version.
nltk.download('punkt')  # tokenizer models - presumably what word_tokenize needs
nltk.download('averaged_perceptron_tagger')  # tagger model - presumably backs nltk.pos_tag
nltk.download('wordnet')  # WordNet corpus, queried through wn.synsets
nltk.download('omw-1.4')  # presumably a companion resource for the WordNet reader - TODO confirm it is required
nltk.download('sentiwordnet')  # SentiWordNet corpus, queried through swn.senti_synset

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [11]:
def convert_to_swn_tag(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first character of `tag` is inspected: 'J' -> wn.ADJ,
    'N' -> wn.NOUN, 'R' -> wn.ADV, 'V' -> wn.VERB. Any other tag
    (including an empty string) maps to None.
    """
    initial = tag[:1]
    if initial == 'J':
        wn_pos = wn.ADJ
    elif initial == 'N':
        wn_pos = wn.NOUN
    elif initial == 'R':
        wn_pos = wn.ADV
    elif initial == 'V':
        wn_pos = wn.VERB
    else:
        wn_pos = None
    return wn_pos

In [12]:
def get_sentiment(word,tag):
    """Look up SentiWordNet scores for the first WordNet sense of a tagged word.

    Args:
        word: the token to look up.
        tag: its POS tag, converted with convert_to_swn_tag.

    Returns:
        [synset_name, positive_score, negative_score] for the first synset
        returned by wn.synsets, or [] when the tag maps to no WordNet part of
        speech or the word has no synset for that part of speech.
    """
    wn_pos = convert_to_swn_tag(tag)

    if wn_pos in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        candidates = wn.synsets(word, pos=wn_pos)
        if candidates:
            # Only the first listed sense is scored; no disambiguation is done.
            sense_name = candidates[0].name()
            scores = swn.senti_synset(sense_name)
            return [sense_name, scores.pos_score(), scores.neg_score()]

    return []

###20 newsgroups

In [13]:
news_texts = news_df["Text"]

#applying tagging to each word
# Tokenise and POS-tag every raw post; only this step is timed.
start = time.time()
news_tagged_text = [nltk.pos_tag(word_tokenize(post)) for post in news_texts]
stop = time.time()

news_tag_time = stop - start

In [14]:
#assigning sentiment score
# One score per document: sum of positive scores minus sum of negative scores
# over all tokens that have a SentiWordNet entry.
senti_score = []

start = time.time()

for tagged_text in news_tagged_text:
    pos = 0
    neg = 0
    for word, tag in tagged_text:
        score = get_sentiment(word, tag)
        # get_sentiment returns [] for tokens without a usable tag or synset.
        # Skip those explicitly instead of hiding every error behind a bare except.
        if not score:
            continue
        pos += score[1]
        neg += score[2]
    senti_score.append(pos - neg)
stop = time.time()
news_score_time = stop - start


In [15]:
# Attach the per-document scores as a new column. `senti_score` was built by
# iterating the tagged texts in news_df row order, so positions line up.
# NOTE: the global `senti_score` is rebound by the IMDB scoring cell further down,
# so this cell must run before that one.
news_df['senti_score'] = senti_score
news_df.head()

Unnamed: 0,Text,Target,Clean,senti_score
0,From: nigel.allen@canrem.com (Nigel Allen)\nSu...,1,from nigel allen canrem com nigel allen subjec...,3.25
1,From: marshall@csugrad.cs.vt.edu (Kevin Marsha...,0,from marshall csugrad cs vt edu kevin marshall...,6.0
2,From: tedr@athena.cs.uga.edu (Ted Kalivoda)\nS...,1,from tedr athena cs uga edu ted kalivoda subje...,1.125
3,From: keith@cco.caltech.edu (Keith Allan Schne...,0,from keith cco caltech edu keith allan schneid...,4.125
4,From: mayne@ds3.scri.fsu.edu (Bill Mayne)\nSub...,1,from mayne ds3 scri fsu edu bill mayne subject...,1.375


###IMDB

In [16]:
imdb_texts = imdb_df["review"]

#applying tagging to each word
# Tokenise and POS-tag every raw review; only this step is timed.
start = time.time()
imdg_tagged_text = [nltk.pos_tag(word_tokenize(review)) for review in imdb_texts]
stop = time.time()

imdb_tag_time = stop - start

In [17]:
#assigning sentiment score
# One score per review: sum of positive scores minus sum of negative scores
# over all tokens that have a SentiWordNet entry.
senti_score = []

start = time.time()
for tagged_text in imdg_tagged_text:
    pos = 0
    neg = 0
    for word, tag in tagged_text:
        score = get_sentiment(word, tag)
        # get_sentiment returns [] for tokens without a usable tag or synset.
        # Skip those explicitly instead of hiding every error behind a bare except.
        if not score:
            continue
        pos += score[1]
        neg += score[2]
    senti_score.append(pos - neg)
stop = time.time()

imdb_score_time = stop - start

In [18]:
# Attach the per-review scores as a new column. `senti_score` here is the list
# produced by the IMDB scoring cell, built in imdb_df row order.
imdb_df['senti_score'] = senti_score
imdb_df.head()

Unnamed: 0,review,sentiment,Clean,senti_score
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...,-2.25
1,A wonderful little production. <br /><br />The...,1,a wonderful little production br br the filmin...,5.25
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...,2.75
3,Basically there's a family where a little boy ...,0,basically there s a family where a little boy ...,0.125
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei s love in the time of money is a...,9.875


#Results

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
def score_to_pred(senti_score):
  """Turn a sentiment score into a binary label.

  Returns 1 when the score is strictly positive, otherwise 0
  (a score of exactly zero counts as 0).
  """
  return 1 if senti_score > 0 else 0

###20 newsgroups

In [21]:
#time taken
# Label/value pairs are printed with print's default separator.
news_timings = (
    ("Time taken for tagging: ", news_tag_time),
    ("Time taken for score assignment: ", news_score_time),
    ("Total time taken: ", news_tag_time + news_score_time),
)
for label, seconds in news_timings:
    print(label, seconds)

Time taken for tagging:  39.91319823265076
Time taken for score assignment:  13.049505710601807
Total time taken:  52.96270394325256


In [22]:
#accuracy
# Threshold each score into a 0/1 prediction, then compare against the labels.
news_predictions = news_df['senti_score'].apply(score_to_pred)
news_df["predictions"] = news_predictions
accuracy_score(y_true=news_df['Target'], y_pred=news_df["predictions"])

0.5378619153674833

###IMDB

In [23]:
#time taken
# Label/value pairs are printed with print's default separator.
imdb_timings = (
    ("Time taken for tagging: ", imdb_tag_time),
    ("Time taken for score assignment: ", imdb_score_time),
    ("Total time taken: ", imdb_tag_time + imdb_score_time),
)
for label, seconds in imdb_timings:
    print(label, seconds)

Time taken for tagging:  668.4690573215485
Time taken for score assignment:  129.69372010231018
Total time taken:  798.1627774238586


In [24]:
#accuracy
# Threshold each score into a 0/1 prediction, then compare against the labels.
imdb_predictions = imdb_df['senti_score'].apply(score_to_pred)
imdb_df["predictions"] = imdb_predictions
accuracy_score(y_true=imdb_df['sentiment'], y_pred=imdb_df["predictions"])

0.61346

#Recap

The tagging and score assignment time on the 20newsgroups dataset is 53 seconds.

The accuracy of a sentiwordnet mapping on the 20newsgroups dataset is 53.79%.

The tagging and score assignment time on the IMDb dataset is 798 seconds, or 13 minutes and 18 seconds (668 seconds for tagging plus 130 seconds for score assignment).

The accuracy of a sentiwordnet mapping on the IMDb dataset is 61.35%.

#Addendum

Since SentiWordNet only encodes sentiment polarity, it makes sense that its accuracy on the 20 newsgroups dataset, which is a topic-classification task rather than a sentiment task, is close to chance (around 50%). This is one major limitation of the lexicon model: it requires a pre-existing lexicon whose word scores reflect the specific classes being predicted.