## SentiGames: Game Theoretic approach to Sentiment Analysis


### 1. Importing the wordnet and the sentiment lexicons

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Testing tensorflow

In [None]:
import os
import shutil
import tensorflow as tf


# Download and unpack the IMDB review archive next to the notebook
# (cache_dir='.' with an empty cache_subdir).
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')
# The listing is discarded mid-cell; in effect this only fails fast when the
# extracted train directory is missing.
os.listdir(train_dir)

# Drop the 'unsup' folder so the directory loader does not pick it up as an
# extra class (presumably it holds unlabeled reviews - confirm against the
# dataset README).  Guarded so that re-running the cell after the folder has
# already been removed no longer raises FileNotFoundError.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

batch_size = 32
seed = 42

# Reuse train_dir instead of repeating the hard-coded 'aclImdb/train' path so
# the loader always reads the directory that was prepared above.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Downloading data from https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/download
  16384/Unknown - 0s 0us/step

FileNotFoundError: ignored

In [None]:
raw_train_ds

<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [None]:
import numpy as np
import string
import re
# Words that must survive stop-word filtering (presumably because negations
# flip the sentiment of the words around them).
negation_words = {
    'not', 'no', 'nor',
    'don\'t', 'doesn\'t', 'didn\'t',
    'isn\'t', 'aren\'t', 'wasn\'t', 'weren\'t',
    'won\'t', 'wouldn\'t', 'shouldn\'t', 'couldn\'t',
    'mightn\'t', 'mustn\'t',
}

# Filtering (instead of sixteen list.remove calls) keeps the original order of
# the NLTK list and does not raise ValueError when the installed NLTK
# stop-word list lacks one of the entries above.
stop_words = [word for word in nltk.corpus.stopwords.words('english')
              if word not in negation_words]

def remove_stop_words(sentence_words):
    """Return the tokens of ``sentence_words`` that are not stop words.

    Args:
        sentence_words: iterable of token strings.

    Returns:
        A new list with the tokens, in their original order, that do not
        appear in the module-level ``stop_words`` collection.
    """
    # Build the lookup set once per call: each membership test is then O(1)
    # instead of a linear scan of the stop-word list for every token.
    excluded = set(stop_words)
    return [word for word in sentence_words if word not in excluded]

def preprocess_sentence(sentence):
    """Normalise one review into a list of lemmatized, stop-word-free tokens.

    Steps: lowercase, replace ``<br />`` with a space, strip every
    ``string.punctuation`` character, tokenize with NLTK, drop stop words via
    ``remove_stop_words`` and lemmatize each remaining token.

    Args:
        sentence: a Python string or a scalar string tensor (both are passed
            by the notebook).  The result of the TensorFlow string ops is read
            back with ``.numpy()``, so eager execution is required.

    Returns:
        list[str]: the processed tokens in their original order.
    """
    lowercase = tf.strings.lower(sentence)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    text = tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')
    text = text.numpy().decode('utf-8')
    sentence_words = nltk.word_tokenize(text)
    sentence_words = remove_stop_words(sentence_words)
    # Fix: the previous loop only rebound its loop variable, so the lemmas were
    # thrown away and the unlemmatized tokens were returned.  The lemmatizer is
    # called without a POS tag, i.e. with NLTK's default part of speech.
    return [lemmatizer.lemmatize(word) for word in sentence_words]

def extract_senti_synsets(words, max_senses=5):
    """Collect the candidate non-noun SentiWordNet senses for each word.

    Args:
        words: iterable of token strings.
        max_senses: maximum number of senses kept per word (default 5, the
            value the notebook previously hard-coded as an assumption).

    Returns:
        A tuple ``(senti_synsets, new_words, sense_count)``:
          * senti_synsets - flat list of the kept senti-synsets, grouped by
            word in input order;
          * new_words - the words that kept at least one sense;
          * sense_count - number of kept senses for each entry of new_words.
    """
    senti_synsets = []
    sense_count = []
    new_words = []

    for word in words:
        word_senses = []
        seen_names = set()
        for senti_synset in swn.senti_synsets(word):
            synset = senti_synset.synset
            # Ask the synset for its part of speech instead of parsing it out
            # of name(): lemmas that themselves contain '.' shift the split
            # fields, which let such noun senses slip through the filter.
            if synset.pos() == 'n':
                continue
            # De-duplicate by synset name; the previous `not in` test compared
            # the wrapper objects themselves, so it only worked if they define
            # equality (NLTK's SentiSynset appears not to - TODO confirm).
            name = synset.name()
            if name in seen_names:
                continue
            seen_names.add(name)
            word_senses.append(senti_synset)

        # Keep at most max_senses senses, in the order the corpus returned them.
        word_senses = word_senses[:max_senses]

        # Words left without any sense are dropped from all three outputs.
        if word_senses:
            senti_synsets.extend(word_senses)
            sense_count.append(len(word_senses))
            new_words.append(word)

    return senti_synsets, new_words, sense_count

def cosine_similarity(senti_synset1, senti_synset2):
    """Cosine similarity of two senti-synsets in score space.

    Each argument is treated as the 3-vector
    ``(pos_score(), neg_score(), obj_score())``.

    Returns:
        ``dot / (norm1 * norm2)``, or the integer ``0`` when either vector
        has zero length.
    """
    pos1 = senti_synset1.pos_score()
    neg1 = senti_synset1.neg_score()
    obj1 = senti_synset1.obj_score()
    pos2 = senti_synset2.pos_score()
    neg2 = senti_synset2.neg_score()
    obj2 = senti_synset2.obj_score()

    dot = pos1*pos2 + neg1*neg2 + obj1*obj2
    norm1 = np.sqrt(pos1**2 + neg1**2 + obj1**2)
    norm2 = np.sqrt(pos2**2 + neg2**2 + obj2**2)

    # Guard clause: a zero-length vector has no direction to compare.
    if norm1 == 0 or norm2 == 0:
        return 0
    return dot/(norm1*norm2)

def replicator_dynamic(words, payoff_matrix, num_iterations, senti_count, senti_start_index,
                       strategy_space=None):
    """Run replicator-dynamics updates on the players' sense distributions, in place.

    Every word is a player whose strategies are its candidate senses.  In each
    iteration, and for each player in turn, the payoff of every sense is the
    sum over all other players of (payoff sub-matrix . that player's current
    distribution).  Each sense weight is then multiplied by
    ``sense payoff / expected player payoff``.  Players are updated
    sequentially, so later players in the same iteration already see the
    earlier players' new weights.

    Args:
        words: the players; only ``len(words)`` is used.
        payoff_matrix: 2-D array indexed [sense, sense] over the flat sense list.
        num_iterations: number of update sweeps.
        senti_count: number of senses per player.
        senti_start_index: first flat-sense column of each player.
        strategy_space: optional float array of shape (players, total senses)
            to update.  When omitted, the module-level ``strategy_space`` is
            used, which is how existing five-argument calls keep working.

    Returns:
        The updated ``strategy_space`` array (the same object, modified in place).
    """
    if strategy_space is None:
        # Backward compatibility: earlier callers rely on a module-level array
        # instead of passing the strategy space explicitly.
        strategy_space = globals()['strategy_space']

    payoff_matrix = np.asarray(payoff_matrix, dtype=float)
    num_players = len(words)

    for _ in range(num_iterations):
        for player in range(num_players):
            p_lo = senti_start_index[player]
            p_hi = p_lo + senti_count[player]
            # Snapshot of this player's distribution before the update.
            player_pref = strategy_space[player, p_lo:p_hi].copy()
            strategy_payoff = np.zeros(senti_count[player], dtype=float)
            player_payoff = 0.0

            for neighbour in range(num_players):
                if neighbour == player:
                    continue
                n_lo = senti_start_index[neighbour]
                n_hi = n_lo + senti_count[neighbour]
                # Payoff of each of the player's senses against this neighbour.
                current_payoff = np.dot(payoff_matrix[p_lo:p_hi, n_lo:n_hi],
                                        strategy_space[neighbour, n_lo:n_hi])
                strategy_payoff += current_payoff
                player_payoff += np.dot(player_pref, current_payoff)

            # A zero expected payoff leaves the distribution unchanged (this
            # also avoids a division by zero).  The slice update replaces the
            # element-wise loop that assigned 1-element arrays into scalar
            # slots, a conversion newer NumPy releases deprecate.
            if player_payoff != 0:
                strategy_space[player, p_lo:p_hi] *= strategy_payoff / player_payoff

    return strategy_space

def generate_sentiment_similarity_matrix(senti_synsets):
    """Build the symmetric pairwise similarity matrix of the given senti-synsets.

    Args:
        senti_synsets: sequence of objects accepted by ``cosine_similarity``.

    Returns:
        A float array of shape (n, n) where entry [i, j] (and [j, i]) is
        ``cosine_similarity(senti_synsets[i], senti_synsets[j])``; a ``None``
        similarity is stored as 0.
    """
    size = len(senti_synsets)
    similarity_matrix = np.zeros((size, size), dtype=float)
    for i in range(size):
        # Only the upper triangle (diagonal included) is computed; the value is
        # mirrored into the lower triangle.
        for j in range(i, size):
            similarity = cosine_similarity(senti_synsets[i], senti_synsets[j])

            # Identity test: `== None` would be evaluated through the value's
            # own __eq__, which is not a reliable None check.
            if similarity is None:
                similarity = 0
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity
    return similarity_matrix

### 2. Defining a sample sentence and preprocessing it (lowercasing, stop-word removal)


In [None]:
example = "I really liked the movie Avengers as it was really awesome"
removed_stop_words = preprocess_sentence(example)

removed_stop_words

['really', 'liked', 'movie', 'avengers', 'really', 'awesome']

### 3. Importing GloVe embeddings (trained on 6 billion tokens, 50 dimensions per word)






In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Maps each word to its embedding vector (a float array).
embedding_index = dict()

# Each line is parsed as: the word, followed by whitespace-separated float
# components.  The context manager closes the file even if parsing fails
# part-way (the handle used to be left open).
with open("/content/drive/MyDrive/glove.6B.50d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        emb = np.array(values[1:], dtype ='float')
        embedding_index[word] = emb
    

In [None]:
print(embedding_index['awesome'])

[-0.35848  -0.1155    0.11371   0.46814   0.7495   -0.61523   0.47639
  0.090754  0.3689    0.50331  -0.22467   0.234    -0.64901   0.055667
  0.30192  -0.13536   0.93473   0.88677  -0.70756  -0.48408  -0.90625
  0.62314  -0.18793  -0.5102    1.2565   -0.28897  -1.2819    0.30284
  1.0423   -0.54885   1.0054    0.62053   0.31879  -0.060822 -0.24919
  0.5019    0.41171   0.13648  -0.49815  -0.59822  -0.16876  -0.26096
 -0.53283   0.20083  -0.19095  -0.028693  0.090843 -0.11063  -0.040858
  0.88439 ]


 Here each word is represented as a vector of 50 numerical values

### 4. Defining the `get_embedding_output` function

It will take a text as an input and then convert each word into a vector of 50 elements.

In [None]:
import numpy as np

def get_embedding_output(X, max_len=10, embedding_dim=50):
    """Convert texts into a zero-padded array of word embeddings.

    Args:
        X: indexable sequence of strings; each one is split on whitespace.
        max_len: number of token slots per text (default 10, the previously
            hard-coded value).  Tokens beyond this position are ignored.
        embedding_dim: length of each embedding vector (default 50); it must
            match the vectors stored in the module-level ``embedding_index``.

    Returns:
        Array of shape ``(len(X), max_len, embedding_dim)``.  Slot [i, j]
        holds the embedding of the lowercased j-th token of ``X[i]``; slots
        for unknown tokens and unused positions stay zero.
    """
    embedding_output = np.zeros((len(X), max_len, embedding_dim))

    for ix in range(np.asarray(X).shape[0]):
        tokens = X[ix].split()

        # Tokens past max_len have no slot, so they are not even looked up.
        for ij, token in enumerate(tokens[:max_len]):
            # Single dictionary lookup instead of get() followed by [].
            vector = embedding_index.get(token.lower())
            if vector is not None:
                embedding_output[ix][ij] = vector

    return embedding_output

In [None]:
x_train_embed = get_embedding_output(removed_stop_words)
x_train_embed[0]

array([[ 1.6675e-03, -1.6376e-01, -9.2648e-02, -3.3466e-01,  7.3972e-01,
        -2.3523e-01, -3.4941e-01,  1.9102e-01, -4.2223e-01,  5.8440e-01,
        -2.7604e-01,  4.6605e-01, -9.7154e-01,  3.5971e-02,  8.9279e-01,
         5.0195e-01,  8.9409e-01,  3.5050e-01,  1.2178e-01, -9.1063e-01,
        -6.7188e-01,  8.4035e-01,  3.1734e-01,  3.3727e-01,  1.3483e+00,
        -1.9291e+00, -1.1992e+00,  6.0348e-01,  1.2938e+00, -9.2512e-01,
         3.2757e+00,  7.5342e-01,  6.4755e-02, -3.1481e-01, -3.7328e-01,
        -2.3711e-01, -2.5322e-01,  5.5946e-01,  2.6690e-01, -6.6446e-01,
        -4.2612e-01, -5.1564e-02, -1.8357e-02,  4.1999e-01,  3.5430e-01,
         2.6320e-01,  5.1319e-02, -2.4906e-02, -6.9572e-02,  1.1343e+00],
       [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.

In [None]:
removed_stop_words

'really'

In [None]:
# Candidate senses for the example sentence, the words that kept at least one
# sense, and how many senses each of those words kept.
all_senti_synsets, removed_noun_words, sense_count = extract_senti_synsets(removed_stop_words)

# Column offset of every word's first sense in the flat sense list: a running
# total of the sense counts that leaves out the last word's count.
sense_start_index = [0]
for count in sense_count[:-1]:
    sense_start_index.append(sense_start_index[-1] + count)

sense_start_index


[0, 4, 9, 13]

### 5. Forming strategy space


In [None]:
# One row per retained word, one column per candidate sense.  Every word starts
# with a uniform distribution over its own block of columns and zero elsewhere.
strategy_space = np.zeros((len(removed_noun_words), len(all_senti_synsets)), dtype=float)
start_index = 0
for row in range(len(removed_noun_words)):
    block_width = sense_count[row]
    strategy_space[row, start_index:start_index + block_width] = 1.0 / block_width
    # The next word's block begins right after this one.
    start_index += block_width

strategy_space

array([[1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.2 , 0.2 , 0.2 , 0.2 , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.25, 0.25, 0.25, 0.25, 0.  ,
        0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.2 ,
        0.2 , 0.2 , 0.2 , 0.2 ]])

### 6. Defining Cosine sentiment similarity function for senti_synsets

### 7. Calculation of Payoff Matrix

In [None]:

sentiment_similarity_matrix = generate_sentiment_similarity_matrix(all_senti_synsets)
sentiment_similarity_matrix

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        1.        , 0.70710678, 1.        , 1.        , 1.        ],
       [0.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.70710678,
        0.        , 0.70710678, 0.        , 0.        , 0.        ],
       [0.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.70710678,
        0.        , 0.70710678, 0.        , 0.        , 0.        ],
       [0.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.70710678,
        0.        , 0.70710678, 0.        , 0.        , 0.        ],
       [0.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.70710678,
        0.        , 0.70710678, 0.        , 

### 8. Running the replicator dynamics (function defined above)

In [None]:
number_of_iterations = 20

# replicator_dynamic(words, payoff_matrix, num_iterations, senti_count, senti_start_index)

replicator_dynamic(removed_noun_words, sentiment_similarity_matrix, number_of_iterations, sense_count,
                   sense_start_index)
print ("\nStrategy Space Updated")
print (strategy_space)


Strategy Space Updated
[[1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 2.00000000e-01 2.00000000e-01 2.00000000e-01
  2.00000000e-01 2.00000000e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 2.53298603e-04 2.53298603e-04
  2.53298603e-04 9.99240104e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 4.81971695e-04 9.98072113e-01
  4.81971695e-04 4.81971695e-04 4.81971695e-04]]


### 9. Getting relevant senti senses

In [None]:
# For every retained word, print the sense that carries the largest weight in
# its row of the strategy space (None if no weight is above zero).
for word_idx, word in enumerate(removed_noun_words):
    print(word + ": ", end="")
    max_value = 0
    required_synset = None
    for weight, candidate in zip(strategy_space[word_idx], all_senti_synsets):
        # Strict comparison: on ties the earliest sense wins.
        if weight > max_value:
            max_value, required_synset = weight, candidate
    print(required_synset)

not: <not.r.01: PosScore=0.0 NegScore=0.625>
like: <wish.v.02: PosScore=0.125 NegScore=0.0>
really: <very.r.01: PosScore=0.25 NegScore=0.25>
bad: <bad.s.02: PosScore=0.25 NegScore=0.25>


In [None]:
# Classify the reviews of the first 20 training batches and collect the
# predictions (1 = positive, 0 = negative) next to the dataset labels.
results = []
expected_results = []
x = 1  # running document counter, used only for the progress message
for text_batch, label_batch in raw_train_ds.take(20):
    for idx in range(text_batch.shape[0]):
        print('processing document %d' % x)
        x += 1
        words = preprocess_sentence(text_batch[idx])
        senti_synsets, new_words, sense_count = extract_senti_synsets(words)

        # Column offset of each word's first sense in the flat sense list.
        sense_start_index = [0]
        for i in range(len(sense_count) - 1):
            sense_start_index.append(sense_start_index[i] + sense_count[i])

        similarity_matrix = generate_sentiment_similarity_matrix(senti_synsets)

        # replicator_dynamic is called without a strategy-space argument and
        # works on the notebook-level `strategy_space`, so that name has to be
        # rebound to a fresh uniform distribution for every document.
        strategy_space = np.zeros((len(new_words), len(senti_synsets)), dtype=float)
        start_index = 0
        for i in range(len(new_words)):
            for j in range(start_index, start_index + sense_count[i]):
                strategy_space[i][j] = 1.0 / sense_count[i]

            start_index += sense_count[i]
        replicator_dynamic(new_words, similarity_matrix, 5, sense_count, sense_start_index)

        pos_score = 0
        neg_score = 0
        for word in range(len(new_words)):
            # Select the sense with the largest weight for this word.
            max_value = 0
            required_synset = None
            for synset in range(len(senti_synsets)):
                if strategy_space[word][synset] > max_value:
                    max_value = strategy_space[word][synset]
                    required_synset = senti_synsets[synset]
            # Fix: accumulate once per word, after the scan has finished.  The
            # accumulation used to sit inside the scan above, so a word's
            # scores were added once for every remaining column, which
            # over-weighted words positioned early in the document.
            if required_synset is not None:
                pos_score += required_synset.pos_score()
                neg_score += required_synset.neg_score()

        # Ties count as negative.
        if pos_score > neg_score:
            results.append(1)
        else:
            results.append(0)
        expected_results.append(label_batch.numpy()[idx])






processing document 1
processing document 2
processing document 3
processing document 4
processing document 5
processing document 6
processing document 7
processing document 8
processing document 9
processing document 10
processing document 11
processing document 12
processing document 13
processing document 14
processing document 15
processing document 16
processing document 17
processing document 18
processing document 19
processing document 20
processing document 21
processing document 22
processing document 23
processing document 24
processing document 25
processing document 26
processing document 27
processing document 28
processing document 29
processing document 30
processing document 31
processing document 32
processing document 33
processing document 34
processing document 35
processing document 36
processing document 37
processing document 38
processing document 39
processing document 40
processing document 41
processing document 42
processing document 43
processing document 

In [None]:
from sklearn import metrics
print(metrics.confusion_matrix(expected_results, results, labels=[0, 1]))
print(metrics.classification_report(expected_results, results, labels=[0, 1]))

[[165 178]
 [ 63 234]]
              precision    recall  f1-score   support

           0       0.72      0.48      0.58       343
           1       0.57      0.79      0.66       297

    accuracy                           0.62       640
   macro avg       0.65      0.63      0.62       640
weighted avg       0.65      0.62      0.62       640

