In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import LinearSVC
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import sys

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [3]:
# Commenting for local execution
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Commenting for local execution
# !unzip /content/drive/MyDrive/Project_Data.zip

In [5]:
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [6]:
class TokenEmbedder:
    def __init__(self, d=768, N=3, B=int(1e9+7), random_state=42):
        """
        Initialize the TokenEmbedder with given parameters.

        :param d: Size of the embedding vector.
        :param N: Number of i-grams to consider.
        :param B: The modulus for the hash function and projection matrix normalization.
        :param random_state: Seed for random number generation to ensure reproducibility.
      `  """
        self.d = d
        self.N = N
        self.B = B
        self.random_state = random_state
        self.hash_seeds = self.initialize_hash_seeds()

    def initialize_hash_seeds(self):
        """Initialize hash seeds with a fixed random state."""
        rng = np.random.RandomState(self.random_state)
        return rng.randint(low=1, high=np.iinfo(np.int32).max, size=self.d, dtype=np.int32)

    def rolling_hash(self, text, i, base=256):
        """Compute rolling hash for all i-grams in a text."""
        h = 0
        for x in range(i):
            h = (h * base + ord(text[x])) % self.B
        yield h
        for x in range(len(text) - i):
            h = (h * base - ord(text[x]) * pow(base, i, self.B) + ord(text[x + i])) % self.B
            yield h

    def compute_projection_matrix(self, s_i, h_i):
        """Compute and transform the projection matrix for i-grams."""
        P_i = np.outer(s_i, h_i) % self.B
        P_i = P_i.astype(np.float64)  # Convert P_i to float64 before the division
        P_i -= (P_i > self.B // 2) * self.B
        P_i /= (self.B // 2)
        return P_i

    def compute_igram_embedding(self, P_i):
        """Compute the embedding for an i-gram by averaging the projection matrix."""
        return np.mean(P_i, axis=0)

    def generate_token_embedding(self, embeddings):
        """Concatenate all i-gram embeddings to form the token embedding."""
        return np.concatenate(embeddings, axis=0)

    def compute_embedding(self, token):
        """
        Compute the embedding for a given token by orchestrating the entire process.

        :param token: The token for which to compute the embedding.
        :return: The computed embedding vector for the token.
        """
        embeddings = []
        l = len(token)
        partitions = np.array_split(self.hash_seeds, self.N)  # Partitioning hash seeds for N i-grams

        for i, h_i in enumerate(partitions, start=1):
            if len(token) < i:  # Check if token is shorter than i
                # Handle short tokens; options might include skipping or using a different approach
                continue  # Skipping in this example

            s_i = np.array(list(self.rolling_hash(token, i)))
            P_i = self.compute_projection_matrix(s_i, h_i)
            e_i = self.compute_igram_embedding(P_i)
            embeddings.append(e_i)

        if not embeddings:  # If no embeddings were generated (e.g., all tokens were too short)
            return np.zeros(self.d)  # Return a zero vector of size d

        return self.generate_token_embedding(embeddings)


In [7]:
df = pd.read_csv('amazon_reviews_us_Office_Products_v1_00.tsv',on_bad_lines='skip', sep='\t')

  df = pd.read_csv('amazon_reviews_us_Office_Products_v1_00.tsv',on_bad_lines='skip', sep='\t')


In [8]:
# keep only reviews and ratings
df = df[['star_rating', 'review_body']]

# Check for null values in the df
df.isnull().any(axis=1).sum()
df = df.dropna()

# it seems that some values of star_rating are string while some are numeric. the below code will give an error and hence i was able to deduce this
# df['sentiment'] = df['star_rating'].map(lambda x: 1 if x > 3 else 0 if x <= 2 else None)
# df.shape

# Convert 'star_rating' to numeric
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
df['star_rating'] = df['star_rating'].astype(int)

# Get counts of reviews for each sentiment class
reviews_greater_than_3 = df[df['star_rating'] > 3].shape[0]
reviews_less_than_equal_2 = df[df['star_rating'] <= 2].shape[0]
reviews_equal_3 = df[df['star_rating'] == 3].shape[0]

print("Number of Reviews with Rating > 3:", reviews_greater_than_3)
print("Number of Reviews with Rating <= 2:", reviews_less_than_equal_2)
print("Number of Reviews with Rating = 3:", reviews_equal_3)

# create sentiment column
df['sentiment'] = df['star_rating'].map(lambda x: 0 if x > 3 else 1 if x <= 2 else 2 if x == 3 else None)


# convert sentiment to int type
df['sentiment'] = df['sentiment'].astype(int)

rating_one = df[df['star_rating'] == 1].sample(n=50000, random_state=42)
rating_two = df[df['star_rating'] == 2].sample(n=50000, random_state=42)
rating_three = df[df['star_rating'] == 3].sample(n=50000, random_state=42)
rating_four = df[df['star_rating'] == 4].sample(n=50000, random_state=42)
rating_five = df[df['star_rating'] == 5].sample(n=50000, random_state=42)

downsized_df = pd.concat([rating_one, rating_two, rating_three, rating_four, rating_five])

Number of Reviews with Rating > 3: 2001122
Number of Reviews with Rating <= 2: 445349
Number of Reviews with Rating = 3: 193686


In [9]:
contractions = {"ain't": 'am not / is not / are not / has not / have not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would / he had', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he will have', "he's": 'he is / he has', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would / I had', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "isn't": 'is not', "it'd": 'it would / it had', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is / it has', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have', "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "shan't": 'shall not', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd": 'she would / she had', "she'd've": 'she would have', "she'll": 'she will', "she'll've": 'she will have', "she's": 'she is / she has', "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so is', "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is / that has', "there'd": 'there had', "there'd've": 'there would have', "there's": 'there is / there has', "they'd": 'they would / they had', "they'd've": 'they would have', "they'll": 'they will', "they'll've": 'they will have', "they're": 'they are', "they've": 'they have', "to've": 'to have', "wasn't": 'was not', "we'd": 'we would / we had', "we'd've": 'we would have', "we'll": 'we will', "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', "weren't": 'were not', "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are', "what's": 'what is / what has', "what've": 'what have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where's": 'where is / where has', "where've": 'where have', "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is / who has', "who've": 'who have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't": 'will not', "won't've": 'will not have', "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have', "y'all": 'you all', "y'alls": 'you alls', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd": 'you would / you had', "you'd've": 'you would have', "you'll": 'you you will', "you'll've": 'you you will have', "you're": 'you are', "you've": 'you have', "who'd": 'who would / who had', "who're": 'who are'}

def expand_contractions(text):
     for contraction, expansion_options in contractions.items():
        # Select the first option when there are multiple choices
        first_option = expansion_options.split('/')[0].strip()
        text = text.replace(contraction, first_option)
     return text


In [10]:
downsized_df['review_body'] = downsized_df['review_body'].str.lower()
downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))
downsized_df['review_body'] = downsized_df['review_body'].str.replace('http[s]?://\S+', '', regex=True)
downsized_df['review_body'] = downsized_df['review_body'].str.replace(r'[^a-zA-Z ]', '', regex=True)
downsized_df['review_body'] = downsized_df['review_body'].str.replace(' +', ' ', regex=True)
downsized_df['review_body'] = downsized_df['review_body'].apply(expand_contractions)

  downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))
  downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))


# HashBased embedding

In [11]:
# simple_df = downsized_df[downsized_df['sentiment'].isin([0, 1])]

In [12]:
embedder = TokenEmbedder(d=300, N=3, B=int(1e9+7), random_state=42) # Adjust the dimensions and parameters as needed

In [13]:
def create_X_avg_custom_with_embedder(df, embedder):
    X_avg = []

    for review in df['review_body']:
        # Tokenize the review text
        curr_review = review.replace(',', '').replace('.', '').split()
        embeddings = []

        for token in curr_review:
            # Generate an embedding for each token and ensure it's properly shaped
            token_embedding = embedder.compute_embedding(token)
            if token_embedding.shape[0] == embedder.d:  # Check if embedding has expected length
                embeddings.append(token_embedding)

        # Ensure embeddings is a 2D array with consistent inner dimension
        if len(embeddings) > 0:
            embeddings = np.vstack(embeddings)  # Stack embeddings vertically to create a 2D array
            # Average the embeddings for the review
            review_embedding = np.mean(embeddings, axis=0)
        else:
            # If no valid embeddings were generated, use a zero vector
            review_embedding = np.zeros(embedder.d)

        X_avg.append(review_embedding)

    return np.array(X_avg)



In [14]:
X_avg_custom = create_X_avg_custom_with_embedder(downsized_df, embedder)

In [15]:
X_avg_custom.shape

(250000, 300)

In [16]:
X_train_custom, X_test_custom, Y_train_custom, Y_test_custom = train_test_split(X_avg_custom, downsized_df['sentiment'], test_size=0.2, random_state=48)



In [17]:
X_avg_custom.shape

(250000, 300)

In [18]:
X_train_raw_ternary, X_test_raw_ternary, Y_train_raw_ternary, Y_test_raw_ternary = train_test_split(downsized_df['review_body'], downsized_df['sentiment'], test_size=0.2, random_state=48)


In [19]:
class TrainReview(Dataset):
    def __init__(self, reviews, sentiment, token_embedder):
        """
        Initializes the dataset with reviews, sentiment labels, and a token embedder.

        :param reviews: A pandas Series or DataFrame column containing review texts.
        :param sentiment: A pandas Series or DataFrame column containing sentiment labels.
        :param token_embedder: An instance of the TokenEmbedder class.
        """
        self.reviews = reviews
        self.sentiment = sentiment
        self.token_embedder = token_embedder

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        curr_review = self.reviews.iloc[index]
        curr_review = curr_review.replace(',', '').replace('.', '').split()
        curr_vect = []

        for word in curr_review:
            # Use the TokenEmbedder's compute_embedding method to get the embedding
            word_embedding = self.token_embedder.compute_embedding(word)
            if word_embedding.shape[0] == self.token_embedder.d:
                curr_vect.append(word_embedding)
        if len(curr_vect) == 0:
            curr_vect = np.zeros(self.token_embedder.d, dtype=float)  # Use the embedder's dimension
        else:
            curr_vect = np.mean(np.array(curr_vect), axis=0)

        # Convert to pytorch tensor
        curr_vect = torch.from_numpy(curr_vect).float()
        sentiment = self.sentiment.iloc[index]

        return curr_vect, sentiment

In [20]:
class TestReview(Dataset):
    def __init__(self, reviews, sentiment, token_embedder):
        """
        Initializes the dataset with reviews, sentiment labels, and a token embedder.

        :param reviews: A pandas Series or DataFrame column containing review texts.
        :param sentiment: A pandas Series or DataFrame column containing sentiment labels.
        :param token_embedder: An instance of the TokenEmbedder class for generating embeddings.
        """
        self.reviews = reviews
        self.sentiment = sentiment
        self.token_embedder = token_embedder

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        curr_review = self.reviews.iloc[index]
        curr_review = curr_review.replace(',', '').replace('.', '').split()
        curr_vect = []

        for word in curr_review:
            # Use the TokenEmbedder's compute_embedding method to get the embedding for each word
            word_embedding = self.token_embedder.compute_embedding(word)
            if word_embedding.shape[0] == self.token_embedder.d:

                curr_vect.append(word_embedding)

        if len(curr_vect) == 0:
            curr_vect = np.zeros(self.token_embedder.d, dtype=float)  # Use the embedder's dimension
        else:
            curr_vect = np.mean(np.array(curr_vect), axis=0)

        # Convert to PyTorch tensor
        curr_vect = torch.from_numpy(curr_vect).float()
        sentiment = self.sentiment.iloc[index]

        return curr_vect, sentiment


In [21]:
# Assuming token_embedder is an instance of TokenEmbedder
train_data_avg_custom_ternary = TrainReview(X_train_raw_ternary, Y_train_raw_ternary, embedder)
test_data_avg_custom_ternary = TestReview(X_test_raw_ternary, Y_test_raw_ternary, embedder)


In [22]:
# how many samples per batch to load
batch_size = 100
# percentage of training set to use as validation
valid_size = 0.2

# obtain training indices that will be used for validation
num_train = len(train_data_avg_custom_ternary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data_avg_custom_ternary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_avg_custom_ternary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_avg_custom_ternary, batch_size=batch_size)


In [23]:
class FFNetTernary(nn.Module):
    def __init__(self):
        super(FFNetTernary, self).__init__()
        # number of hidden nodes in each layer (512)
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(300, hidden_1)

        self.fc2 = nn.Linear(hidden_1, hidden_2)

        self.fc3 = nn.Linear(hidden_2, 3)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):

        # Flatten the input if it's not already flattened
        x = x.to(torch.float32)

        # Apply the first linear layer with activation and dropout
        x = F.relu(self.fc1(x))
        x = self.dropout(x)

        # Apply the second linear layer with activation and dropout
        x = F.relu(self.fc2(x))
        x = self.dropout(x)

        # Output layer with two units (ternary classification)
        x = self.fc3(x)

        return x



In [24]:
FFNetCustomTernary = FFNetTernary()
FFNetCustomTernary.to(device)
print(FFNetCustomTernary)

FFNetTernary(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(FFNetCustomTernary.parameters(), lr=0.001)


In [26]:
n_epochs = 50

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf  # set initial "min" to infinity

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    FFNetCustomTernary.train()  # prep model for training
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        data = data.to(device)
        output = FFNetCustomTernary(data)
        target = target.long()  # Convert target to torch.long
        output = output.to(device)
        target = target.to(device)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.size(0)

    FFNetCustomTernary.eval()  # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        data = data.to(device)
        output = FFNetCustomTernary(data)
        target = target.long()  # Convert target to torch.long
        output = output.to(device)
        target = target.to(device)
        loss = criterion(output, target)
        valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / (len(train_loader) * batch_size)
    valid_loss = valid_loss / (len(valid_loader) * batch_size)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(FFNetCustomTernary.state_dict(), 'FFNetCustomRollingHashTernary.pt')
        valid_loss_min = valid_loss


Epoch: 1 	Training Loss: 1.082549 	Validation Loss: 1.069310
Validation loss decreased (inf --> 1.069310).  Saving model ...
Epoch: 2 	Training Loss: 1.064024 	Validation Loss: 1.059808
Validation loss decreased (1.069310 --> 1.059808).  Saving model ...
Epoch: 3 	Training Loss: 1.058400 	Validation Loss: 1.056487
Validation loss decreased (1.059808 --> 1.056487).  Saving model ...
Epoch: 4 	Training Loss: 1.056316 	Validation Loss: 1.055167
Validation loss decreased (1.056487 --> 1.055167).  Saving model ...
Epoch: 5 	Training Loss: 1.055471 	Validation Loss: 1.054511
Validation loss decreased (1.055167 --> 1.054511).  Saving model ...
Epoch: 6 	Training Loss: 1.054944 	Validation Loss: 1.054072
Validation loss decreased (1.054511 --> 1.054072).  Saving model ...
Epoch: 7 	Training Loss: 1.054539 	Validation Loss: 1.053686
Validation loss decreased (1.054072 --> 1.053686).  Saving model ...
Epoch: 8 	Training Loss: 1.054210 	Validation Loss: 1.053306
Validation loss decreased (1.05368

KeyboardInterrupt: 

In [None]:
def predict(model, dataloader):
    prediction_list = []
    actual_list = []
    for i, batch in enumerate(dataloader):
        inputs, targets = batch
        inputs = inputs.float()
        inputs = inputs.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(int(predicted[0]))
        actual_list.append(int(targets[0]))
    total = 0
    for i in range(len(prediction_list)):
        if prediction_list[i] == actual_list[i]:
            total += 1
    accuracy = float(total) / len(prediction_list)
    return accuracy

In [None]:
FFNetCustomTernary.load_state_dict(torch.load('FFNetCustomRollingHashTernary.pt'))

test_loader = torch.utils.data.DataLoader(test_data_avg_custom_ternary, batch_size=1)
print('Accuracy of FNN using average custom Rolling hash vectors (ternary) :',str(predict(FFNetCustomTernary, test_loader)))