In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import LinearSVC
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import sys

In [2]:
class TokenEmbedder:
    def __init__(self, d=768, N=3, B=int(1e9+7), random_state=42):
        """
        Initialize the TokenEmbedder with given parameters.
        
        :param d: Size of the embedding vector.
        :param N: Number of i-grams to consider.
        :param B: The modulus for the hash function and projection matrix normalization.
        :param random_state: Seed for random number generation to ensure reproducibility.
      `  """
        self.d = d
        self.N = N
        self.B = B
        self.random_state = random_state
        self.hash_seeds = self.initialize_hash_seeds()
        self.embedding_dict = {}

    def initialize_hash_seeds(self):
        """Initialize hash seeds with a fixed random state."""
        rng = np.random.RandomState(self.random_state)
        return rng.randint(low=1, high=np.iinfo(np.int32).max, size=self.d, dtype=np.int32)

    def rolling_hash(self, text, i, base=256):
        """Compute rolling hash for all i-grams in a text."""
        h = 0
        for x in range(i):
            h = (h * base + ord(text[x])) % self.B
        yield h
        for x in range(len(text) - i):
            h = (h * base - ord(text[x]) * pow(base, i, self.B) + ord(text[x + i])) % self.B
            yield h

    def compute_projection_matrix(self, s_i, h_i):
        """Compute and transform the projection matrix for i-grams."""
        P_i = np.outer(s_i, h_i) % self.B
        P_i = P_i.astype(np.float64)  # Convert P_i to float64 before the division
        P_i -= (P_i > self.B // 2) * self.B
        P_i /= (self.B // 2)
        return P_i

    def compute_igram_embedding(self, P_i):
        """Compute the embedding for an i-gram by averaging the projection matrix."""
        return np.mean(P_i, axis=0)

    def generate_token_embedding(self, embeddings):
        """Concatenate all i-gram embeddings to form the token embedding."""
        token_embedding = np.concatenate(embeddings, axis=0)
        return token_embedding

    def compute_embedding(self, token):
        """
        Compute the embedding for a given token by orchestrating the entire process.
        
        :param token: The token for which to compute the embedding.
        :return: The computed embedding vector for the token.
        """
        if token not in self.embedding_dict:
            embeddings = []
            l = len(token)
            partitions = np.array_split(self.hash_seeds, self.N)  # Partitioning hash seeds for N i-grams

            for i, h_i in enumerate(partitions, start=1):
                if len(token) < i:  # Check if token is shorter than i
                    # Handle short tokens; options might include skipping or using a different approach
                    continue  # Skipping in this example

                s_i = np.array(list(self.rolling_hash(token, i)))
                P_i = self.compute_projection_matrix(s_i, h_i)
                e_i = self.compute_igram_embedding(P_i)
                embeddings.append(e_i)

            if not embeddings:  # If no embeddings were generated (e.g., all tokens were too short)
                return np.zeros(self.d)  # Return a zero vector of size d

            token_embedding = self.generate_token_embedding(embeddings)
            self.embedding_dict[token] = token_embedding
            return token_embedding
        else:
            return self.embedding_dict[token]
    


In [3]:
df = pd.read_csv('amazon_reviews_us_Office_Products_v1_00.tsv',on_bad_lines='skip', sep='\t')

  df = pd.read_csv('amazon_reviews_us_Office_Products_v1_00.tsv',on_bad_lines='skip', sep='\t')


In [4]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31
3,US,52782374,R1PR37BR7G3M6A,B00D7H8XB6,868449945,AmazonBasics 12-Sheet High-Security Micro-Cut ...,Office Products,1,2.0,3.0,N,Y,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...,2015-08-31
4,US,24045652,R3BDDDZMZBZDPU,B001XCWP34,33521401,"Derwent Colored Pencils, Inktense Ink Pencils,...",Office Products,4,0.0,0.0,N,Y,Four Stars,Gorgeous colors and easy to use,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2640249,US,53005790,RLI7EI10S7SN0,B00000DM9M,223408988,PalmOne III Leather Belt Clip Case,Office Products,4,26.0,26.0,N,N,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...,1998-12-07
2640250,US,52188548,R1F3SRK9MHE6A3,B00000DM9M,223408988,PalmOne III Leather Belt Clip Case,Office Products,4,18.0,18.0,N,N,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...,1998-11-30
2640251,US,52090046,R23V0C4NRJL8EM,0807865001,307284585,Gods and Heroes of Ancient Greece,Office Products,4,9.0,16.0,N,N,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...,1998-10-15
2640252,US,52503173,R13ZAE1ATEUC1T,1572313188,870359649,Microsoft EXCEL 97/ Visual Basic Step-by-Step ...,Office Products,5,0.0,0.0,N,N,class text,I am teaching a course in Excel and am using t...,1998-08-22


In [5]:
# keep only reviews and ratings
df = df[['star_rating', 'review_body']]

# Check for null values in the df
df.isnull().any(axis=1).sum()
df = df.dropna()

# it seems that some values of star_rating are string while some are numeric. the below code will give an error and hence i was able to deduce this
# df['sentiment'] = df['star_rating'].map(lambda x: 1 if x > 3 else 0 if x <= 2 else None)
# df.shape

# Convert 'star_rating' to numeric
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
df['star_rating'] = df['star_rating'].astype(int)

# Get counts of reviews for each sentiment class
reviews_greater_than_3 = df[df['star_rating'] > 3].shape[0]
reviews_less_than_equal_2 = df[df['star_rating'] <= 2].shape[0]
reviews_equal_3 = df[df['star_rating'] == 3].shape[0]

print("Number of Reviews with Rating > 3:", reviews_greater_than_3)
print("Number of Reviews with Rating <= 2:", reviews_less_than_equal_2)
print("Number of Reviews with Rating = 3:", reviews_equal_3)

# create sentiment column
df['sentiment'] = df['star_rating'].map(lambda x: 0 if x > 3 else 1 if x <= 2 else 2 if x == 3 else None)


# convert sentiment to int type
df['sentiment'] = df['sentiment'].astype(int)

rating_one = df[df['star_rating'] == 1].sample(n=50000, random_state=42)
rating_two = df[df['star_rating'] == 2].sample(n=50000, random_state=42)
rating_three = df[df['star_rating'] == 3].sample(n=50000, random_state=42)
rating_four = df[df['star_rating'] == 4].sample(n=50000, random_state=42)
rating_five = df[df['star_rating'] == 5].sample(n=50000, random_state=42)

downsized_df = pd.concat([rating_one, rating_two, rating_three, rating_four, rating_five])

Number of Reviews with Rating > 3: 2001122
Number of Reviews with Rating <= 2: 445349
Number of Reviews with Rating = 3: 193686


In [6]:
contractions = {"ain't": 'am not / is not / are not / has not / have not', "aren't": 'are not', "can't": 'cannot', "can't've": 'cannot have', "'cause": 'because', "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would / he had', "he'd've": 'he would have', "he'll": 'he will', "he'll've": 'he will have', "he's": 'he is / he has', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would / I had', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "isn't": 'is not', "it'd": 'it would / it had', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is / it has', "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have', "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "shan't": 'shall not', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd": 'she would / she had', "she'd've": 'she would have', "she'll": 'she will', "she'll've": 'she will have', "she's": 'she is / she has', "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so is', "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is / that has', "there'd": 'there had', "there'd've": 'there would have', "there's": 'there is / there has', "they'd": 'they would / they had', "they'd've": 'they would have', "they'll": 'they will', "they'll've": 'they will have', "they're": 'they are', "they've": 'they have', "to've": 'to have', "wasn't": 'was not', "we'd": 'we would / we had', "we'd've": 'we would have', "we'll": 'we will', "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', "weren't": 'were not', "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are', "what's": 'what is / what has', "what've": 'what have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where's": 'where is / where has', "where've": 'where have', "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is / who has', "who've": 'who have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't": 'will not', "won't've": 'will not have', "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have', "y'all": 'you all', "y'alls": 'you alls', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd": 'you would / you had', "you'd've": 'you would have', "you'll": 'you you will', "you'll've": 'you you will have', "you're": 'you are', "you've": 'you have', "who'd": 'who would / who had', "who're": 'who are'}

def expand_contractions(text):
     for contraction, expansion_options in contractions.items():
        # Select the first option when there are multiple choices
        first_option = expansion_options.split('/')[0].strip()
        text = text.replace(contraction, first_option)
     return text

In [7]:
downsized_df['review_body'] = downsized_df['review_body'].str.lower()
downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))
downsized_df['review_body'] = downsized_df['review_body'].str.replace('http[s]?://\S+', '', regex=True)
downsized_df['review_body'] = downsized_df['review_body'].str.replace(r'[^a-zA-Z ]', '', regex=True)
downsized_df['review_body'] = downsized_df['review_body'].str.replace(' +', ' ', regex=True)
downsized_df['review_body'] = downsized_df['review_body'].apply(expand_contractions)

  downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))
  downsized_df['review_body'] = downsized_df['review_body'].apply(lambda x: ' '.join(BeautifulSoup(x, "html.parser").stripped_strings))


In [8]:
downsized_df

Unnamed: 0,star_rating,review_body,sentiment
2520757,1,i bought this thinking it would be a good alte...,1
800253,1,i have used transfers many times in the past w...,1
2366081,1,sorry to report that this ink cartridge spille...,1
1159817,1,doesnt work dont buy i tried two times at inst...,1
2118644,1,i replaced the original canon toner cartridge ...,1
...,...,...,...
840862,5,a unique gift that helps save those amazing cr...,0
1281476,5,exactly as described i got two of these one in...,0
804044,5,these are much better than some other brands i...,0
221856,5,excellent supplier just as promised and timely...,0


In [9]:
downsized_df['sentiment'].unique()

array([1, 2, 0])

In [10]:
embedder = TokenEmbedder(d=300, N=3, B=int(1e9+7), random_state=42) # Adjust the dimensions and parameters as needed

In [11]:
# def create_X_avg_custom_with_embedder(df, embedder):
#     X_avg = []

#     for review in df['review_body']:
#         # Tokenize the review text
#         curr_review = review.replace(',', '').replace('.', '').split()
#         embeddings = []

#         for token in curr_review:
#             # Generate an embedding for each token and ensure it's properly shaped
#             token_embedding = embedder.compute_embedding(token)
#             if token_embedding.shape[0] == embedder.d:  # Check if embedding has expected length
#                 embeddings.append(token_embedding)

#         # Ensure embeddings is a 2D array with consistent inner dimension
#         if len(embeddings) > 0:
#             embeddings = np.vstack(embeddings)  # Stack embeddings vertically to create a 2D array
#             # Average the embeddings for the review
#             review_embedding = np.mean(embeddings, axis=0)
#         else:
#             # If no valid embeddings were generated, use a zero vector
#             review_embedding = np.zeros(embedder.d)

#         X_avg.append(review_embedding)

#     return np.array(X_avg)

In [12]:
# X_avg_custom = create_X_avg_custom_with_embedder(downsized_df, embedder)

In [13]:
# X_avg_custom.shape

(250000, 300)

In [14]:
# X_train_custom, X_test_custom, Y_train_custom, Y_test_custom = train_test_split(X_avg_custom, downsized_df['sentiment'], test_size=0.2, random_state=48)

In [15]:
X_train_raw_binary, X_test_raw_binary, Y_train_raw_binary, Y_test_raw_binary = train_test_split(downsized_df['review_body'], downsized_df['sentiment'], test_size=0.2, random_state=48)


In [16]:
# class TrainReview(Dataset):
#     def __init__(self, reviews, sentiment, token_embedder):
#         """
#         Initializes the dataset with reviews, sentiment labels, and a token embedder.
        
#         :param reviews: A pandas Series or DataFrame column containing review texts.
#         :param sentiment: A pandas Series or DataFrame column containing sentiment labels.
#         :param token_embedder: An instance of the TokenEmbedder class.
#         """
#         self.reviews = reviews
#         self.sentiment = sentiment
#         self.token_embedder = token_embedder

#     def __len__(self):
#         return len(self.reviews)

#     def __getitem__(self, index):
#         curr_review = self.reviews.iloc[index]
#         curr_review = curr_review.replace(',', '').replace('.', '').split()
#         curr_vect = []

#         for word in curr_review:
#             # Use the TokenEmbedder's compute_embedding method to get the embedding
#             word_embedding = self.token_embedder.compute_embedding(word)
#             if word_embedding.shape[0] == self.token_embedder.d:
#                 curr_vect.append(word_embedding)
#         if len(curr_vect) == 0:
#             curr_vect = np.zeros(self.token_embedder.d, dtype=float)  # Use the embedder's dimension
#         else:
#             curr_vect = np.mean(np.array(curr_vect), axis=0)

#         # Convert to pytorch tensor
#         curr_vect = torch.from_numpy(curr_vect).float()
#         sentiment = self.sentiment.iloc[index]

#         return curr_vect, sentiment

In [17]:
# class TestReview(Dataset):
#     def __init__(self, reviews, sentiment, token_embedder):
#         """
#         Initializes the dataset with reviews, sentiment labels, and a token embedder.
        
#         :param reviews: A pandas Series or DataFrame column containing review texts.
#         :param sentiment: A pandas Series or DataFrame column containing sentiment labels.
#         :param token_embedder: An instance of the TokenEmbedder class for generating embeddings.
#         """
#         self.reviews = reviews
#         self.sentiment = sentiment
#         self.token_embedder = token_embedder

#     def __len__(self):
#         return len(self.reviews)

#     def __getitem__(self, index):
#         curr_review = self.reviews.iloc[index]
#         curr_review = curr_review.replace(',', '').replace('.', '').split()
#         curr_vect = []

#         for word in curr_review:
#             # Use the TokenEmbedder's compute_embedding method to get the embedding for each word
#             word_embedding = self.token_embedder.compute_embedding(word)
#             if word_embedding.shape[0] == self.token_embedder.d:
                
#                 curr_vect.append(word_embedding)

#         if len(curr_vect) == 0:
#             curr_vect = np.zeros(self.token_embedder.d, dtype=float)  # Use the embedder's dimension
#         else:
#             curr_vect = np.mean(np.array(curr_vect), axis=0)

#         # Convert to PyTorch tensor
#         curr_vect = torch.from_numpy(curr_vect).float()
#         sentiment = self.sentiment.iloc[index]

#         return curr_vect, sentiment


In [18]:
# train_data_avg_custom_binary = TrainReview(X_train_raw_binary, Y_train_raw_binary, embedder)
# test_data_avg_custom_binary = TestReview(X_test_raw_binary, Y_test_raw_binary, embedder)

In [19]:
# # how many samples per batch to load
# batch_size = 100
# # percentage of training set to use as validation
# valid_size = 0.2

# # obtain training indices that will be used for validation
# num_train = len(train_data_avg_custom_binary)
# indices = list(range(num_train))
# np.random.shuffle(indices)
# split = int(np.floor(valid_size * num_train))
# train_idx, valid_idx = indices[split:], indices[:split]

# # define samplers for obtaining training and validation batches
# train_sampler = SubsetRandomSampler(train_idx)
# valid_sampler = SubsetRandomSampler(valid_idx)

# # prepare data loaders
# train_loader = torch.utils.data.DataLoader(train_data_avg_custom_binary, batch_size=batch_size, sampler=train_sampler)
# valid_loader = torch.utils.data.DataLoader(train_data_avg_custom_binary, batch_size=batch_size, sampler=valid_sampler)
# test_loader = torch.utils.data.DataLoader(test_data_avg_custom_binary, batch_size=batch_size)

In [20]:
X_train_raw_binary, X_test_raw_binary, Y_train_raw_binary, Y_test_raw_binary = train_test_split(downsized_df['review_body'], downsized_df['sentiment'], test_size=0.2, random_state=48)

In [21]:
class TrainReviewCNN(Dataset):
    def __init__(self, reviews, sentiment, token_embedder, max_length=50, vector_size=300):
        self.reviews = reviews
        self.sentiment = sentiment
        self.token_embedder = token_embedder
        self.max_length = max_length
        self.vector_size = vector_size

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        curr_review = self.reviews.iloc[index]
        curr_review = curr_review.replace(',', '').replace('.', '').split()
        curr_vect = []
        count = 0
        for word in curr_review:
            if count >= self.max_length:
                break
            word_embedding = self.token_embedder.compute_embedding(word)
            # Ensure the embedding matches the expected vector size
            if word_embedding.shape[0] != self.vector_size:
                # If not, you might need to truncate or pad the embedding
                if word_embedding.shape[0] > self.vector_size:
                    word_embedding = word_embedding[:self.vector_size]  # Truncate
                else:
                    # Pad the embedding with zeros if it's shorter than expected
                    padding = np.zeros(self.vector_size - word_embedding.shape[0], dtype=float)
                    word_embedding = np.concatenate((word_embedding, padding))
            curr_vect.append(word_embedding)
            count += 1


        # if review is less than max_length words, append zeros
        while count < self.max_length:
            curr_vect.append(np.zeros(self.vector_size, dtype=float))
            count += 1
        if len(curr_vect) == 0:
            curr_vect = np.zeros([self.max_length, self.vector_size], dtype=float)
        else:
            curr_vect = np.array(curr_vect)
        curr_vect = np.transpose(curr_vect)
        # Convert to pytorch tensor
        curr_vect = torch.from_numpy(curr_vect).float()
        sentiment = self.sentiment.iloc[index]
    
        return curr_vect, sentiment


In [22]:
class TestReviewCNN(Dataset):
    def __init__(self, reviews, sentiment, token_embedder, max_length=50, vector_size=300):
        self.reviews = reviews
        self.sentiment = sentiment
        self.token_embedder = token_embedder
        self.max_length = max_length
        self.vector_size = vector_size

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        curr_review = self.reviews.iloc[index]
        curr_review = curr_review.replace(',', '').replace('.', '').split()
        curr_vect = []
        count = 0
        for word in curr_review:
            if count >= self.max_length:
                break
            word_embedding = self.token_embedder.compute_embedding(word)
            # Ensure the embedding matches the expected vector size
            if word_embedding.shape[0] != self.vector_size:
                # If not, you might need to truncate or pad the embedding
                if word_embedding.shape[0] > self.vector_size:
                    word_embedding = word_embedding[:self.vector_size]  # Truncate
                else:
                    # Pad the embedding with zeros if it's shorter than expected
                    padding = np.zeros(self.vector_size - word_embedding.shape[0], dtype=float)
                    word_embedding = np.concatenate((word_embedding, padding))
            curr_vect.append(word_embedding)
            count += 1

        # if review is less than max_length words, append zeros
        while count < self.max_length:
            curr_vect.append(np.zeros(self.vector_size, dtype=float))
            count += 1
        if len(curr_vect) == 0:
            curr_vect = np.zeros([self.max_length, self.vector_size], dtype=float)
        else:
            curr_vect = np.array(curr_vect)
        curr_vect = np.transpose(curr_vect)
        # Convert to pytorch tensor
        curr_vect = torch.from_numpy(curr_vect).float()
        sentiment = self.sentiment.iloc[index]
    
        return curr_vect, sentiment


In [23]:
train_data_cnn_custom_binary = TrainReviewCNN(X_train_raw_binary, Y_train_raw_binary, embedder)
test_data_cnn_custom_binary = TestReviewCNN(X_test_raw_binary, Y_test_raw_binary, embedder)

In [24]:
batch_size = 100
valid_size = 0.2

num_train = len(train_data_cnn_custom_binary)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader = torch.utils.data.DataLoader(train_data_cnn_custom_binary, batch_size=batch_size, sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(train_data_cnn_custom_binary, batch_size=batch_size, sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(test_data_cnn_custom_binary, batch_size=batch_size)

In [25]:
class TernaryCNN(nn.Module):
    def __init__(self, output_channels1=50, output_channels2=10, max_length=50, vector_size=300, num_classes=3):
        super(TernaryCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=vector_size, out_channels=output_channels1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=output_channels1, out_channels=output_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)

        self.fc1 = nn.Linear(120, num_classes)

        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.pool(self.conv1(x)))
        x = self.dropout1(x)
        x = F.relu(self.pool(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return x

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [27]:
CustomTernaryCNN = TernaryCNN().to(device)
print(CustomTernaryCNN)

TernaryCNN(
  (conv1): Conv1d(300, 50, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(50, 10, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=120, out_features=3, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (dropout2): Dropout(p=0.3, inplace=False)
)


In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(CustomTernaryCNN.parameters(), lr=0.1)

In [29]:
n_epochs = 30

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf  # set initial "min" to infinity

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    CustomTernaryCNN.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        data = data.float()
        data = data.to(device)
        output = CustomTernaryCNN(data)
        target = target.long()
        target = target.to(device)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.size(0)

    CustomTernaryCNN.eval()
    for data, target in valid_loader:
        data = data.float()
        output = CustomTernaryCNN(data)
        target = target.long()  # Convert target to torch.long
        loss = criterion(output, target)
        valid_loss += loss.item() * data.size(0)

    train_loss = train_loss / (len(train_loader) * batch_size)
    valid_loss = valid_loss / (len(valid_loader) * batch_size)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch + 1,
        train_loss,
        valid_loss
    ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(CustomTernaryCNN.state_dict(), 'CustomTernaryCNN.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.885687 	Validation Loss: 0.779584
Validation loss decreased (inf --> 0.779584).  Saving model ...
Epoch: 2 	Training Loss: 0.762666 	Validation Loss: 0.730206
Validation loss decreased (0.779584 --> 0.730206).  Saving model ...
Epoch: 3 	Training Loss: 0.731783 	Validation Loss: 0.717856
Validation loss decreased (0.730206 --> 0.717856).  Saving model ...
Epoch: 4 	Training Loss: 0.713301 	Validation Loss: 0.704750
Validation loss decreased (0.717856 --> 0.704750).  Saving model ...
Epoch: 5 	Training Loss: 0.701194 	Validation Loss: 0.685716
Validation loss decreased (0.704750 --> 0.685716).  Saving model ...
Epoch: 6 	Training Loss: 0.692955 	Validation Loss: 0.690633
Epoch: 7 	Training Loss: 0.682242 	Validation Loss: 0.679385
Validation loss decreased (0.685716 --> 0.679385).  Saving model ...
Epoch: 8 	Training Loss: 0.677672 	Validation Loss: 0.677982
Validation loss decreased (0.679385 --> 0.677982).  Saving model ...
Epoch: 9 	Training Loss: 0.671703 

In [31]:
def predict(model, dataloader):
    prediction_list = []
    actual_list = []
    for i, batch in enumerate(dataloader):
        inputs, targets = batch
        inputs = inputs.float()
        # inputs = inputs.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(int(predicted[0]))
        actual_list.append(int(targets[0]))
    total = 0
    for i in range(len(prediction_list)):
        if prediction_list[i] == actual_list[i]:
            total += 1
    accuracy = float(total) / len(prediction_list)
    return accuracy

CustomTernaryCNN.load_state_dict(torch.load('CustomTernaryCNN.pt'))

test_loader = torch.utils.data.DataLoader(test_data_cnn_custom_binary, batch_size=1)
print('Accuracy of CNN using average custom Rolling hash vectors (Ternary) :',str(predict(CustomTernaryCNN, test_loader)))


Accuracy of CNN using average custom Rolling hash vectors (Ternary) : 0.71704
