In [12]:
import pandas as pd

# Summary of the evaluated models: one row per (method, dataset, text variant).
# Metrics that were not recorded are stored as real NaN values rather than the
# string 'nan', so both metric columns are float64 and can be sorted,
# aggregated and plotted.
NOT_RECORDED = float("nan")

data = {
    'Method': ['BoW encoding', 'Tfidf encoding', 'GloVe encoding', 'Word2Vec encoding', 'LSTM with GloVe encoding', 'LSTM with GloVe encoding', 'DestilBERT', 'DestilBERT', 'BERT', 'nlptown unchanged', 'nlptown finetuned'],
    'Dataset': ['unbalanced', 'unbalanced', 'unbalanced', 'unbalanced', 'unbalanced', 'undersampled', 'undersampled', 'undersampled', 'undersampled', 'undersampled', 'undersampled'],
    'review text': ['processed', 'processed', 'processed', 'processed', 'processed', 'processed', 'processed', 'unprocessed', 'unprocessed', 'unprocessed', 'unprocessed'],
    'Accuracy': [NOT_RECORDED, 0.488, 0.553, 0.568, 0.76, 0.55, 0.59, 0.62, 0.62, 0.54, 0.62],
    # NOTE(review): presumably the lowest per-class F1 score -- confirm.
    'minimal f1 score': [NOT_RECORDED, NOT_RECORDED, NOT_RECORDED, NOT_RECORDED, 0.05, 0.40, 0.44, 0.48, 0.49, 0.40, 0.49]
}

df = pd.DataFrame(data)
# Show all 11 rows of the summary table
df.head(11)


Unnamed: 0,Method,Dataset,review text,Accuracy,minimal f1 score
0,BoW encoding,unbalanced,processed,,
1,Tfidf encoding,unbalanced,processed,0.488,
2,GloVe encoding,unbalanced,processed,0.553,
3,Word2Vec encoding,unbalanced,processed,0.568,
4,LSTM with GloVe encoding,unbalanced,processed,0.76,0.05
5,LSTM with GloVe encoding,undersampled,processed,0.55,0.4
6,DestilBERT,undersampled,processed,0.59,0.44
7,DestilBERT,undersampled,unprocessed,0.62,0.48
8,BERT,undersampled,unprocessed,0.62,0.49
9,nlptown unchanged,undersampled,unprocessed,0.54,0.4


In [3]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import Dataset
import torch

# Load the undersampled review data. A forward-slash relative path is used
# because it resolves on Windows as well as on POSIX systems, whereas a
# backslash is only a path separator on Windows.
df = pd.read_csv("data/data_undersampled.csv")
# Keep only the label column and the two text columns.
df_text = df[['rating', 'text', 'text_processed']]

# Locations of the saved fine-tuned model and tokenizer
model_save_path = "finetuned_destilbert_unpr_model"
tokenizer_save_path = "finetuned_destilbert_unpr_tokenizer"

# Load the model and tokenizer

def load_model_and_tokenizer(model_path=None, tokenizer_path=None):
    """Load the fine-tuned DistilBERT classifier and its tokenizer.

    Args:
        model_path: Location handed to
            ``DistilBertForSequenceClassification.from_pretrained``.
            When ``None`` the module-level ``model_save_path`` is used.
        tokenizer_path: Location handed to
            ``DistilBertTokenizer.from_pretrained``. When ``None`` the
            module-level ``tokenizer_save_path`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Fall back to the notebook-level defaults so existing zero-argument
    # calls keep working unchanged.
    if model_path is None:
        model_path = model_save_path
    if tokenizer_path is None:
        tokenizer_path = tokenizer_save_path

    model = DistilBertForSequenceClassification.from_pretrained(model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)
    return model, tokenizer

# Instantiate the saved classifier together with its tokenizer
model, tokenizer = load_model_and_tokenizer()

# Dataframe column whose content is fed to the tokenizer
input_column = "text"
     

# Tokenization and Dataset Preparation
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes one dataframe row per item.

    Args:
        dataframe: Frame with a ``'rating'`` column and a text column.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding='max_length', return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sequence is truncated/padded to.
        text_column: Name of the text column. When ``None`` (the default)
            the module-level ``input_column`` is looked up on each access.
    """

    def __init__(self, dataframe, tokenizer, max_length, text_column=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the tokenized text and zero-based label of row ``idx``."""
        row = self.dataframe.iloc[idx]
        column = input_column if self.text_column is None else self.text_column
        text = row[column]
        # Shift the rating down by one to obtain the class index
        rating = row['rating'] - 1
        tokens = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        # Drop only the leading axis: a bare squeeze() would also collapse
        # the sequence axis whenever max_length == 1.
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'labels': torch.tensor(rating, dtype=torch.long)
        }


def prepare_datasets(df, _tokenizer, max_length):
    """Split ``df`` 80/20 and wrap both parts in ``ReviewDataset`` objects.

    Args:
        df: Dataframe holding the reviews and their ratings.
        _tokenizer: Tokenizer handed to both datasets.
        max_length: Sequence length handed to both datasets.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    # Fixed random_state keeps the split reproducible between runs
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Use the tokenizer that was passed in, not whatever the module-level
    # name `tokenizer` happens to be bound to when this function is called.
    train_dataset = ReviewDataset(train_df, _tokenizer, max_length)
    test_dataset = ReviewDataset(test_df, _tokenizer, max_length)
    return train_dataset, test_dataset


# Sequence length handed to the tokenizer for every review
max_length = 128

# Build the train and test datasets from the loaded dataframe
train_dataset, test_dataset = prepare_datasets(df, tokenizer, max_length=max_length)


def make_predictions(_model, test_dataset):
    """Run ``_model`` over ``test_dataset`` one sample at a time.

    Args:
        _model: Object with an ``eval()`` method that is callable as
            ``_model(input_ids=..., attention_mask=...)`` and returns an
            object exposing a ``logits`` tensor.
        test_dataset: Iterable of mappings with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        A ``(predictions, true_labels)`` tuple of equally long lists of ints.
    """
    # Use the model that was passed in, not the module-level `model`.
    _model.eval()
    predictions1 = []
    true_labels1 = []

    # Gradients are never needed here, so disable them for the whole loop
    with torch.no_grad():
        for batch in test_dataset:
            # Add a leading batch axis of size 1 for the forward pass
            input_ids = batch['input_ids'].unsqueeze(0)
            attention_mask = batch['attention_mask'].unsqueeze(0)

            outputs = _model(input_ids=input_ids, attention_mask=attention_mask)

            # The predicted class is the index of the largest logit
            predicted_class = torch.argmax(outputs.logits, dim=1).item()

            predictions1.append(predicted_class)
            true_labels1.append(batch['labels'].item())

    return predictions1, true_labels1

# Collect predicted and true class indices for the test dataset
predictions, true_labels = make_predictions(_model=model, test_dataset=test_dataset)


In [4]:
import pickle

# Persist the predictions together with the true labels as one pickled dict
with open('predictions_and_labels_DB.pkl', 'wb') as pkl_file:
    pickle.dump(
        {
            'predictions': predictions,
            'true_labels': true_labels,
        },
        pkl_file,
    )

In [7]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
     
# Inputs are the 'text_processed' column as strings; targets are the integer ratings
texts = list(df['text_processed'].astype(str))
ratings = df['rating'].astype(int).to_numpy()

# Fit a Keras tokenizer on the texts.
# NOTE: this rebinds the module-level names `tokenizer` and (below) `max_length`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

# Turn each text into a sequence of word indices, post-padded to max_length
max_length = 60
X = pad_sequences(
    tokenizer.texts_to_sequences(texts),
    maxlen=max_length,
    padding='post',
)

# Map the ratings onto consecutive integer class labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(ratings)

# Reproducible 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Read the GloVe file: each line is split on whitespace into a token
# followed by its vector components
embedding_dim = 300
glove_file = "glove.6B.300d.txt"

embeddings_index = {}
with open(glove_file, encoding="utf8") as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# One row per vocabulary index; words without a GloVe vector stay all-zero
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

     

# Restore the saved Keras model from disk
model1 = load_model('rnn_glove_rating_under.keras')

# Run the model on the test split; the predicted class is the
# index of the largest value in each output row
y_pred = model1.predict(X_test)
test_pred_class = np.argmax(y_pred, axis=1)

# The encoded test labels are already class indices
y_test_class = y_test

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step


In [8]:
# Persist the predicted classes together with the true classes as one pickled dict
with open('predictions_and_labels_RNN.pkl', 'wb') as pkl_file:
    pickle.dump(
        {
            'test_pred_class': test_pred_class,
            'y_test_class': y_test_class,
        },
        pkl_file,
    )

