In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Lazily-built resources shared by every clean_text call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Build the stop-word set and the lemmatizer once and reuse them afterwards."""
    if not _CLEAN_TEXT_RESOURCES:
        stop_words = set(stopwords.words('english'))
        # Extra noise words. They are lowercase because the text is lowercased
        # before filtering, so an upper-case entry could never match.
        stop_words.update(["mr", "ms", "mrs", "dr", "film", "movie", "really", "one", "tv"])
        # "not" is deliberately kept in the text, so it must not count as a stop word.
        stop_words.discard("not")
        _CLEAN_TEXT_RESOURCES.update(
            stop_words=stop_words,
            lemmatizer=WordNetLemmatizer(),
        )
    return _CLEAN_TEXT_RESOURCES["stop_words"], _CLEAN_TEXT_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalize a raw review string into space-separated, lemmatized words.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase, strip digit runs, drop stop words (except
    "not") and words of one or two characters, then lemmatize each word.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text, with words joined by single spaces. The result is
        an empty string when no word survives the filters.

    NOTE(review): punctuation is removed before stop-word filtering, so any
    stop-list entry that contains an apostrophe can no longer match its
    de-punctuated form -- confirm this is intended.
    """
    stop_words, lemmatizer = _clean_text_resources()

    # Remove punctuation, lowercase, then remove numbers.
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)

    # One pass drops both stop words and 1-2 character words; split() already
    # discards repeated whitespace, so no separate space-collapsing step is needed.
    kept_words = [
        word for word in text.split()
        if word not in stop_words and len(word) > 2
    ]

    # Lemmatization
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

In [3]:
def load_data(path="Dataset/Movies_and_TV_clean.pkl"):
    """Load the pickled review table, subsample it, and clean the review text.

    Args:
        path: Location of the pickle file read with ``pd.read_pickle``. The
            loaded table must provide ``reviewText`` and ``overall`` columns.

    Returns:
        A ``(reviews, ratings)`` tuple of NumPy arrays: the cleaned
        ``reviewText`` values and the matching ``overall`` values.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    frame = pd.read_pickle(path)

    # Two successive 50% draws (seeds 0, then 42) keep roughly a quarter of the rows.
    # NOTE(review): the original comment said "half" -- confirm which fraction is intended.
    subset = frame.sample(frac=0.5, random_state=0).sample(frac=0.5, random_state=42)

    cleaned_reviews = subset['reviewText'].apply(clean_text)

    reviews = np.array(cleaned_reviews)
    ratings = np.array(subset['overall'])
    return reviews, ratings

review, sentiment = load_data()
MAX_WORDS = 400  # Maximum number of words allowed in a review

# Cut reviews with more than MAX_WORDS words down to their first MAX_WORDS words.
# The word list is sliced (not the string): slicing the string would cap the
# review at MAX_WORDS characters instead of MAX_WORDS words.
review = [
    ' '.join(words[:MAX_WORDS]) if len(words) > MAX_WORDS else r
    for r, words in ((r, r.split()) for r in review)
]

In [4]:
from transformers import BertTokenizer
import tensorflow as tf
from keras.utils import to_categorical

def preprocess_data(reviews, sentiment, max_sequence_length=400, num_classes=5):
    """Tokenize reviews into fixed-length BERT input ids and one-hot encode the ratings.

    Args:
        reviews: Iterable of review strings.
        sentiment: Ratings aligned with ``reviews``. They are shifted by -1
            before encoding, so they are expected to start at 1.
        max_sequence_length: Every review is truncated or padded to exactly
            this many token ids.
        num_classes: Width of the one-hot label matrix. Passing it explicitly
            keeps the label shape fixed even when the highest rating happens
            to be absent from ``sentiment``.

    Returns:
        A tuple ``(input_ids, sentiment_encoded, vocab_size)``: an int32 array
        of shape ``(len(reviews), max_sequence_length)``, the one-hot labels
        from ``to_categorical``, and the tokenizer's vocabulary size.
    """
    # Initialize the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
    vocab_size = len(tokenizer.vocab)

    # Tokenize the whole corpus in one batched call instead of one call per
    # review. The attention mask is not requested because only the ids are used.
    encoded = tokenizer(
        [str(text) for text in reviews],
        truncation=True,
        max_length=max_sequence_length,
        padding='max_length',
        return_attention_mask=False,
        return_tensors='np',
    )
    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)

    # Encoding labels: shift ratings to a 0-based index, then one-hot encode.
    sentiment_encoded = to_categorical(np.array(sentiment) - 1, num_classes=num_classes)

    return input_ids, sentiment_encoded, vocab_size

# Tokenize the truncated reviews and one-hot encode their ratings; the three
# results are kept as notebook-level variables for the cells that follow.
input_ids, sentiment_encoded, vocab_size = preprocess_data(review, sentiment)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from sklearn.model_selection import train_test_split

# Split the data using train_test_split: 80% train / 20% test, stratified on
# the labels. The fixed random_state makes the split reproducible when the
# notebook is re-run from a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(
    input_ids, sentiment_encoded,
    test_size=0.2, stratify=sentiment_encoded,
    random_state=42,
)

In [6]:
from keras import backend as K
def star_mae(y_true, y_pred):
    """Mean absolute difference between the weighted star index of labels and predictions.

    Each row of ``y_true`` and ``y_pred`` is multiplied element-wise by the
    indices 0..4 and summed over the last axis. The metric is the mean of the
    absolute difference between those two sums.

    Args:
        y_true: Label tensor whose last axis has 5 entries.
        y_pred: Prediction tensor with the same layout as ``y_true``.

    Returns:
        A scalar tensor holding the mean absolute error.
    """
    star_index = K.arange(0, 5, dtype="float32")
    expected_true = K.sum(y_true * star_index, axis=-1)
    expected_pred = K.sum(y_pred * star_index, axis=-1)
    return K.mean(K.abs(expected_true - expected_pred))

In [11]:
from BERT_Keras import BERTTF
# Hyperparameters handed to BERTTF below.
seq_len = 400
num_layers = 4
num_heads = 4
key_dim = 64
ff_dim = 320
dropout = 0.1
num_class = 5
# NOTE(review): this overrides the vocab_size returned by preprocess_data;
# presumably it is the bert-tiny vocabulary size -- confirm the two agree.
vocab_size = 30522

model = BERTTF(num_layers, num_heads, seq_len, key_dim, ff_dim, vocab_size, num_class, dropout = dropout)
# Metrics are passed as a list, the form documented for Model.compile.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[star_mae])
model.summary()
# model.load_weights('Weights/Sentiment Analysis Transformer_Build.h5')  

Model: "BERTTF"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 BERTTF_input (InputLayer)   [(None, 400)]             0         
                                                                 
 bert_embedding_tf_1 (BERTEm  (None, 400, 64)          1953408   
 beddingTF)                                                      
                                                                 
 sequential_4 (Sequential)   (None, 400, 64)           107968    
                                                                 
 sequential_5 (Sequential)   (None, 400, 64)           107968    
                                                                 
 sequential_6 (Sequential)   (None, 400, 64)           107968    
                                                                 
 sequential_7 (Sequential)   (None, 400, 64)           107968    
                                                            

In [8]:
from keras.callbacks import EarlyStopping
# Stop training when the validation star MAE has not improved for 2 epochs.
# The "val_" prefix selects the metric computed on validation_data; monitoring
# the training metric would not detect overfitting.
early_stopping = EarlyStopping(
    monitor='val_star_mae',
    patience=2,
    mode='min'
)

In [12]:
# Train for up to 5 epochs with the early-stopping callback attached.
# NOTE(review): the held-out split (x_test, y_test) doubles as validation data,
# so no separate untouched test set remains for a final evaluation.
model.fit(x_train, y_train,
          epochs=5, 
          verbose=1,
          validation_data=(x_test, y_test),
          callbacks=[early_stopping]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1d8b720c5e0>

In [13]:
# Save model weights only (not the architecture) to the HDF5 file that the
# commented-out load_weights call in the model-building cell reads from.
model.save_weights('Weights/Sentiment Analysis Transformer_Build.h5')  

In [5]:
from transformers import BertTokenizer

# Holds the default tokenizer once it has been loaded.
_TOKENIZER_CACHE = {}


def preprocessing_data(review, tokenizer=None, max_length=400):
    """Encode a single review into a fixed-length array of token ids.

    Args:
        review: The (already cleaned) review text.
        tokenizer: Object exposing ``encode_plus``. When omitted, the
            'prajjwal1/bert-tiny' ``BertTokenizer`` is loaded on first use and
            reused by later calls, so defining this function no longer
            triggers a tokenizer load.
        max_length: The review is truncated or padded to this many token ids.

    Returns:
        A NumPy array built from the encoder's ``input_ids``. With
        ``return_tensors='tf'`` this has shape ``(1, max_length)``.
    """
    if tokenizer is None:
        if 'default' not in _TOKENIZER_CACHE:
            _TOKENIZER_CACHE['default'] = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
        tokenizer = _TOKENIZER_CACHE['default']

    # Only the ids are returned, so the attention mask is not requested.
    encoded_dict = tokenizer.encode_plus(
        review,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=False,
        return_tensors='tf'
    )

    return np.array(encoded_dict['input_ids'])

In [8]:
# Star labels indexed by predicted class (class 0 -> 1 star, ..., class 4 -> 5 stars).
star = [1, 2, 3, 4, 5]

# Read one review from the user and push it through the training-time cleaning.
text = clean_text(input())
input_id = preprocessing_data(text)

# A single review is predicted, so the argmax over the whole output is the class index.
probabilities = model.predict(input_id)
result = np.argmax(probabilities)
star[result]



5