## Data Preprocessing

In [31]:
import pandas as pd

In [32]:
# Load the training CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the data before running the notebook on another machine.
train_df = pd.read_csv('C:/Users/DIVYA/Desktop/Movie_review/train_modified.csv')
# Keep only the rows that actually have review text.
train_df = train_df[train_df['text'].notnull()]
# Work on a 10% subsample. The fixed random_state makes the subsample identical
# on every run, so the counts, splits and scores further down are reproducible.
train_df = train_df.sample(frac=0.1, random_state=42)
train_df.head()


Unnamed: 0,text,target
979502,It will keep a strong-willed Beagle in her place.,1
918541,Barbie Fairytopia,1
829265,A Must Have,1
633450,Rawk,1
646816,earphone,0


In [33]:
# Load the test CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the data before running the notebook on another machine.
test_df = pd.read_csv('C:/Users/DIVYA/Desktop/Movie_review/test_modified.csv')
# Keep only the rows that actually have review text.
test_df = test_df[test_df['text'].notnull()]
# Work on a 10% subsample; the fixed random_state keeps it identical across runs.
test_df = test_df.sample(frac=0.1, random_state=42)
test_df.head()

Unnamed: 0,text
264216,Dangerous and not for kids
384419,Very Disappointed
160605,"Good supplemental reading, but rely on the PMBOK!"
227944,Bored
243803,Beginners Woodcarving


In [34]:
# Randomly reorder the training rows: frac=1 keeps every row, and the fixed
# random_state makes the resulting order repeatable.
train_df_shuffled = train_df.sample(
    frac=1,
    random_state=42,
)
train_df_shuffled.head()

Unnamed: 0,text,target
819939,Please describe the content!,0
258431,The Metaphysical Foundations of Logic,1
451583,Confused,0
548303,"best ""natural"" moisturizer around",1
925950,...about crazy birds,1


### Count the number of examples in each class

In [5]:
# Number of training rows per target class
train_df["target"].value_counts()

1    53011
0    51844
Name: target, dtype: int64

Since we have two target values, we're dealing with a binary classification problem.

It's fairly balanced too, about 50% negative class (target = 0) and 50% positive class (target = 1).

Where:

- 1 = Positive Review
- 0 = Negative Review

And what about the total number of samples we have?

### Total number of samples

In [35]:
# Report the size of each (subsampled) dataset and their combined size
num_train_samples = len(train_df)
num_test_samples = len(test_df)
print(f"Total training samples: {num_train_samples}")
print(f"Total test samples: {num_test_samples}")
print(f"Total samples: {num_train_samples + num_test_samples}")

Total training samples: 104855
Total test samples: 39999
Total samples: 144854


### Visualise 5 random training examples

In [36]:
# Let's visualize some random training examples
import random

num_examples = 5 # how many consecutive rows to print
# Pick a random start offset inside the frame that is actually sliced below, so a
# full window fits; clamp at 0 so a frame with fewer rows than the window still works.
random_index = random.randint(0, max(len(train_df_shuffled) - num_examples, 0))
sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + num_examples]
# index=False yields plain (text, target) tuples, so there is no need to unpack into `_`
# (assigning `_` in a notebook shadows IPython's "last output" variable).
for text, target in sample_rows.itertuples(index=False):
  print(f"Target: {target}", "(Positive Review)" if target > 0 else "(Negative Review)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 1 (Positive Review)
Text:
Excelleent

---

Target: 0 (Negative Review)
Text:
hunk of gunk

---

Target: 0 (Negative Review)
Text:
spongbob NOT

---

Target: 0 (Negative Review)
Text:
its awful

---

Target: 0 (Negative Review)
Text:
The film is rudderless

---



### Split data into training and validation sets

In [37]:
from sklearn.model_selection import train_test_split

# Hold out part of the shuffled training data as a validation set
all_sentences = train_df_shuffled["text"].to_numpy()
all_labels = train_df_shuffled["target"].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,    # 10% of the samples go to the validation set
    random_state=42,  # fixed seed so the split is repeatable
)

In [38]:
# Inspect the validation sentences returned by the split
val_sentences

array(["Too bad I can't rate 0 stars", 'A Green Tea with Flavor',
       'Wrong fit for my truck', ...,
       'People really believe this stuff, kinda scary',
       'Oftern imitated never Duplicated',
       "1984? I don't think so. Brave New World is weak."], dtype=object)

In [40]:
# Inspect the training sentences returned by the split
train_sentences

array(['so bad', 'Propolis soothes', 'A Truely Original Band', ...,
       'Still my favorite tale', 'Not just another CD',
       'For Simpsons fans.'], dtype=object)

In [9]:
# Sanity check: sentences and labels should pair up one-to-one within each split
tuple(len(split) for split in (train_sentences, train_labels, val_sentences, val_labels))

(94369, 94369, 10486, 10486)

In [10]:
# Preview the first 10 training sentences alongside their labels
preview_size = 10
(train_sentences[:preview_size], train_labels[:preview_size])

(array(['Brave', 'Nice water heater', 'its a BUST',
        'The plot was funny and the cast did a great job of drawing y',
        'Also positive', 'Exciting', 'Arguably worse than "Plan 9"',
        "This band, like an STD, just doesn't seem to go away..",
        'An underground classic', 'Put me out of my misery.....'],
       dtype=object),
 array([1, 1, 0, 1, 1, 1, 0, 0, 1, 0], dtype=int64))

## Converting text into numbers

### Text vectorization (tokenization)

In [11]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Build a TextVectorization layer with its default settings spelled out explicitly
default_vectorizer_config = dict(
    max_tokens=None,                            # vocabulary size limit (None = no limit)
    standardize="lower_and_strip_punctuation",  # text clean-up applied before splitting
    split="whitespace",                         # how the text is split into tokens
    ngrams=None,                                # no grouping of tokens into n-grams
    output_mode="int",                          # map each token to an integer index
    output_sequence_length=None,                # no fixed output sequence length
)
text_vectorizer = TextVectorization(**default_vectorizer_config)

In [12]:
# Average number of whitespace-separated tokens per training review, rounded to an integer
token_counts = [len(sentence.split()) for sentence in train_sentences]
round(sum(token_counts) / len(token_counts))

4

In [13]:
# Limits used by the vectorizer and by the embedding layer
max_vocab_length = 10000 # upper bound on the number of tokens kept in the vocabulary
max_length = 15 # number of tokens per review that the model sees

# Replace the default vectorizer with one that uses the limits above
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

Some of the cells below are commented out because fitting the data takes about 10 minutes.

In [14]:
# Fit the text vectorizer to the training text
# (only train_sentences is passed in; the validation sentences are not part of the fit)
text_vectorizer.adapt(train_sentences)

In [15]:
# # Create sample sentence and tokenize it
# sample_sentence = "There's a flood in my street!"
# text_vectorizer([sample_sentence])

In [16]:
# # Choose a random sentence from the training dataset and tokenize it
# random_sentence = random.choice(train_sentences)
# print(f"Original text:\n{random_sentence}\
#       \n\nVectorized version:")
# text_vectorizer([random_sentence])

In [17]:
# # Get the unique words in the vocabulary
# words_in_vocab = text_vectorizer.get_vocabulary()
# top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
# bottom_5_words = words_in_vocab[-5:] # least common tokens
# print(f"Number of words in vocab: {len(words_in_vocab)}")
# print(f"Top 5 most common words: {top_5_words}") 
# print(f"Bottom 5 least common words: {bottom_5_words}")

### Creating an Embedding using an Embedding Layer

In [18]:
from tensorflow.keras import layers

# Fix TensorFlow's global seed before creating the layer
tf.random.set_seed(42)

embedding = layers.Embedding(
    input_dim=max_vocab_length,        # size of the vocabulary (number of distinct token ids)
    output_dim=128,                    # length of each token's embedding vector
    embeddings_initializer="uniform",  # default: initialize the embeddings randomly
    input_length=max_length,           # number of tokens in each input sequence
    name="embedding_1",
)

embedding

<keras.layers.embeddings.Embedding at 0x2034b4fa188>

In [19]:
# # Get a random sentence from training set
# random_sentence = random.choice(train_sentences)
# print(f"Original text:\n{random_sentence}\
#       \n\nEmbedded version:")

# # Embed the random sentence (turn it into numerical representation)
# sample_embed = embedding(text_vectorizer([random_sentence]))
# sample_embed

In [20]:
# # Check out a single token's embedding
# sample_embed[0][0]

## Model 0: Naive Bayes with TF-IDF Encoder (baseline)


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier
baseline_steps = [
    ("tfidf", TfidfVectorizer()),  # turn raw text into TF-IDF weighted features
    ("clf", MultinomialNB()),      # classify the resulting feature vectors
]
model_0 = Pipeline(baseline_steps)

# Train on the training split only
model_0.fit(train_sentences, train_labels)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [22]:
# Score the baseline on the validation split and report it as a percentage
baseline_score = model_0.score(val_sentences, val_labels)
baseline_score_pct = baseline_score*100
print(f"Our baseline model achieves an accuracy of: {baseline_score_pct:.2f}%")

Our baseline model achieves an accuracy of: 80.26%


In [23]:
# Predict labels for the validation sentences with the fitted baseline pipeline
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20] # show only the first 20 predictions

array([1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
      dtype=int64)

### Creating an evaluation function for our model experiments

In [24]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarise classification quality as accuracy, precision, recall and f1.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Note that accuracy is multiplied by 100 (a percentage), while precision,
  recall and f1 are returned exactly as the scoring function produces them.
  """
  # Accuracy, scaled to a percentage
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # Precision, recall and f1 with "weighted" averaging; the fourth
  # returned value is not needed here
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  precision, recall, f1 = weighted_scores[:3]
  return {"accuracy": accuracy_pct,
          "precision": precision,
          "recall": recall,
          "f1": f1}

In [25]:
# Evaluate the baseline predictions against the validation labels
baseline_results = calculate_results(val_labels, baseline_preds)
baseline_results

{'accuracy': 80.25939347701697,
 'precision': 0.8026476329527267,
 'recall': 0.8025939347701697,
 'f1': 0.8025506294699017}

In [27]:
import pickle

# Persist the fitted baseline pipeline to disk using pickle protocol 2
with open('model_0.pkl', 'wb') as model_file:
    pickle.dump(model_0, model_file, protocol=2)

In [29]:
# Map each column name of the test DataFrame to its positional index
cat = test_df
index_dict = {column: position for position, column in enumerate(cat.columns)}

In [30]:
# Persist the column-name -> position mapping to a file named 'cat' (pickle protocol 2)
with open('cat', 'wb') as index_file:
    pickle.dump(index_dict, index_file, protocol=2)