In [None]:
!pip install tensorflow-datasets

In [None]:
# Import tensorflow-datasets
import tensorflow_datasets as tfds

# Fetch the IMDB reviews as (text, label) pairs, together with the dataset metadata
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

# Pull the train and test splits out of the returned mapping
train_dataset = dataset['train']
test_dataset = dataset['test']

# Show the dataset metadata
print(info)

# Peek at the first 5 training examples
for example, label in train_dataset.take(5):
    review_text = example.numpy().decode('utf-8')
    print("Review:", review_text)
    print("Label:", label.numpy())

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.ES4Z76_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.ES4Z76_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.ES4Z76_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir=PosixGPath('/tmp/tmp8xqv_fratfds'),
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text

In [None]:
# convert into pandas df
import pandas as pd

def convert_to_dataframe(dataset):
    """Materialize a dataset of (text, label) pairs as a pandas DataFrame.

    Every element yielded by ``tfds.as_numpy(dataset)`` is unpacked into a
    bytes review, which is decoded as UTF-8, and its label, which is stored
    unchanged.

    Returns:
        A DataFrame with two columns, ``review`` and ``label``, holding one
        row per element in iteration order.
    """
    columns = {'review': [], 'label': []}
    for raw_text, target in tfds.as_numpy(dataset):
        columns['review'].append(raw_text.decode('utf-8'))
        columns['label'].append(target)
    return pd.DataFrame(columns)

# Number of training reviews kept for this walkthrough
N_TRAIN_ROWS = 1000

# Cut the dataset down *before* converting it, so only the first N_TRAIN_ROWS
# reviews are decoded instead of the whole training split.
train_df = convert_to_dataframe(train_dataset.take(N_TRAIN_ROWS))
# A bare `train_df.shape` in the middle of a cell displays nothing, so print it
print(train_df.shape)
print(train_df.head())

                                              review  label
0  This was an absolutely terrible movie. Don't b...      0
1  I have been known to fall asleep during films,...      0
2  Mann photographs the Alberta Rocky Mountains i...      0
3  This is the kind of film for a snowy Sunday af...      1
4  As others have mentioned, all the women that g...      1


In [None]:

#Special characters removal
import re

def remove_special_characters(text):
    """Strip HTML tags and every character that is not an ASCII letter or whitespace.

    HTML tags such as ``<br />`` are replaced by a single space first;
    otherwise only their punctuation would be removed and the tag name would
    be glued onto the neighbouring words (e.g. ``ability.<br />`` would
    become ``abilitybr``).

    Args:
        text: The raw review string.

    Returns:
        The string with tags replaced by spaces and all digits, punctuation
        and other non-letter, non-whitespace characters deleted. Whitespace
        is kept as is, so runs of spaces may remain.
    """
    # A tag must start with a letter (optionally after '/'), so text like "<3" is not treated as markup
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Store a cleaned copy of every review in a new column
cleaned_reviews = train_df['review'].apply(remove_special_characters)
train_df['cleaned_review'] = cleaned_reviews
print(train_df.head())

                                              review  label  \
0  This was an absolutely terrible movie. Don't b...      0   
1  I have been known to fall asleep during films,...      0   
2  Mann photographs the Alberta Rocky Mountains i...      0   
3  This is the kind of film for a snowy Sunday af...      1   
4  As others have mentioned, all the women that g...      1   

                                      cleaned_review  
0  This was an absolutely terrible movie Dont be ...  
1  I have been known to fall asleep during films ...  
2  Mann photographs the Alberta Rocky Mountains i...  
3  This is the kind of film for a snowy Sunday af...  
4  As others have mentioned all the women that go...  


In [None]:
# Convert to lowercase, tokenize, then remove stop words
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: Punkt tokenizer data and the stop-word lists
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Lowercase the cleaned text
train_df['review_lower'] = train_df['cleaned_review'].str.lower()

# Split every lowercased review into word tokens
train_df['tokenized_review'] = train_df['review_lower'].apply(word_tokenize)

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [word for word in tokens if word not in stop_words]


train_df['filtered_tokens'] = train_df['tokenized_review'].apply(drop_stop_words)

print(train_df.head(1))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review  label  \
0  This was an absolutely terrible movie. Don't b...      0   

                                      cleaned_review  \
0  This was an absolutely terrible movie Dont be ...   

                                        review_lower  \
0  this was an absolutely terrible movie dont be ...   

                                    tokenized_review  \
0  [this, was, an, absolutely, terrible, movie, d...   

                                     filtered_tokens  
0  [absolutely, terrible, movie, dont, lured, chr...  


In [None]:
# Stemming: reduce every token to its stem with NLTK's Porter stemmer
from nltk.stem import PorterStemmer

# Single stemmer instance, read as a global by stem_tokens()
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return a new list holding the stem of every token, in input order.

    Each token is passed through ``stemmer.stem``, where ``stemmer`` is the
    module-level stemmer instance.
    """
    return list(map(stemmer.stem, tokens))

# Stem the stop-word-filtered tokens of every review
stemmed_column = train_df['filtered_tokens'].apply(stem_tokens)
train_df['stemmed_tokens'] = stemmed_column

print(train_df.head(2))


                                              review  label  \
0  This was an absolutely terrible movie. Don't b...      0   
1  I have been known to fall asleep during films,...      0   

                                      cleaned_review  \
0  This was an absolutely terrible movie Dont be ...   
1  I have been known to fall asleep during films ...   

                                        review_lower  \
0  this was an absolutely terrible movie dont be ...   
1  i have been known to fall asleep during films ...   

                                    tokenized_review  \
0  [this, was, an, absolutely, terrible, movie, d...   
1  [i, have, been, known, to, fall, asleep, durin...   

                                     filtered_tokens  \
0  [absolutely, terrible, movie, dont, lured, chr...   
1  [known, fall, asleep, films, usually, due, com...   

                                      stemmed_tokens  
0  [absolut, terribl, movi, dont, lure, christoph...  
1  [known, fall, asleep,

In [None]:
# Lemmatization: map every token to its lemma with NLTK's WordNetLemmatizer
nltk.download('wordnet')  # download the 'wordnet' NLTK package
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance, read as a global by lemmatize_tokens()
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return a new list with every token replaced by its lemma, in input order.

    Each token is passed to ``lemmatizer.lemmatize`` as the only argument,
    where ``lemmatizer`` is the module-level lemmatizer instance.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize the stop-word-filtered tokens (not the stemmed ones) of every review
lemmatized_column = train_df['filtered_tokens'].apply(lemmatize_tokens)
train_df['lemmatized_tokens'] = lemmatized_column

print(train_df.head(2))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              review  label  \
0  This was an absolutely terrible movie. Don't b...      0   
1  I have been known to fall asleep during films,...      0   

                                      cleaned_review  \
0  This was an absolutely terrible movie Dont be ...   
1  I have been known to fall asleep during films ...   

                                        review_lower  \
0  this was an absolutely terrible movie dont be ...   
1  i have been known to fall asleep during films ...   

                                    tokenized_review  \
0  [this, was, an, absolutely, terrible, movie, d...   
1  [i, have, been, known, to, fall, asleep, durin...   

                                     filtered_tokens  \
0  [absolutely, terrible, movie, dont, lured, chr...   
1  [known, fall, asleep, films, usually, due, com...   

                                      stemmed_tokens  \
0  [absolut, terribl, movi, dont, lure, christoph...   
1  [known, fall, aslee

In [None]:
# Generate trigrams and build a bag of words with CountVectorizer

import nltk
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

def generate_trigrams(tokens):
    """Collect every run of three consecutive tokens.

    Returns:
        A list with the items produced by ``ngrams(tokens, 3)``.
    """
    trigram_iter = ngrams(tokens, 3)
    return list(trigram_iter)

# Attach the trigrams of every review's lemmatized tokens and preview both columns
train_df['trigrams'] = train_df['lemmatized_tokens'].apply(generate_trigrams)
print(train_df[['lemmatized_tokens', 'trigrams']].head())

# CountVectorizer is fitted on strings here, so glue each token list back together with spaces
train_df['lemmatized_string'] = train_df['lemmatized_tokens'].apply(' '.join)

# Learn the vocabulary and build the document-term count matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_df['lemmatized_string'])

# Vocabulary terms learned by the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the counts, used only for printing
X_dense = X.toarray()
print(X_dense)
print(f"Bag of words features: {feature_names[:20]}...") # Print the first 20 feature names

# Example: look up the count of one specific word in the first document
word_index = vectorizer.vocabulary_.get('good') # index of 'good', or None when it is not in the vocabulary
if word_index is not None:
  print(f"Count of 'good' in the first document: {X[0,word_index]}")



                                   lemmatized_tokens  \
0  [absolutely, terrible, movie, dont, lured, chr...   
1  [known, fall, asleep, film, usually, due, comb...   
2  [mann, photograph, alberta, rocky, mountain, s...   
3  [kind, film, snowy, sunday, afternoon, rest, w...   
4  [others, mentioned, woman, go, nude, film, mos...   

                                            trigrams  
0  [(absolutely, terrible, movie), (terrible, mov...  
1  [(known, fall, asleep), (fall, asleep, film), ...  
2  [(mann, photograph, alberta), (photograph, alb...  
3  [(kind, film, snowy), (film, snowy, sunday), (...  
4  [(others, mentioned, woman), (mentioned, woman...  
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Bag of words features: ['aaaaah' 'aaah' 'aargh' 'aaron' 'abandon' 'abandoned' 'abbas' 'abbot'
 'abc' 'abduct' 'abducted' 'abducting' 'abe' 'abel' 'abhijeetrehan'
 'abhishek' 'abhorrent' 'abiding' 'ability' 'abilit

In [None]:
#apply tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the space-joined lemmatized reviews
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_df['lemmatized_string'])

# Vocabulary terms learned by the TF-IDF vectorizer.
# NOTE: `feature_names` and `word_index` are rebound here and replace the values
# assigned from the CountVectorizer earlier in the notebook.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense copy of the weights; reused for printing and for the DataFrame below
tfidf_matrix_dense = tfidf_matrix.toarray()
print(tfidf_matrix_dense)

print(f"TF-IDF features: {feature_names[:20]}...") # Print the first 20 feature names

# Example: look up the TF-IDF score of one specific word in the first document
word_index = tfidf_vectorizer.vocabulary_.get('good') # index of 'good', or None when it is not in the vocabulary
if word_index is not None:
  print(f"TF-IDF score of 'good' in the first document: {tfidf_matrix[0,word_index]}")

# One row per review, one column per vocabulary term
tfidf_df = pd.DataFrame(data=tfidf_matrix_dense, columns=feature_names)
print(tfidf_df)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
TF-IDF features: ['aaaaah' 'aaah' 'aargh' 'aaron' 'abandon' 'abandoned' 'abbas' 'abbot'
 'abc' 'abduct' 'abducted' 'abducting' 'abe' 'abel' 'abhijeetrehan'
 'abhishek' 'abhorrent' 'abiding' 'ability' 'abilitybr']...
TF-IDF score of 'good' in the first document: 0.04901499913639396
     aaaaah  aaah  aargh  aaron  abandon  abandoned  abbas  abbot  abc  \
0       0.0   0.0    0.0    0.0      0.0        0.0    0.0    0.0  0.0   
1       0.0   0.0    0.0    0.0      0.0        0.0    0.0    0.0  0.0   
2       0.0   0.0    0.0    0.0      0.0        0.0    0.0    0.0  0.0   
3       0.0   0.0    0.0    0.0      0.0        0.0    0.0    0.0  0.0   
4       0.0   0.0    0.0    0.0      0.0        0.0    0.0    0.0  0.0   
..      ...   ...    ...    ...      ...        ...    ...    ...  ...   
995     0.0   0.0    0.0    0.0      0.0    