In [3]:
# One-time setup: uncomment the line below to install the `datasets` package
# that the following cells import.
#!pip install datasets

Dataset Summary
Movie Review Dataset. This dataset contains 5,331 positive and 5,331 negative processed sentences from Rotten Tomatoes movie reviews. The data was first used in Bo Pang and Lillian Lee, "Seeing stars: Exploiting class relationships for sentiment categorization with respect to rating scales," Proceedings of the ACL, 2005.

In [2]:
from datasets import load_dataset

In [3]:
# Fetch the Rotten Tomatoes review corpus by its repository id; the result
# holds the train / validation / test splits.
dataset = load_dataset(path="cornell-movie-review-data/rotten_tomatoes")

In [4]:
# Show the DatasetDict summary: each split with its feature names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [17]:
import pandas as pd

# Convert the training split with the dataset's own exporter instead of
# pd.DataFrame(...), which builds the frame from one Python dict per row.
df = dataset["train"].to_pandas()

# Quick structural checks: first rows, (rows, columns), and per-column dtypes.
print(df.head())
print(df.shape)
print(df.dtypes)

                                                text  label
0  the rock is destined to be the 21st century's ...      1
1  the gorgeously elaborate continuation of " the...      1
2                     effective but too-tepid biopic      1
3  if you sometimes like to go to the movies to h...      1
4  emerges as something rare , an issue movie tha...      1
(8530, 2)
text     object
label     int64
dtype: object


In [6]:
# Count missing values in each column.
df.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
# Per-column missing-value count via isna(). pandas documents isnull() as an
# alias of isna(), so this repeats the check made with isnull() above.
df.isna().sum()

text     0
label    0
dtype: int64

In [16]:
# search for duplicates
# value_counts() tallies how often each distinct review text occurs and, by
# default, sorts by descending count, so a top count of 1 means no text repeats.

df["text"].value_counts()

the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .                                     1
conforms itself with creating a game of 'who's who' . . . where the characters' moves are often more predictable than their consequences .                                                                            1
it's a film with an idea buried somewhere inside its fabric , but never clearly seen or felt .                                                                                                                        1
a horror movie with seriously dumb characters , which somewhat dilutes the pleasure of watching them stalked by creepy-crawly bug things that live only in the darkness .                                             1
pap invested in undergraduate doubling subtexts and ridiculous stabs at existentialism reminding of the discovery of the wizard of god i

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Stopword sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per token.
_STOPWORDS_CACHE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stopword list for `language` as a cached frozenset."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def preprocessing(text):
    """Tokenize `text` and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Tokens keep their
        original casing; only the stopword comparison is case-insensitive.
    """
    # Tokenize the text using NLTK
    new_words = word_tokenize(text)

    # Look the stopwords up once per call (cached across calls); set membership
    # replaces re-reading and linearly scanning the list for every token.
    stop_words = _get_stopwords('english')
    new_filtered_words = [
        word for word in new_words if word.lower() not in stop_words]

    # Join the filtered words to form a clean text
    return ' '.join(new_filtered_words)

# Sanity-check the helper on the review stored under index label 0.
text = df.loc[0, "text"]
new_clean_text = preprocessing(text)
print(f"Original Text: {text}")
print(f"Text after Stopword Removal: {new_clean_text}")

Original Text: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
Text after Stopword Removal: rock destined 21st century 's new `` conan `` 's going make splash even greater arnold schwarzenegger , jean-claud van damme steven segal .


In [19]:
# Clean every review with preprocessing().
# NOTE: this overwrites the `text` column in place, so running the cell a
# second time feeds already-cleaned text back through the tokenizer.
df["text"] = df["text"].map(preprocessing)
df.head()

Unnamed: 0,text,label
0,rock destined 21st century 's new `` conan `` ...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,"sometimes like go movies fun , wasabi good pla...",1
4,"emerges something rare , issue movie 's honest...",1


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the `text` column into TF-IDF weights, capping the vocabulary at 1000
# terms and dropping scikit-learn's built-in English stop words.
tfidfV = TfidfVectorizer(stop_words="english", max_features=1000)
vecTor = tfidfV.fit_transform(df["text"])

# Both shapes should report the same number of rows (one vector per review).
print(df.shape, vecTor.shape, sep="\n")

(8530, 2)
(8530, 1000)


In [14]:
# Printing the sparse matrix lists one "(row, column)  weight" line per stored
# entry; long output is elided in the middle.
print(vecTor)

  (0, 721)	0.4474716137670591
  (0, 103)	0.46834078770650434
  (0, 578)	0.3307567469796541
  (0, 359)	0.3849265892221217
  (0, 514)	0.31219598482886696
  (0, 815)	0.47475385594763736
  (1, 412)	0.38450556007151104
  (1, 980)	0.37884836214972395
  (1, 989)	0.31158285934143953
  (1, 206)	0.2425921087111874
  (1, 627)	0.38757460788291465
  (1, 449)	0.37884836214972395
  (1, 941)	0.36905883971096276
  (1, 542)	0.35035695794191396
  (2, 228)	1.0
  (3, 489)	0.28795883654000126
  (3, 563)	0.38384800275006165
  (3, 336)	0.39722895386713963
  (3, 361)	0.3352443697769443
  (3, 632)	0.4714126930796241
  (3, 812)	0.5266221252370725
  (4, 489)	0.29419888672668854
  (4, 689)	0.4945876143555651
  (4, 562)	0.2434748532773989
  (4, 405)	0.5267004186928492
  :	:
  (8525, 626)	0.43905388926899785
  (8525, 109)	0.307872298893213
  (8525, 730)	0.4426722043856351
  (8525, 872)	0.3879750959502483
  (8526, 489)	0.26289706866781365
  (8526, 562)	0.21756990970675746
  (8526, 585)	0.46180297422243
  (8526, 499)	