In [7]:
%pip install nltk huggingface datasets huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [None]:
# NLTK (Natural Language Toolkit) supplies both the resource downloader used
# here and the VADER sentiment analyzer imported in a later cell.
import nltk

# Fetch the "vader_lexicon" resource through NLTK's downloader.
# Presumably this is the lexicon SentimentIntensityAnalyzer reads when it is
# constructed, so this cell has to run before the analyzer is created.
nltk.download("vader_lexicon")


In [3]:
# VADER sentiment analyzer class shipped with NLTK
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One shared analyzer instance; later cells call sa.polarity_scores(...) on it
sa = SentimentIntensityAnalyzer()


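The **SentimentIntensityAnalyzer** uses a pre-built lexicon (VADER) that maps words to sentiment scores. It applies rules on top of this lexicon to determine how positive, negative, or neutral a text is. Its `polarity_scores` method returns a dictionary with four keys: `neg`, `neu`, and `pos` give the proportion of the text that is negative, neutral, and positive, while `compound` is a single overall score normalized to the range -1 (most negative) to +1 (most positive).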


In [4]:
# Quick sanity check of the analyzer on a short sample sentence
text = "this is a very good product"
# Last expression of the cell, so the returned score dictionary is displayed
sa.polarity_scores(text)

{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4927}

In [5]:
# Standard library: gives access to the process environment variables
import os

# python-dotenv helper that reads KEY=VALUE pairs from a .env file
from dotenv import load_dotenv

# Load the variables declared in a .env file into the process environment
load_dotenv()

# Look up the Hugging Face access token; the result is None when the variable is not set
Hugginface_token = os.environ.get('HUGGINGFACE_TOKEN')


In [None]:
# 'login' authenticates this session against the Hugging Face Hub
from huggingface_hub import login

# Authenticate with the token read from the HUGGINGFACE_TOKEN environment variable.
# NOTE(review): if that variable is unset, Hugginface_token is None here —
# confirm the fallback behaviour of login(token=None) is what is wanted.
login(token=Hugginface_token)


In [9]:
# 'load_dataset' loads a dataset by name through the Hugging Face 'datasets' library
from datasets import load_dataset

# Load the dataset registered under the name "imdb"; the displayed result is a
# DatasetDict with 'train', 'test' and 'unsupervised' splits
imdb_dataset = load_dataset("imdb")

# Last expression of the cell: show the splits, their features and row counts
imdb_dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [10]:
import numpy as np  # NumPy supplies the seeded random generator used for sampling

# Fixed seed so that "Restart Kernel -> Run All" selects the same reviews every time
SAMPLE_SEED = 42
# Number of reviews to draw from the training split
SAMPLE_SIZE = 10

# Draw SAMPLE_SIZE distinct row indices in [0, len(train)) from the 'train' split of imdb_dataset.
# replace=False guarantees that no review is picked twice.
rng = np.random.default_rng(SAMPLE_SEED)
rand_int = rng.choice(len(imdb_dataset['train']), size=SAMPLE_SIZE, replace=False)

# Output the sampled indices
rand_int


array([ 7472, 13226,  6012, 13338, 19104, 17832,  4097, 15956, 21091,
        5808])

In [14]:
# Select the rows of the 'train' split at the positions listed in 'rand_int',
# then keep only the 'text' field of those rows
train_split = imdb_dataset['train']
sampled_rows = train_split[rand_int]
summary_data = sampled_rows['text']

# Show the sampled review texts
summary_data


['Before I give Spike Lee\'s mess of a film SUMMER OF SAM a well-deserved thrashing, I would like to make one thing clear. I do not revile this film simply for its abundance of sleazy and unpleasant images. What makes this film so unwatchable is the fact that Lee seems to believe that SUMMER OF SAM should be taken seriously as a socially enlightening drama. The crime caper films of Quentin Tarantino, for example, are filled with violence, profanity, and other sleaze, but are nonetheless highly watchable because Tarantino does not attempt to pass these films off as socially redeeming works of art. He knows that such films are for entertainment value only. On the other hand, serious dramas such as SAVING PRIVATE RYAN and SCHINDLER\'S LIST are often unpleasant to watch, but the unpleasantness serves to develop the film\'s plot and characters, with the end goal of getting the audience emotionally involved with the story and characters onscreen. SUMMER OF SAM, unfortunately, merely wallows 

In [15]:
import pandas as pd  # pandas provides the DataFrame used for the rest of the analysis

# Build a one-column DataFrame: column 'comment' holds the sampled texts in 'summary_data'
df = pd.DataFrame(dict(comment=summary_data))

# Preview the first five rows
df.head()


Unnamed: 0,comment
0,Before I give Spike Lee's mess of a film SUMME...
1,"Richard Chamberlain is David Burton, a tax law..."
2,"This norwegian movie is so crap, the actors ca..."
3,"A long time ago, in a galaxy far, far away......."
4,Methinks the best screen version of Quo Vadis?...


In [17]:
# Score every comment exactly once with the analyzer 'sa' and store the returned
# dictionary (keys 'neg', 'neu', 'pos', 'compound') in a new 'score' column.
# str() makes sure polarity_scores always receives a string.
df['score'] = df['comment'].apply(lambda comment: sa.polarity_scores(str(comment)))
df.head()

Unnamed: 0,comment,score
0,Before I give Spike Lee's mess of a film SUMME...,"{'neg': 0.166, 'neu': 0.724, 'pos': 0.109, 'co..."
1,"Richard Chamberlain is David Burton, a tax law...","{'neg': 0.118, 'neu': 0.797, 'pos': 0.085, 'co..."
2,"This norwegian movie is so crap, the actors ca...","{'neg': 0.057, 'neu': 0.761, 'pos': 0.182, 'co..."
3,"A long time ago, in a galaxy far, far away.......","{'neg': 0.074, 'neu': 0.776, 'pos': 0.149, 'co..."
4,Methinks the best screen version of Quo Vadis?...,"{'neg': 0.046, 'neu': 0.737, 'pos': 0.217, 'co..."


In [18]:
# Pull the 'neg' entry out of each row's score dictionary into its own 'negScore' column
df['negScore'] = [entry['neg'] for entry in df['score']]

# Preview the first five rows, now including 'negScore'
df.head()


Unnamed: 0,comment,score,negScore
0,Before I give Spike Lee's mess of a film SUMME...,"{'neg': 0.166, 'neu': 0.724, 'pos': 0.109, 'co...",0.166
1,"Richard Chamberlain is David Burton, a tax law...","{'neg': 0.118, 'neu': 0.797, 'pos': 0.085, 'co...",0.118
2,"This norwegian movie is so crap, the actors ca...","{'neg': 0.057, 'neu': 0.761, 'pos': 0.182, 'co...",0.057
3,"A long time ago, in a galaxy far, far away.......","{'neg': 0.074, 'neu': 0.776, 'pos': 0.149, 'co...",0.074
4,Methinks the best screen version of Quo Vadis?...,"{'neg': 0.046, 'neu': 0.737, 'pos': 0.217, 'co...",0.046


In [19]:
# Pull the 'compound' entry out of each row's score dictionary into its own 'compound' column
df['compound'] = df['score'].map(lambda entry: entry['compound'])

# Preview the first five rows, now including 'compound'
df.head()


Unnamed: 0,comment,score,negScore,compound
0,Before I give Spike Lee's mess of a film SUMME...,"{'neg': 0.166, 'neu': 0.724, 'pos': 0.109, 'co...",0.166,-0.9851
1,"Richard Chamberlain is David Burton, a tax law...","{'neg': 0.118, 'neu': 0.797, 'pos': 0.085, 'co...",0.118,-0.9686
2,"This norwegian movie is so crap, the actors ca...","{'neg': 0.057, 'neu': 0.761, 'pos': 0.182, 'co...",0.057,0.7526
3,"A long time ago, in a galaxy far, far away.......","{'neg': 0.074, 'neu': 0.776, 'pos': 0.149, 'co...",0.074,0.9902
4,Methinks the best screen version of Quo Vadis?...,"{'neg': 0.046, 'neu': 0.737, 'pos': 0.217, 'co...",0.046,0.9851


In [20]:
# Each rule pairs a boolean row mask on the 'compound' score with the label
# written into the 'type' column for the rows that match:
#   compound > 0  -> "positive"
#   compound == 0 -> "Neutral"
#   compound < 0  -> "negative"
sentiment_rules = [
    (df['compound'] > 0, "positive"),
    (df['compound'] == 0, "Neutral"),
    (df['compound'] < 0, "negative"),
]

# Apply the rules in the order listed above
for mask, label in sentiment_rules:
    df.loc[mask, 'type'] = label

# Preview the first five rows, now including 'type'
df.head()


Unnamed: 0,comment,score,negScore,compound,type
0,Before I give Spike Lee's mess of a film SUMME...,"{'neg': 0.166, 'neu': 0.724, 'pos': 0.109, 'co...",0.166,-0.9851,negative
1,"Richard Chamberlain is David Burton, a tax law...","{'neg': 0.118, 'neu': 0.797, 'pos': 0.085, 'co...",0.118,-0.9686,negative
2,"This norwegian movie is so crap, the actors ca...","{'neg': 0.057, 'neu': 0.761, 'pos': 0.182, 'co...",0.057,0.7526,positive
3,"A long time ago, in a galaxy far, far away.......","{'neg': 0.074, 'neu': 0.776, 'pos': 0.149, 'co...",0.074,0.9902,positive
4,Methinks the best screen version of Quo Vadis?...,"{'neg': 0.046, 'neu': 0.737, 'pos': 0.217, 'co...",0.046,0.9851,positive
