In [4]:
!pip install afinn

Collecting afinn
  Downloading afinn-0.1.tar.gz (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25ldone
[?25h  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53429 sha256=da3ae55b7842fcef490683633d0b4584d840f3fe926cc83645a969170aec1fb7
  Stored in directory: /root/.cache/pip/wheels/b0/05/90/43f79196199a138fb486902fceca30a2d1b5228e6d2db8eb90
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


------------------------ Afinn with stop word removal---------------------------------

In [19]:
from afinn import Afinn
import nltk
import time
from nltk.corpus import stopwords
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Instantiate the AFINN object
afinn = Afinn()

# Remove stopwords
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    words = review.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = remove_stopwords(review)
    sentiment_score = afinn.score(text_without_stopwords)
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"


test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")
# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()


# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical
print("Missing values in Predicted_Sentiment column:", test_data['Predicted_Sentiment'].isnull().sum())
print("Missing values in sentiment column:", test_data['sentiment'].isnull().sum())
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Missing values in Predicted_Sentiment column: 0
Missing values in sentiment column: 0
              precision    recall  f1-score   support

           0       0.80      0.53      0.64      9935
           1       0.65      0.87      0.75     10065

    accuracy                           0.70     20000
   macro avg       0.73      0.70      0.69     20000
weighted avg       0.73      0.70      0.69     20000

Time taken: 93.36256527900696 seconds


In [20]:
# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)

Accuracy: 0.70195


------------------------ Afinn with stop word removal + tokenization + pos ---------------------------------


In [21]:
from afinn import Afinn
import time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics import accuracy_score

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Instantiate the AFINN object
afinn = Afinn()

# Remove stopwords and perform POS tagging
def preprocess(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # POS tagging
    pos_tags = nltk.pos_tag(filtered_words)
    return pos_tags

# Function to predict sentiment
def predict_sentiment(review):
    pos_tags = preprocess(review)
    sentiment_score = 0
    for word, pos in pos_tags:
        if pos.startswith('JJ'):  # Consider only adjectives for sentiment scoring
            sentiment_score += afinn.score(word)
    if sentiment_score >= 0:
        return "positive"
    else:
        return "negative"

# Read test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")
# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# End timer and calculate time taken
end_time = time.time()


# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Accuracy: 0.70065
              precision    recall  f1-score   support

           0       0.83      0.50      0.63      9935
           1       0.65      0.90      0.75     10065

    accuracy                           0.70     20000
   macro avg       0.74      0.70      0.69     20000
weighted avg       0.74      0.70      0.69     20000

Time taken: 283.604065656662 seconds


------------------------ Afinn with stop word removal + tokeizatiom ---------------------------------

In [22]:
from afinn import Afinn
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Instantiate the AFINN object
afinn = Afinn()

# Remove stopwords and tokenize
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the input review
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = remove_stopwords(review)
    sentiment_score = afinn.score(text_without_stopwords)
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")
# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()


# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Check for missing values
print("Missing values in Predicted_Sentiment column:", test_data['Predicted_Sentiment'].isnull().sum())
print("Missing values in sentiment column:", test_data['sentiment'].isnull().sum())

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Missing values in Predicted_Sentiment column: 0
Missing values in sentiment column: 0
Accuracy: 0.70125
              precision    recall  f1-score   support

           0       0.80      0.53      0.64      9935
           1       0.65      0.87      0.75     10065

    accuracy                           0.70     20000
   macro avg       0.73      0.70      0.69     20000
weighted avg       0.73      0.70      0.69     20000

Time taken: 131.09869074821472 seconds


------------------------ Afinn with stop word removal + tokeizatiom + porter stemmer ---------------------------------

In [23]:
from afinn import Afinn
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.metrics import accuracy_score
import re

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Instantiate the AFINN object
afinn = Afinn()

# Initialize Porter Stemmer
porter = PorterStemmer()

# Remove stopwords, punctuation, special characters, and numbers, apply Porter stemming, and tokenize
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)  # Tokenize the input text
    filtered_words = [porter.stem(word.lower()) for word in words if word.lower() not in stop_words and word.isalnum()]  # Remove stopwords, non-alphanumeric characters, apply Porter stemming, and convert to lowercase
    return filtered_words

# Function to predict sentiment
def predict_sentiment(review):
    processed_text = preprocess_text(review)
    text_without_stopwords = ' '.join(processed_text)
    sentiment_score = afinn.score(text_without_stopwords)
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()


# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Check for missing values
print("Missing values in Predicted_Sentiment column:", test_data['Predicted_Sentiment'].isnull().sum())
print("Missing values in sentiment column:", test_data['sentiment'].isnull().sum())

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Missing values in Predicted_Sentiment column: 0
Missing values in sentiment column: 0
Accuracy: 0.6571
              precision    recall  f1-score   support

           0       0.75      0.46      0.57      9935
           1       0.62      0.85      0.71     10065

    accuracy                           0.66     20000
   macro avg       0.68      0.66      0.64     20000
weighted avg       0.68      0.66      0.64     20000

Time taken: 187.31321787834167 seconds


------------------------ Afinn with stop word removal + tokeizatiom + Lancaster  stemmer ---------------------------------

In [24]:
from nltk.stem import LancasterStemmer
from afinn import Afinn
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.metrics import accuracy_score
import re

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Instantiate the AFINN object
afinn = Afinn()

# Initialize Porter Stemmer
porter =  LancasterStemmer()

# Remove stopwords, punctuation, special characters, and numbers, apply Porter stemming, and tokenize
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)  # Tokenize the input text
    filtered_words = [porter.stem(word.lower()) for word in words if word.lower() not in stop_words and word.isalnum()]  # Remove stopwords, non-alphanumeric characters, apply Porter stemming, and convert to lowercase
    return filtered_words

# Function to predict sentiment
def predict_sentiment(review):
    processed_text = preprocess_text(review)
    text_without_stopwords = ' '.join(processed_text)
    sentiment_score = afinn.score(text_without_stopwords)
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# End timer and calculate time taken
end_time = time.time()


# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Check for missing values
print("Missing values in Predicted_Sentiment column:", test_data['Predicted_Sentiment'].isnull().sum())
print("Missing values in sentiment column:", test_data['sentiment'].isnull().sum())

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Missing values in Predicted_Sentiment column: 0
Missing values in sentiment column: 0
Accuracy: 0.68435
              precision    recall  f1-score   support

           0       0.71      0.62      0.66      9935
           1       0.67      0.75      0.70     10065

    accuracy                           0.68     20000
   macro avg       0.69      0.68      0.68     20000
weighted avg       0.69      0.68      0.68     20000

Time taken: 163.92923212051392 seconds


------------------------ Afinn with stop word removal + without tokeizatiom + Lancaster  stemmer ---------------------------------

In [25]:
from nltk.stem import LancasterStemmer
from afinn import Afinn
import nltk
import time
from nltk.corpus import stopwords
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Instantiate the AFINN object
afinn = Afinn()

# Initialize Lancaster Stemmer
lancaster = LancasterStemmer()

# Remove stopwords, punctuation, special characters, and numbers, apply Lancaster stemming
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()  # Split the input text
    filtered_words = [lancaster.stem(word.lower()) for word in words if word.lower() not in stop_words and word.isalnum()]  # Remove stopwords, non-alphanumeric characters, apply Lancaster stemming, and convert to lowercase
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    processed_text = preprocess_text(review)
    sentiment_score = afinn.score(processed_text)
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# End timer and calculate time taken
end_time = time.time()


# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Check for missing values
print("Missing values in Predicted_Sentiment column:", test_data['Predicted_Sentiment'].isnull().sum())
print("Missing values in sentiment column:", test_data['sentiment'].isnull().sum())

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Missing values in Predicted_Sentiment column: 0
Missing values in sentiment column: 0
Accuracy: 0.66795
              precision    recall  f1-score   support

           0       0.69      0.59      0.64      9935
           1       0.65      0.74      0.69     10065

    accuracy                           0.67     20000
   macro avg       0.67      0.67      0.67     20000
weighted avg       0.67      0.67      0.67     20000

Time taken: 96.71896886825562 seconds


------------------------ Afinn with stop word removal + tokeizatiom + snowball  stemmer ---------------------------------

In [26]:
from nltk.stem import LancasterStemmer
from afinn import Afinn
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
import pandas as pd
from sklearn.metrics import accuracy_score
import re

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Instantiate the AFINN object
afinn = Afinn()

# Initialize Porter Stemmer
porter = SnowballStemmer('english')

# Remove stopwords, punctuation, special characters, and numbers, apply Porter stemming, and tokenize
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)  # Tokenize the input text
    filtered_words = [porter.stem(word.lower()) for word in words if word.lower() not in stop_words and word.isalnum()]  # Remove stopwords, non-alphanumeric characters, apply Porter stemming, and convert to lowercase
    return filtered_words

# Function to predict sentiment
def predict_sentiment(review):
    processed_text = preprocess_text(review)
    text_without_stopwords = ' '.join(processed_text)
    sentiment_score = afinn.score(text_without_stopwords)
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")
# Start timer
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Check for missing values
print("Missing values in Predicted_Sentiment column:", test_data['Predicted_Sentiment'].isnull().sum())
print("Missing values in sentiment column:", test_data['sentiment'].isnull().sum())

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Missing values in Predicted_Sentiment column: 0
Missing values in sentiment column: 0
Accuracy: 0.66065
              precision    recall  f1-score   support

           0       0.76      0.46      0.58      9935
           1       0.62      0.86      0.72     10065

    accuracy                           0.66     20000
   macro avg       0.69      0.66      0.65     20000
weighted avg       0.69      0.66      0.65     20000

Time taken: 160.94838523864746 seconds


with spacy

In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
from afinn import Afinn
import pandas as pd
import spacy
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Download spaCy model for English language
spacy.cli.download("en_core_web_sm")

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Instantiate the AFINN object
afinn = Afinn()

# Function to preprocess text
def preprocess(review):
    # Tokenize the review
    words = word_tokenize(review)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    
    # Lemmatize using spaCy
    lemmatized_words = [token.lemma_ for token in nlp(" ".join(filtered_words))]
    
    return lemmatized_words

# Function to predict sentiment
def predict_sentiment(review):
    words = preprocess(review)
    sentiment_score = sum(afinn.score(word) for word in words)
    if sentiment_score >= 0:
        return "positive"
    else:
        return "negative"

# Read test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)

# End timer and calculate time taken
end_time = time.time()
print("Time taken:", end_time - start_time, "seconds")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Accuracy: 0.70485
Time taken: 738.4201760292053 seconds


In [6]:
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))

              precision    recall  f1-score   support

           0       0.80      0.54      0.64      9935
           1       0.66      0.87      0.75     10065

    accuracy                           0.70     20000
   macro avg       0.73      0.70      0.70     20000
weighted avg       0.73      0.70      0.70     20000



--------------------------------------- Testblob with stop word removal --------------------------------

In [27]:
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import pandas as pd
import time
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Remove stopwords
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    words = review.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = remove_stopwords(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)

from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Accuracy: 0.70535
              precision    recall  f1-score   support

           0       0.88      0.47      0.61      9935
           1       0.64      0.94      0.76     10065

    accuracy                           0.71     20000
   macro avg       0.76      0.70      0.69     20000
weighted avg       0.76      0.71      0.69     20000

Time taken: 31.312926054000854 seconds


--------------------------------------- Testblob with stop word removal + tokeizatiom ---------------------------------

In [28]:
from textblob import TextBlob
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Remove stopwords and tokenize
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the input review
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = remove_stopwords(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# End timer and calculate time taken
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.7099
              precision    recall  f1-score   support

           0       0.89      0.48      0.62      9935
           1       0.65      0.94      0.77     10065

    accuracy                           0.71     20000
   macro avg       0.77      0.71      0.69     20000
weighted avg       0.77      0.71      0.69     20000

Time taken: 75.81169414520264 seconds


--------------------------------------- Testblob with stop word removal + tokeizatiom + pos ---------------------------------

In [29]:
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import time
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords and punkt tokenizer if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Remove stopwords, tokenize, and perform POS tagging
def preprocess_text(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the input review
    words_pos = nltk.pos_tag(words)  # Perform POS tagging
    filtered_words = [word for word, pos in words_pos if word.lower() not in stop_words and pos.startswith('N')]  # Remove stopwords and keep only nouns
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = preprocess_text(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Accuracy: 0.60855
              precision    recall  f1-score   support

           0       0.65      0.46      0.54      9935
           1       0.59      0.76      0.66     10065

    accuracy                           0.61     20000
   macro avg       0.62      0.61      0.60     20000
weighted avg       0.62      0.61      0.60     20000

Time taken: 374.55364751815796 seconds


--------------------------------------- Testblob with stop word removal + tokeizatiom + pos + porter stemmer ---------------------------------

In [30]:
from textblob import TextBlob
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords and punkt tokenizer if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Initialize Porter Stemmer
porter = PorterStemmer()

# Remove stopwords, tokenize, and apply Porter stemming
def preprocess_text(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the input review
    filtered_words = [porter.stem(word.lower()) for word in words if word.lower() not in stop_words]  # Remove stopwords and apply Porter stemming
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = preprocess_text(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")
# Start timer
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# End timer and calculate time taken
end_time = time.time()
# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.65375
              precision    recall  f1-score   support

           0       0.80      0.40      0.53      9935
           1       0.60      0.90      0.72     10065

    accuracy                           0.65     20000
   macro avg       0.70      0.65      0.63     20000
weighted avg       0.70      0.65      0.63     20000

Time taken: 146.8269078731537 seconds


--------------------------------------- Testblob with stop word removal + tokeizatiom + pos +  LancasterStemmer ---------------------------------


In [31]:
from textblob import TextBlob
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords and punkt tokenizer if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Initialize Lancaster Stemmer
lancaster = LancasterStemmer()

# Remove stopwords, tokenize, perform POS tagging, and apply Lancaster stemming
def preprocess_text(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the input review
    filtered_words = [lancaster.stem(word.lower()) for word in words if word.lower() not in stop_words]  # Remove stopwords and apply Porter stemming
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = preprocess_text(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")
# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# End timer and calculate time taken
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Accuracy: 0.6676
              precision    recall  f1-score   support

           0       0.75      0.50      0.60      9935
           1       0.63      0.83      0.72     10065

    accuracy                           0.67     20000
   macro avg       0.69      0.67      0.66     20000
weighted avg       0.69      0.67      0.66     20000

Time taken: 118.22693824768066 seconds


--------------------------------------- Testblob with stop word removal + tokeizatiom + pos + SnowballStemmer

In [32]:
from textblob import TextBlob
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import pandas as pd
from sklearn.metrics import accuracy_score

# Download stopwords and punkt tokenizer if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Initialize Snowball Stemmer
snowball = SnowballStemmer(language='english')

# Remove stopwords, tokenize, perform POS tagging, and apply Snowball stemming
def preprocess_text(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the input review
    filtered_words = [snowball.stem(word.lower()) for word in words if word.lower() not in stop_words]  # Remove stopwords and apply Porter stemming
    return ' '.join(filtered_words)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = preprocess_text(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Start timer
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# End timer and calculate time taken
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Accuracy: 0.65515
              precision    recall  f1-score   support

           0       0.81      0.40      0.53      9935
           1       0.60      0.91      0.73     10065

    accuracy                           0.66     20000
   macro avg       0.71      0.65      0.63     20000
weighted avg       0.71      0.66      0.63     20000

Time taken: 116.20505619049072 seconds


spacy

In [8]:
import time
import pandas as pd
import spacy
from textblob import TextBlob
from sklearn.metrics import accuracy_score, classification_report

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text using spaCy for tokenization and lemmatization
def preprocess_text(review):
    # Tokenize the review
    doc = nlp(review)
    # Lemmatize and remove stopwords
    processed_text = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(processed_text)

# Function to predict sentiment
def predict_sentiment(review):
    text_without_stopwords = preprocess_text(review)
    blob = TextBlob(text_without_stopwords)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score >= 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")

# Measure time taken
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}  # Define mapping
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)  # Convert test_data['sentiment'] to numerical
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)  # Convert predicted sentiments to numerical

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])

# Print classification report
classification_rep = classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'], target_names=["negative", "positive"])
print("Classification Report:")
print(classification_rep)

end_time = time.time()
execution_time = end_time - start_time

print("Accuracy:", accuracy)
print("Time taken:", execution_time, "seconds")


Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.53      0.66      9935
    positive       0.67      0.91      0.77     10065

    accuracy                           0.73     20000
   macro avg       0.76      0.72      0.71     20000
weighted avg       0.76      0.73      0.71     20000

Accuracy: 0.7253
Time taken: 900.827437877655 seconds


In [18]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
import time
# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Get the polarity scores for the review
    scores = sid.polarity_scores(review)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Measure time taken
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Record end time
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Accuracy: 0.6998
              precision    recall  f1-score   support

           0       0.79      0.53      0.64      9935
           1       0.65      0.86      0.74     10065

    accuracy                           0.70     20000
   macro avg       0.72      0.70      0.69     20000
weighted avg       0.72      0.70      0.69     20000

Time taken: 65.13692378997803 seconds


------------------- vader+stopword removal--------------------------

In [17]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import time
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Remove stopwords
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    words = review.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Remove stopwords
    review_without_stopwords = remove_stopwords(review)
    
    # Get the polarity scores for the review
    scores = sid.polarity_scores(review_without_stopwords)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Measure time taken
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Record end time
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Accuracy: 0.67825
              precision    recall  f1-score   support

           0       0.79      0.48      0.60      9935
           1       0.63      0.88      0.73     10065

    accuracy                           0.68     20000
   macro avg       0.71      0.68      0.66     20000
weighted avg       0.71      0.68      0.66     20000

Time taken: 48.84004282951355 seconds


------------------- vader+stopword removal + tokenization --------------------------

In [16]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time
from sklearn.metrics import accuracy_score

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Remove stopwords
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the review text
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Remove stopwords and tokenize the review
    review_without_stopwords = remove_stopwords(review)
    
    # Get the polarity scores for the review
    scores = sid.polarity_scores(review_without_stopwords)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Measure time taken
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Record end time
end_time = time.time()
# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Accuracy: 0.68105
              precision    recall  f1-score   support

           0       0.79      0.49      0.60      9935
           1       0.63      0.87      0.73     10065

    accuracy                           0.68     20000
   macro avg       0.71      0.68      0.67     20000
weighted avg       0.71      0.68      0.67     20000

Time taken: 89.58695030212402 seconds


In [15]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re  # Import regular expression module
from sklearn.metrics import accuracy_score
import time

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Remove stopwords, punctuation, special characters, and numbers
def remove_noise(review):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(review)  # Tokenize the review text
    cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word.isalpha()]  # Keep only alphabetic words
    return ' '.join(cleaned_words)

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Remove noise and tokenize the review
    review_cleaned = remove_noise(review)
    
    # Get the polarity scores for the review
    scores = sid.polarity_scores(review_cleaned)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Record start time
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# Record end time
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))
print("Time taken:", end_time - start_time, "seconds")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Accuracy: 0.6763
              precision    recall  f1-score   support

           0       0.79      0.48      0.59      9935
           1       0.63      0.87      0.73     10065

    accuracy                           0.68     20000
   macro avg       0.71      0.68      0.66     20000
weighted avg       0.71      0.68      0.66     20000

Time taken: 87.09370732307434 seconds


Porter stemmer

In [13]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer  # Import Porter stemmer
from sklearn.metrics import accuracy_score
import time

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Download Porter stemmer if not already downloaded
nltk.download('punkt')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Instantiate the PorterStemmer
ps = PorterStemmer()

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Tokenize the review text
    words = nltk.word_tokenize(review)
    
    # Apply Porter stemming to each word
    stemmed_words = [ps.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    stemmed_review = ' '.join(stemmed_words)
    
    # Get the polarity scores for the review
    scores = sid.polarity_scores(stemmed_review)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Record start time
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# Record end time
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.67275
              precision    recall  f1-score   support

           0       0.76      0.50      0.61      9935
           1       0.63      0.84      0.72     10065

    accuracy                           0.67     20000
   macro avg       0.69      0.67      0.66     20000
weighted avg       0.69      0.67      0.66     20000



In [14]:
print("Time taken:", end_time - start_time, "seconds")

Time taken: 207.6756112575531 seconds


--------------------------------------- Testblob with stop word removal + tokeizatiom + pos + Lancaster stemmer ---------------------

In [12]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import LancasterStemmer  # Import Lancaster stemmer
from sklearn.metrics import accuracy_score

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Download Lancaster stemmer if not already downloaded
nltk.download('punkt')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Instantiate the LancasterStemmer
ls = LancasterStemmer()

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Tokenize the review text
    words = nltk.word_tokenize(review)
    
    # Apply Lancaster stemming to each word
    stemmed_words = [ls.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    stemmed_review = ' '.join(stemmed_words)
    
    # Get the polarity scores for the review
    scores = sid.polarity_scores(stemmed_review)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Record start time
start_time = time.time()
# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)
# Record end time
end_time = time.time()
# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])
print("Accuracy:", accuracy)
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.69225
              precision    recall  f1-score   support

           0       0.72      0.63      0.67      9935
           1       0.67      0.76      0.71     10065

    accuracy                           0.69     20000
   macro avg       0.70      0.69      0.69     20000
weighted avg       0.70      0.69      0.69     20000



SnowballStemmer

In [11]:
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer  # Import Snowball stemmer
from sklearn.metrics import accuracy_score
import time

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Download Snowball stemmer if not already downloaded
nltk.download('punkt')

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Instantiate the SnowballStemmer
ss = SnowballStemmer('english')

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Tokenize the review text
    words = nltk.word_tokenize(review)
    
    # Apply Snowball stemming to each word
    stemmed_words = [ss.stem(word) for word in words]
    
    # Join the stemmed words back into a single string
    stemmed_review = ' '.join(stemmed_words)
    
    # Get the polarity scores for the review
    scores = sid.polarity_scores(stemmed_review)
    
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Record start time
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Record end time
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])

# Print accuracy and time taken
print("Accuracy:", accuracy)
print("Time taken:", end_time - start_time, "seconds")
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.6807
Time taken: 162.8048837184906 seconds
              precision    recall  f1-score   support

           0       0.75      0.53      0.62      9935
           1       0.64      0.83      0.72     10065

    accuracy                           0.68     20000
   macro avg       0.70      0.68      0.67     20000
weighted avg       0.70      0.68      0.67     20000



spacy

In [9]:
import pandas as pd
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
import time

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

# Instantiate the VADER SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to preprocess text using spaCy for tokenization and lemmatization
def preprocess_text(review):
    # Tokenize the review
    doc = nlp(review)
    # Lemmatize and remove stopwords
    processed_text = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(processed_text)

# Function to predict sentiment using VADER
def predict_sentiment(review):
    # Preprocess the review text
    processed_review = preprocess_text(review)
    # Get the polarity scores for the review
    scores = sid.polarity_scores(processed_review)
    # Determine sentiment based on the compound score
    if scores['compound'] >= 0:
        return "positive"
    else:
        return "negative"

# Load test data
test_data = pd.read_csv("/kaggle/input/lexicondataset1/test.csv/test.csv")  # Assuming the test data path is correct

# Record start time
start_time = time.time()

# Apply sentiment analysis on the test set
test_data['Predicted_Sentiment'] = test_data['review'].apply(predict_sentiment)

# Record end time
end_time = time.time()

# Map string labels to numerical labels
label_mapping = {"positive": 1, "negative": 0}
test_data['Sentiment_Num'] = test_data['sentiment'].map(label_mapping)
test_data['Predicted_Sentiment_Num'] = test_data['Predicted_Sentiment'].map(label_mapping)

# Calculate accuracy
accuracy = accuracy_score(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num'])

# Print accuracy and time taken
print("Accuracy:", accuracy)
print("Time taken:", end_time - start_time, "seconds")



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Accuracy: 0.6762
Time taken: 927.3920292854309 seconds


In [10]:
from sklearn.metrics import classification_report

# Print classification report
print(classification_report(test_data['Sentiment_Num'], test_data['Predicted_Sentiment_Num']))

              precision    recall  f1-score   support

           0       0.78      0.49      0.60      9935
           1       0.63      0.86      0.73     10065

    accuracy                           0.68     20000
   macro avg       0.70      0.67      0.66     20000
weighted avg       0.70      0.68      0.66     20000

