In [1]:
# Install and import nltk
!pip install nltk
import nltk




In [2]:
# Download the lexicon
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Tomek\AppData\Roaming\nltk_data...


True

In [3]:
# Import the lexicon 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
# Create an instance of SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()

In [5]:
# warmup sentecne
sentence = "Darth Vader is one of the most powerfull sith lords in the history"
print(sent_analyzer.polarity_scores(sentence))


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [6]:
sentence1 = "Darth Vader the most evil character in entire Star Wars saga"
print(sent_analyzer.polarity_scores(sentence1))

{'neg': 0.48, 'neu': 0.52, 'pos': 0.0, 'compound': -0.8516}


In [7]:
import pandas as pd

# Read the data set
data_url = "https://raw.githubusercontent.com/keitazoumana/VADER_sentiment-Analysis/main/data/testdata.manual.2009.06.14.csv"
sentiment_data = pd.read_csv(data_url)

sentiment_data.head(3)

Unnamed: 0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...


In [8]:

def format_data(data):

  last_col = str(data.columns[-1])
  first_col = str(data.columns[0])

  data.rename(columns = {last_col: 'tweet_text', first_col: 'polarity'}, inplace=True) 

  # Change 0, 2, 4 to negative, neutral and positive
  labels = {0: 'negative', 2: 'neutral', 4: 'positive'}
  data['polarity'] = data['polarity'].map(labels)

  # Get only the two columns
  return data[['tweet_text', 'polarity']]

# Apply the transformation
data = format_data(sentiment_data)
data.head(3)

Unnamed: 0,tweet_text,polarity
0,Reading my kindle2... Love it... Lee childs i...,positive
1,"Ok, first assesment of the #kindle2 ...it fuck...",positive
2,@kenburbary You'll love your Kindle2. I've had...,positive


In [9]:
def format_output(output_dict):
  
  polarity = "neutral"

  if(output_dict['compound']>= 0.05):
    polarity = "positive"

  elif(output_dict['compound']<= -0.05):
    polarity = "negative"

  return polarity

def predict_sentiment(text):
  
  output_dict =  sent_analyzer.polarity_scores(text)
  return format_output(output_dict)

# Run the predictions
data["vader_prediction"] = data["tweet_text"].apply(predict_sentiment)

# Show 5 random rows of the data
data.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["vader_prediction"] = data["tweet_text"].apply(predict_sentiment)


Unnamed: 0,tweet_text,polarity,vader_prediction
100,Life?s a bitch? and so is Dick Cheney. #p2 #bi...,negative,negative
305,I'm really loving the new search site Wolfram/...,positive,positive
206,All-Star Basketball Classic Tuesday Features T...,neutral,positive
390,could time-warner cable suck more? NO.,negative,negative
14,"#lebron best athlete of our generation, if not...",positive,positive


In [10]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(data['polarity'], data['vader_prediction'])

print("Accuracy: {}\n".format(accuracy))

# Show the classification report
print(classification_report(data['polarity'], data['vader_prediction']))

Accuracy: 0.716297786720322

              precision    recall  f1-score   support

    negative       0.84      0.64      0.72       177
     neutral       0.67      0.70      0.68       139
    positive       0.67      0.81      0.73       181

    accuracy                           0.72       497
   macro avg       0.73      0.71      0.71       497
weighted avg       0.73      0.72      0.72       497

