# First Annotation with VADER

The samples from the previous file are initially assigned to a sentiment class using VADER, which takes a lexicon-based approach. There are three sentiment classes (positive, neutral and negative).

In [4]:
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk

In [5]:
# Fetch the NLTK resources needed for tokenizing, stop-word filtering and lemmatizing.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the exported comments into the working DataFrame.
file_path = 'Export_csv/ImportantCSV/comments_v2.csv'
df = pd.read_csv(file_path, sep=',')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tobiasbronold/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tobiasbronold/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tobiasbronold/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tobiasbronold/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tobiasbronold/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tobiasbronold/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Use Vader for labeling
The text is cleaned up and processed (tokenization, stop-word removal, lemmatization and stemming). The sentiment of the processed text is then scored by VADER's `SentimentIntensityAnalyzer`, and the processed texts and their sentiment scores are stored as new columns in the DataFrame.

In [6]:
# NLP helpers, created once at module level and shared by the functions in this cell.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()

_ENGLISH_STOPWORDS = None  # filled lazily on the first call to preprocess_text

def preprocess_text(text):
    """Tokenize a comment, drop English stop words, then lemmatize and stem.

    Parameters
    ----------
    text : str
        Raw comment text. Non-string values (e.g. the float NaN pandas
        produces for an empty CSV cell) are treated as empty text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; '' for non-string input.
    """
    global _ENGLISH_STOPWORDS

    # word_tokenize expects a string; a missing value would raise here.
    if not isinstance(text, str):
        return ''

    # The stop-word list is loop-invariant: fetch it once and keep it as a
    # set instead of calling stopwords.words('english') for every token.
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    tokens = [
        word for word in word_tokenize(text)
        if word.lower() not in _ENGLISH_STOPWORDS
    ]
    # Same order as before: lemmatize each token first, then stem the lemma.
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]
    return ' '.join(tokens)

# Run the preprocessing on every comment and keep the result next to the raw text.
df['processed_text'] = df['text'].map(preprocess_text)

def analyze_sentiment(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']

# Score the preprocessed text and show raw text, processed text and score together.
# NOTE(review): the scores are computed on the stemmed, stop-word-free text;
# VADER's lexicon and negation handling presumably work best on the raw
# comment -- consider scoring df['text'] instead and confirm on a sample.
df['sentiment_score'] = df['processed_text'].map(analyze_sentiment)
print(df.loc[:, ['text', 'processed_text', 'sentiment_score']])

                                                    text  \
0                               2025 and still laughing.   
1      DECADES AGO-  Saturday night live used to be e...   
2                       NY by way of 2nd city and Canada   
3      &quot;RACIST&quot; from Kentucky here in &#39;...   
4      Good to know if you don’t vote the way dems wa...   
...                                                  ...   
62125                                             See ya   
62126                                             So sad   
62127  Come to Britain. You can live in your socialis...   
62128                                                 😂😂   
62129                                          Goodbye..   

                                          processed_text  sentiment_score  
0                                     2025 still laugh .           0.5574  
1      decad ago- saturday night live use entertain ....          -0.7269  
2                                 ny way 2nd citi c

# ATTENTION AND TODO
*Manual labeling is required here. The corrected labels must be stored in the `manuell` column of the CSV (this is the column name the code below reads).*

This code classifies sentiment scores as positive, negative or neutral, compares the classification with a manual assessment and checks whether they match. The number of correct predictions and the accuracy are then calculated and output.

In [7]:
def get_vader_sentiment(score, threshold=0.05):
    """Map a VADER compound score to a sentiment class.

    Uses the inclusive cut-offs recommended by the VADER documentation:
    positive if ``score >= threshold``, negative if ``score <= -threshold``,
    neutral otherwise.

    Parameters
    ----------
    score : float
        Compound polarity score.
    threshold : float, optional
        Absolute cut-off separating neutral from positive/negative
        (default 0.05).

    Returns
    -------
    str
        'positive', 'negative' or 'neutral'.
    """
    if score >= threshold:
        return 'positive'
    if score <= -threshold:
        return 'negative'
    # Everything strictly inside (-threshold, threshold) is neutral; a NaN
    # score also ends up here because both comparisons above are False.
    return 'neutral'

df['vader_sentiment'] = df['sentiment_score'].apply(get_vader_sentiment)

# The manual labels have to be added to the CSV by hand; fail with an
# actionable message instead of a bare KeyError when they are missing.
if 'manuell' not in df.columns:
    raise KeyError(
        "Column 'manuell' not found: add the manually corrected labels "
        "(positive / neutral / negative) to the CSV before running this cell."
    )

df['correctly_labeled'] = df['vader_sentiment'] == df['manuell']

# Rows without a manual label compare as False above; leave them out of the
# accuracy so unlabeled rows are not counted as wrong predictions.
labeled_mask = df['manuell'].notna()
correctly_labeled_count = int(df.loc[labeled_mask, 'correctly_labeled'].sum())
labeled_count = int(labeled_mask.sum())
total_count = len(df)

print(f'Korrekte Vorhersagen: {correctly_labeled_count}')
print(f'Datensätze insgesamt: {total_count}')
print(f'Manuell gelabelte Datensätze: {labeled_count}')
if labeled_count:
    print(f'Accuracy: {correctly_labeled_count / labeled_count:.2%}')
else:
    # Nothing labeled yet: avoid a ZeroDivisionError.
    print('Accuracy: n/a')

KeyError: 'manuell'