In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, classification_report

# Initialize VADER Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to calculate sentiment scores of the sentence
def sentiment_scores(sentence):
    # polarity_scores method of SentimentIntensityAnalyzer object gives a sentiment dictionary
    sentiment_dict = analyzer.polarity_scores(sentence)
    
    # Normalize positive and negative scores so they add up to 1
    pos_score = sentiment_dict['pos']
    neg_score = sentiment_dict['neg']
    total_score = pos_score + neg_score

    if total_score != 0:
        pos_score /= total_score
        neg_score /= total_score
    
    if sentiment_dict['compound'] >= 0.05:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    return {
        "Tweet": sentence,
        "Negative": neg_score,
        "Positive": pos_score,
        "Overall Sentiment": sentiment
    }

# Function to determine the overall sentiment using VADER
def text_sentiment_vader(text):
    vs = analyzer.polarity_scores(text)
    return int(vs.get("compound") > 0)  # 1 for Positive, 0 for Negative

# Read the CSV file into a DataFrame
data_path = r"D:\uni dina\PSM2\_CODING\DATASET\preprocessed_data_v2.csv"
df = pd.read_csv(data_path)

# List to store sentiment scores
sentiment_data = []

# Apply VADER sentiment analysis to each tweet
for index, row in df.iterrows():
    tweet = row['tweet_content']
    sentiment_data.append(sentiment_scores(tweet))

# Convert the list of dictionaries to a DataFrame
sentiment_df = pd.DataFrame(sentiment_data)

# Save the DataFrame to a new CSV file
output_path = r"D:\uni dina\PSM2\_CODING\DATASET\sentiment_analysis_v2.csv"
sentiment_df.to_csv(output_path, index=False)

# Print the first few rows of the updated DataFrame to check the results
print(sentiment_df.head())

# Assuming df_test and targets are available for the accuracy check
df_test = df  # Assuming you want to use the same dataframe for predictions
predictions = df_test['tweet_content'].map(lambda x: text_sentiment_vader(x))
targets = df_test['label']  # Assuming there is a 'label' column in your dataframe

# Calculate and print the accuracy
accuracy = accuracy_score(predictions.values, targets)
#print(f"Accuracy: {accuracy:.2f}")

# Calculate and print the classification report
report = classification_report(targets, predictions, target_names=["Negative", "Positive"])
print("Report:\n", report)

print(f"Accuracy: {accuracy * 100:.2f}%")


                                               Tweet  Negative  Positive  \
0  lack understand small signific part caus anxie...  1.000000  0.000000   
1  told parent depress hard get gen x peopl under...  1.000000  0.000000   
2  depress someth dont speak even go also doubl e...  0.489189  0.510811   
3  made tortilla fill pbj depress cure olivia dep...  1.000000  0.000000   
4  gon na need depress med soon rainout spin equi...  1.000000  0.000000   

  Overall Sentiment  
0          Negative  
1          Negative  
2          Positive  
3          Negative  
4          Negative  
Report:
               precision    recall  f1-score   support

    Negative       0.46      0.67      0.54      4056
    Positive       0.39      0.21      0.27      4077

    accuracy                           0.44      8133
   macro avg       0.42      0.44      0.41      8133
weighted avg       0.42      0.44      0.41      8133

Accuracy: 43.87%
