In [6]:
import pandas as pd

# Locations of the two sentiment result files consumed by this notebook.
roberta_csv_path = 'final_output_sentiment_RoBERTa_Vader (2).csv'
vader_csv_path = 'sentiment_analysis_results_with_title.csv'

# Load each file into its own DataFrame.
roberta_df = pd.read_csv(roberta_csv_path)
vader_df = pd.read_csv(vader_csv_path)

# Preview the leading rows of both frames side by side to compare their layouts.
(roberta_df.head(), vader_df.head())


(     pid     type   code                                              title  \
 0  p0001  popular  mh_wp    Apparently, finding interpreters is impossible.   
 1  p0002  popular  mh_wp                                  I hate being deaf   
 2  p0003  popular  mh_wp                          rant about this community   
 3  p0004  popular  mh_wp             Mental Health Issues In Deaf Community   
 4  p0005  popular  mh_wp  Relationship advice: deaf husband/hearing wife...   
 
    score subreddit                                           comments  \
 0     88      deaf  This is absolutely unacceptable.  I am an inte...   
 1     82      deaf  The DMV? I would not have guessed that would b...   
 2     62      deaf  I'm so sorry you received so much negative ene...   
 3     63      deaf  I don't remember where I saw this so I can't c...   
 4     57      deaf  I’m deaf, my husband is hearing. I’m glad you’...   
 
                                     cleaned_comments vader_sentiment  \

In [8]:
# Both inputs must expose the shared 'pid' key before they can be joined.
pid_in_both_frames = all('pid' in frame.columns for frame in (roberta_df, vader_df))
if not pid_in_both_frames:
    raise ValueError("Both datasets must contain a common identifier column (e.g., 'pid').")

# Join on 'pid' with the VADER frame on the left. Columns present in both
# frames are disambiguated with the '_vader' / '_roberta' suffixes. No `how`
# is passed, so pandas' default join type applies.
combined_df = vader_df.merge(roberta_df, on='pid', suffixes=('_vader', '_roberta'))

In [9]:
# Define helper functions
def normalize_roberta_scores(row):
    """Return the net RoBERTa polarity for one row.

    The result is the value stored under ``roBERTa_sentiment_positive`` minus
    the value stored under ``roBERTa_sentiment_negative``. No other field of
    ``row`` is read.
    """
    positive_part = row['roBERTa_sentiment_positive']
    negative_part = row['roBERTa_sentiment_negative']
    net_polarity = positive_part - negative_part
    return net_polarity

def combine_sentiments(vader_score, roberta_score, weight_vader=0.4, weight_roberta=0.6):
    """Blend a VADER score and a RoBERTa score into a single weighted sum.

    Each score is multiplied by its own weight and the two products are added.
    The weights are used exactly as given (they are not renormalised), so the
    result is a true weighted average only when they sum to 1, as the
    defaults do.
    """
    vader_contribution = vader_score * weight_vader
    roberta_contribution = roberta_score * weight_roberta
    return vader_contribution + roberta_contribution

def map_to_overall_sentiment(score, threshold=0.05):
    """Map a combined score to a sentiment category.

    Args:
        score: Numeric combined sentiment score.
        threshold: Non-negative half-width of the neutral band around zero.
            Defaults to 0.05, the cut-off that was previously hard-coded.

    Returns:
        'positive' if ``score > threshold``, 'negative' if
        ``score < -threshold``, otherwise 'neutral'. Scores exactly on a
        boundary are 'neutral'. NaN compares false on both sides, so a
        missing score is also labelled 'neutral'.

    Raises:
        ValueError: If ``threshold`` is negative, which would make the
            positive and negative ranges overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

# Net RoBERTa polarity (positive minus negative). The helper only indexes two
# columns and subtracts them, so passing the whole frame computes every row in
# one column-wise operation instead of a row-by-row apply.
combined_df['roberta_weighted_score'] = normalize_roberta_scores(combined_df)

# Blend the VADER title score with the RoBERTa score using the helper's
# default weights; operating on whole columns avoids a per-row lambda.
combined_df['overall_score'] = combine_sentiments(
    combined_df['vader_compound_title'], combined_df['roberta_weighted_score']
)

# Label each blended score as positive / negative / neutral.
combined_df['overall_sentiment'] = combined_df['overall_score'].apply(map_to_overall_sentiment)

# Persist the merged frame, including the three derived columns, to CSV
# without writing the index.
output_file = 'weighted_sentiment_results.csv'
combined_df.to_csv(output_file, index=False)

print(f"Weighted sentiment analysis saved to {output_file}")

Weighted sentiment analysis saved to weighted_sentiment_results.csv


In [11]:
# Display the merged frame (including the derived score and sentiment columns)
# as this cell's output.
combined_df

Unnamed: 0,pid,type_vader,code_vader,title_vader,score_vader,subreddit_vader,comments_vader,cleaned_comments_vader,vader_sentiment_vader,vader_pos_vader,...,vader_neu_roberta,vader_neg_roberta,vader_compound_roberta,roBERTa_sentiment_positive,roBERTa_sentiment_negative,roBERTa_sentiment_neutral,preponderant_sentiment,roberta_weighted_score,overall_score,overall_sentiment
0,p0001,popular,mh_wp,"Apparently, finding interpreters is impossible.",88,deaf,This is absolutely unacceptable. I am an inte...,absolutely unacceptable interpreter worked men...,negative,0.149,...,0.687,0.165,-0.3989,0.007028,0.921729,0.071243,negative,-0.914701,-0.548821,negative
1,p0002,popular,mh_wp,I hate being deaf,82,deaf,The DMV? I would not have guessed that would b...,dmv would guessed would deaf friendly workplace,positive,0.348,...,0.652,0.000,0.4939,0.059166,0.601448,0.339386,negative,-0.542282,-0.554129,negative
2,p0003,popular,mh_wp,rant about this community,62,deaf,I'm so sorry you received so much negative ene...,im sorry received much negative energy missed ...,positive,0.209,...,0.706,0.085,0.9956,0.362453,0.187739,0.449808,neutral,0.174714,-0.031171,neutral
3,p0004,popular,mh_wp,Mental Health Issues In Deaf Community,63,deaf,I don't remember where I saw this so I can't c...,dont remember saw cant cite researching someth...,positive,0.193,...,0.641,0.166,0.8622,0.069262,0.667851,0.262886,negative,-0.598589,-0.359153,negative
4,p0005,popular,mh_wp,Relationship advice: deaf husband/hearing wife...,57,deaf,"I’m deaf, my husband is hearing. I’m glad you’...",im deaf husband hearing im glad youre proactiv...,positive,0.254,...,0.598,0.148,0.9301,0.333181,0.277495,0.389324,neutral,0.055686,0.202012,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,p0289,recent,mh_pa,Lack of seating,28,disability,Not that this is relevant or necessary informa...,relevant necessary information justify complai...,positive,0.139,...,0.802,0.059,0.4588,0.036293,0.613112,0.350595,negative,-0.576819,-0.473371,negative
248,p0291,recent,mh_pa,‘Disabled’,45,disability,Ayyy I'm also 19 with several disabilities.\n\...,ayyy im also several disability mom genxer com...,negative,0.197,...,0.594,0.209,-0.3701,0.217780,0.387029,0.395192,neutral,-0.169249,-0.101549,negative
249,p0293,recent,mh_pa,Looking for advice/ opinions on accessible mar...,2,disability,Most adaptive sports organizations began with ...,adaptive sport organization began someone like...,positive,0.315,...,0.685,0.000,0.9805,0.822026,0.004189,0.173784,positive,0.817837,0.490702,positive
250,p0294,recent,mh_pa,Survival Jobs are not disability friendly.,192,disability,I feel this. I can’t stand for basically any p...,feel cant stand basically period time without ...,negative,0.113,...,0.684,0.202,-0.8651,0.016568,0.869631,0.113801,negative,-0.853063,-0.314278,negative
