In [1]:
import pandas as pd

# Load the provided dataset
data  = pd.read_csv('combined_all_data_no_duplicates.csv')

# Display the first few rows to understand its structure
data.head()

Unnamed: 0,pid,type,code,title,score,subreddit,comments
0,p0001,popular,mh_wp,"Apparently, finding interpreters is impossible.",88,deaf,This is absolutely unacceptable. I am an inte...
1,p0002,popular,mh_wp,I hate being deaf,82,deaf,The DMV? I would not have guessed that would b...
2,p0003,popular,mh_wp,rant about this community,62,deaf,I'm so sorry you received so much negative ene...
3,p0004,popular,mh_wp,Mental Health Issues In Deaf Community,63,deaf,I don't remember where I saw this so I can't c...
4,p0005,popular,mh_wp,Relationship advice: deaf husband/hearing wife...,57,deaf,"I’m deaf, my husband is hearing. I’m glad you’..."


In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk


nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

#preprocess the text
def preprocess_text(text):
    # Remove non-alphabetical characters, convert to lowercase, tokenize, remove stopwords, and lemmatize
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    text = text.lower()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    
    return ' '.join(words)

# Apply the preprocessing to 'comments'
data['cleaned_comments'] = data['comments'].apply(preprocess_text)
data[['comments', 'cleaned_comments']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nitya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nitya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,comments,cleaned_comments
0,This is absolutely unacceptable. I am an inte...,absolutely unacceptable interpreter worked men...
1,The DMV? I would not have guessed that would b...,dmv would guessed would deaf friendly workplace
2,I'm so sorry you received so much negative ene...,im sorry received much negative energy missed ...
3,I don't remember where I saw this so I can't c...,dont remember saw cant cite researching someth...
4,"I’m deaf, my husband is hearing. I’m glad you’...",im deaf husband hearing im glad youre proactiv...


In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader(text):
    scores = analyzer.polarity_scores(text)
    sentiment_label = 'neutral'
    if scores['compound'] >= 0.05:
        sentiment_label = 'positive'
    elif scores['compound'] <= -0.05:
        sentiment_label = 'negative'
    return scores, sentiment_label

data['vader_scores'], data['vader_sentiment'] = zip(*data['cleaned_comments'].apply(analyze_sentiment_vader))

# Split out individual sentiment scores
data['vader_pos'] = data['vader_scores'].apply(lambda x: x['pos'])
data['vader_neu'] = data['vader_scores'].apply(lambda x: x['neu'])
data['vader_neg'] = data['vader_scores'].apply(lambda x: x['neg'])
data['vader_compound'] = data['vader_scores'].apply(lambda x: x['compound'])

data = data.drop(columns=['vader_scores'])
print(data[['comments', 'cleaned_comments', 'vader_sentiment', 'vader_pos', 'vader_neu', 'vader_neg', 'vader_compound']].head())

# Save the results to a CSV file 
data.to_csv('social-media_data/output_sentiment_analysis/vader_sentiment_analysis_results.csv', index=False)


                                            comments  \
0  This is absolutely unacceptable.  I am an inte...   
1  The DMV? I would not have guessed that would b...   
2  I'm so sorry you received so much negative ene...   
3  I don't remember where I saw this so I can't c...   
4  I’m deaf, my husband is hearing. I’m glad you’...   

                                    cleaned_comments vader_sentiment  \
0  absolutely unacceptable interpreter worked men...        negative   
1    dmv would guessed would deaf friendly workplace        positive   
2  im sorry received much negative energy missed ...        positive   
3  dont remember saw cant cite researching someth...        positive   
4  im deaf husband hearing im glad youre proactiv...        positive   

   vader_pos  vader_neu  vader_neg  vader_compound  
0      0.149      0.687      0.165         -0.3989  
1      0.348      0.652      0.000          0.4939  
2      0.209      0.706      0.085          0.9956  
3      0.193      

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

data['sentiment_label'] = data['vader_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

X = data['cleaned_comments']
y = data['sentiment_label']
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Random Forest Classifier with balanced class weight 
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7647058823529411
[[ 2  0  8]
 [ 0  0  1]
 [ 3  0 37]]
              precision    recall  f1-score   support

           0       0.40      0.20      0.27        10
           1       0.00      0.00      0.00         1
           2       0.80      0.93      0.86        40

    accuracy                           0.76        51
   macro avg       0.40      0.38      0.38        51
weighted avg       0.71      0.76      0.73        51



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
