<a href="https://colab.research.google.com/github/SidAS-ai/AI-Powered-Behavioral-Analysis-for-Suicide-Prevention/blob/main/sentiment_analysis_of_reddit_and_twitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install pandas numpy tweepy praw nltk vaderSentiment textblob scikit-learn folium plotly matplotlib seaborn spacy


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Installing collected packages: vaderSentiment, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 vaderSentiment-3.3.2


In [7]:
import pandas as pd
import numpy as np
import re
import json
import datetime
import tweepy
import praw
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import folium
from folium.plugins import HeatMap
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from collections import Counter
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Load spaCy model for named entity recognition (location extraction)
nlp = spacy.load("en_core_web_sm")

# Define crisis-related keywords for filtering
CRISIS_KEYWORDS = [
    "depressed", "depression", "suicide", "suicidal", "kill myself",
    "end my life", "addiction", "overdose", "anxious", "anxiety",
    "overwhelmed", "hopeless", "helpless", "self harm", "cutting"
]

# Define high-risk phrases that indicate immediate concern
HIGH_RISK_PHRASES = [
    "want to die", "don't want to live", "kill myself", "end my life",
    "no reason to live", "better off dead", "going to end it",
    "suicide plan", "last day", "goodbye world", "final note"
]

class SocialMediaDataExtractor:
    def __init__(self, twitter_credentials=None, reddit_credentials=None):
        self.twitter_api = self.setup_twitter_api(twitter_credentials) if twitter_credentials else None
        self.reddit_api = self.setup_reddit_api(reddit_credentials) if reddit_credentials else None

    def setup_twitter_api(self, credentials):
        """Set up Twitter/X API authentication"""
        auth = tweepy.OAuthHandler(credentials['api_key'], credentials['api_secret'])
        auth.set_access_token(credentials['access_token'], credentials['access_token_secret'])
        api = tweepy.API(auth, wait_on_rate_limit=True)
        return api

    def setup_reddit_api(self, credentials):
        """Set up Reddit API authentication"""
        reddit = praw.Reddit(
            client_id=credentials['client_id'],
            client_secret=credentials['client_secret'],
            user_agent=credentials['user_agent']
        )
        return reddit

    def extract_twitter_data(self, keywords, count=100):
        """Extract tweets based on keywords"""
        all_tweets = []

        for keyword in keywords:
            try:
                tweets = self.twitter_api.search_tweets(q=keyword, count=count, tweet_mode="extended", lang="en")
                for tweet in tweets:
                    tweet_data = {
                        'id': tweet.id_str,
                        'timestamp': tweet.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                        'content': tweet.full_text,
                        'likes': tweet.favorite_count,
                        'retweets': tweet.retweet_count,
                        'user_location': tweet.user.location if tweet.user.location else None,
                        'coordinates': tweet.coordinates['coordinates'] if tweet.coordinates else None,
                        'place': tweet.place.full_name if tweet.place else None,
                        'source': 'Twitter'
                    }
                    all_tweets.append(tweet_data)
            except Exception as e:
                logging.error(f"Error extracting tweets for keyword '{keyword}': {e}")

        return all_tweets

    def extract_reddit_data(self, keywords, limit=100):
        """Extract Reddit posts based on keywords"""
        all_posts = []

        subreddits = ['depression', 'SuicideWatch', 'addiction', 'mentalhealth', 'Anxiety']
        for subreddit in subreddits:
            try:
                for submission in self.reddit_api.subreddit(subreddit).hot(limit=limit):
                    # Check if any keyword is in the title or selftext
                    if any(keyword.lower() in submission.title.lower() or
                           (submission.selftext and keyword.lower() in submission.selftext.lower())
                           for keyword in keywords):
                        post_data = {
                            'id': submission.id,
                            'timestamp': datetime.datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                            'title': submission.title,
                            'content': submission.selftext,
                            'score': submission.score,
                            'comments': submission.num_comments,
                            'subreddit': submission.subreddit.display_name,
                            'source': 'Reddit'
                        }
                        all_posts.append(post_data)
            except Exception as e:
                logging.error(f"Error extracting posts from r/{subreddit}: {e}")

        return all_posts

class TextPreprocessor:
    @staticmethod
    def preprocess_text(text):
        """Clean and preprocess text data"""
        if not text:
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)

        # Remove special characters and numbers
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)

        # Remove emojis
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        # Lemmatize
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Join tokens back into string
        cleaned_text = ' '.join(tokens)

        return cleaned_text

class DataProcessor:
    @staticmethod
    def create_dataset(twitter_data, reddit_data):
        """Combine and standardize data from different sources"""
        combined_data = []

        # Process Twitter data
        for tweet in twitter_data:
            item = {
                'id': tweet['id'],
                'timestamp': tweet['timestamp'],
                'content': tweet['content'],
                'cleaned_content': TextPreprocessor.preprocess_text(tweet['content']),
                'engagement': tweet['likes'] + tweet['retweets'],
                'location': tweet['user_location'],
                'coordinates': tweet['coordinates'],
                'place': tweet['place'],
                'source': tweet['source']
            }
            combined_data.append(item)

        # Process Reddit data
        for post in reddit_data:
            content = post['title'] + " " + (post['content'] if post['content'] else "")
            item = {
                'id': post['id'],
                'timestamp': post['timestamp'],
                'content': content,
                'cleaned_content': TextPreprocessor.preprocess_text(content),
                'engagement': post['score'] + post['comments'],
                'location': None,  # Reddit doesn't provide location data
                'coordinates': None,
                'place': None,
                'source': post['source'] + " (r/" + post['subreddit'] + ")"
            }
            combined_data.append(item)

        # Convert to DataFrame
        df = pd.DataFrame(combined_data)

        return df

class SentimentAnalyzer:
    @staticmethod
    def analyze_sentiment(df):
        """Apply sentiment analysis to the dataset"""
        # Initialize sentiment analyzers
        vader = SentimentIntensityAnalyzer()

        # Create empty columns for sentiment scores
        df['vader_neg'] = 0.0
        df['vader_neu'] = 0.0
        df['vader_pos'] = 0.0
        df['vader_compound'] = 0.0
        df['textblob_polarity'] = 0.0
        df['textblob_subjectivity'] = 0.0

        # Apply sentiment analysis to each row
        for idx, row in df.iterrows():
            text = row['content']

            # VADER sentiment
            vader_scores = vader.polarity_scores(text)
            df.at[idx, 'vader_neg'] = vader_scores['neg']
            df.at[idx, 'vader_neu'] = vader_scores['neu']
            df.at[idx, 'vader_pos'] = vader_scores['pos']
            df.at[idx, 'vader_compound'] = vader_scores['compound']

            # TextBlob sentiment
            blob = TextBlob(text)
            df.at[idx, 'textblob_polarity'] = blob.sentiment.polarity
            df.at[idx, 'textblob_subjectivity'] = blob.sentiment.subjectivity

        # Determine sentiment category based on compound score
        df['sentiment'] = df['vader_compound'].apply(
            lambda x: 'Positive' if x >= 0.05 else ('Negative' if x <= -0.05 else 'Neutral')
        )

        return df

class RiskClassifier:
    @staticmethod
    def classify_risk_level(df):
        """Classify posts based on risk level"""
        # Function to check if any high-risk phrase is in the text
        def contains_high_risk(text):
            if not isinstance(text, str):
                return False
            text = text.lower()
            return any(phrase in text for phrase in HIGH_RISK_PHRASES)

        # Function to calculate TF-IDF similarity with crisis terms
        def calculate_crisis_similarity(text):
            if not isinstance(text, str) or text.strip() == "":
                return 0

            # Create a small corpus with crisis terms and the text
            corpus = [" ".join(CRISIS_KEYWORDS), text]

            # Calculate TF-IDF
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(corpus)

            # Calculate cosine similarity
            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
            return similarity

        # Apply risk classification
        df['contains_high_risk'] = df['content'].apply(contains_high_risk)
        df['crisis_similarity'] = df['cleaned_content'].apply(calculate_crisis_similarity)

        # Assign risk levels
        def assign_risk_level(row):
            if row['contains_high_risk']:
                return 'High-Risk'
            elif row['crisis_similarity'] > 0.3:
                return 'Moderate Concern'
            else:
                return 'Low Concern'

        df['risk_level'] = df.apply(assign_risk_level, axis=1)

        return df

class Visualizer:
    @staticmethod
    def visualize_sentiment_risk(df):
        """Create visualizations for sentiment and risk categories"""
        # Create a crosstab of sentiment and risk level
        crosstab = pd.crosstab(df['sentiment'], df['risk_level'])

        # Plotting
        plt.figure(figsize=(12, 8))

        # Plot 1: Sentiment Distribution
        plt.subplot(2, 2, 1)
        sns.countplot(data=df, x='sentiment', palette='viridis')
        plt.title('Sentiment Distribution')
        plt.xticks(rotation=45)

        # Plot 2: Risk Level Distribution
        plt.subplot(2, 2, 2)
        sns.countplot(data=df, x='risk_level', palette='viridis')
        plt.title('Risk Level Distribution')
        plt.xticks(rotation=45)

        # Plot 3: Sentiment vs Risk Level
        plt.subplot(2, 1, 2)
        sns.heatmap(crosstab, annot=True, cmap='YlGnBu', fmt='d')
        plt.title('Sentiment vs Risk Level')

        plt.tight_layout()
        plt.savefig('sentiment_risk_analysis.png')
        plt.close()

        return crosstab

class Geolocator:
    @staticmethod
    def extract_locations(df):
        """Extract location information from text and metadata"""
        def extract_location_from_text(text):
            if not isinstance(text, str):
                return None

            doc = nlp(text)
            locations = []

            # Extract locations from named entities
            for ent in doc.ents:
                if ent.label_ in ['GPE', 'LOC']:
                    locations.append(ent.text)

            return locations[0] if locations else None

        # Extract locations from text content
        df['extracted_location'] = df['content'].apply(extract_location_from_text)

        # Combine available location information
        df['final_location'] = df['place'].fillna(df['location']).fillna(df['extracted_location'])

        return df

    @staticmethod
    def geocode_locations(df):
        """Convert location names to coordinates using a mock geocoding service"""
        # This is a simplified mock geocoding function
        # In a real-world scenario, you would use a geocoding service like Google Maps API
        mock_geocoding = {
            'new york': (40.7128, -74.0060),
            'los angeles': (34.0522, -118.2437),
            'chicago': (41.8781, -87.6298),
            'houston': (29.7604, -95.3698),
            'phoenix': (33.4484, -112.0740),
            'philadelphia': (39.9526, -75.1652),
            'san antonio': (29.4241, -98.4936),
            'san diego': (32.7157, -117.1611),
            'dallas': (32.7767, -96.7970),
            'san jose': (37.3382, -121.8863),
            'austin': (30.2672, -97.7431),
            'jacksonville': (30.3322, -81.6557),
            'san francisco': (37.7749, -122.4194),
            'seattle': (47.6062, -122.3321),
            'boston': (42.3601, -71.0589)
        }

        def get_coordinates(location):
            if not isinstance(location, str):
                return None

            location = location.lower()

            # Check if the location or part of it is in our mock database
            for loc, coords in mock_geocoding.items():
                if loc in location:
                    return coords

            return None

        df['geocoded_coords'] = df['final_location'].apply(get_coordinates)

        return df

    @staticmethod
    def create_heatmap(df):
        """Create a heatmap of crisis-related posts"""
        # Filter rows with valid coordinates
        geo_df = df.dropna(subset=['geocoded_coords'])

        if len(geo_df) == 0:
            logging.warning("No valid coordinates found for mapping.")
            return None

        # Extract coordinates
        coordinates = geo_df['geocoded_coords'].tolist()

        # Create a map centered on the US
        m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)

        # Add heatmap layer
        HeatMap(coordinates).add_to(m)

        # Add markers for high-risk posts
        high_risk_df = geo_df[geo_df['risk_level'] == 'High-Risk']
        for _, row in high_risk_df.iterrows():
            folium.Marker(
                location=row['geocoded_coords'],
                popup=f"Risk Level: {row['risk_level']}<br>Sentiment: {row['sentiment']}",
                icon=folium.Icon(color='red', icon='info-sign')
            ).add_to(m)

        # Save the map
        m.save('crisis_heatmap.html')

        return m

    @staticmethod
    def top_crisis_locations(df):
        """Identify top locations with highest crisis discussions"""
        # Count posts by location
        location_counts = df.dropna(subset=['final_location'])['final_location'].value_counts().head(5)

        # Create a bar chart
        plt.figure(figsize=(10, 6))
        location_counts.plot(kind='bar', color='teal')
        plt.title('Top 5 Locations with Highest Crisis Discussions')
        plt.xlabel('Location')
        plt.ylabel('Number of Posts')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('top_crisis_locations.png')
        plt.close()

        return location_counts

class CrisisAnalysisPipeline:
    def __init__(self, twitter_credentials=None, reddit_credentials=None, use_sample_data=True):
        self.data_extractor = SocialMediaDataExtractor(twitter_credentials, reddit_credentials)
        self.use_sample_data = use_sample_data

    def run_pipeline(self):
        """Run the complete social media crisis analysis pipeline"""

        # Step 1: Data Collection
        logging.info("Step 1: Collecting social media data...")

        if self.use_sample_data:
            # Use sample data for demonstration
            logging.info("Using sample data for demonstration...")

            # Create sample Twitter data
            sample_twitter_data = [
                {
                    'id': '1234567890',
                    'timestamp': '2025-03-15 10:30:45',
                    'content': 'I feel so depressed lately. Nothing seems to help. #mentalhealth',
                    'likes': 5,
                    'retweets': 2,
                    'user_location': 'New York, NY',
                    'coordinates': None,
                    'place': 'New York',
                    'source': 'Twitter'
                },
                {
                    'id': '0987654321',
                    'timestamp': '2025-03-15 09:15:30',
                    'content': 'The anxiety is overwhelming. I don\'t know how much longer I can take this.',
                    'likes': 10,
                    'retweets': 3,
                    'user_location': 'Los Angeles, CA',
                    'coordinates': None,
                    'place': 'Los Angeles',
                    'source': 'Twitter'
                },
                {
                    'id': '5678901234',
                    'timestamp': '2025-03-15 11:45:22',
                    'content': 'I want to die. Nobody would even notice if I was gone. #suicidal',
                    'likes': 3,
                    'retweets': 1,
                    'user_location': 'Chicago, IL',
                    'coordinates': None,
                    'place': 'Chicago',
                    'source': 'Twitter'
                }
            ]

            # Create sample Reddit data
            sample_reddit_data = [
                {
                    'id': 'abc123',
                    'timestamp': '2025-03-15 14:20:15',
                    'title': 'Need help with addiction',
                    'content': 'I\'ve been struggling with substance abuse for years and I don\'t know where to turn.',
                    'score': 25,
                    'comments': 12,
                    'subreddit': 'addiction',
                    'source': 'Reddit'
                },
                {
                    'id': 'def456',
                    'timestamp': '2025-03-15 16:05:33',
                    'title': 'Lost all hope',
                    'content': 'I don\'t see any reason to keep going. I\'ve tried everything and nothing works.',
                    'score': 15,
                    'comments': 8,
                    'subreddit': 'depression',
                    'source': 'Reddit'
                },
                {
                    'id': 'ghi789',
                    'timestamp': '2025-03-15 17:30:10',
                    'title': 'Overwhelmed in Boston',
                    'content': 'Living in Boston and feeling completely overwhelmed with life. Can anyone in the area recommend resources?',
                    'score': 20,
                    'comments': 15,
                    'subreddit': 'mentalhealth',
                    'source': 'Reddit'
                }
            ]
        else:
            # Use real API connections
            if self.data_extractor.twitter_api:
                sample_twitter_data = self.data_extractor.extract_twitter_data(CRISIS_KEYWORDS)
            else:
                sample_twitter_data = []
                logging.warning("No Twitter credentials provided. Skipping Twitter data collection.")

            if self.data_extractor.reddit_api:
                sample_reddit_data = self.data_extractor.extract_reddit_data(CRISIS_KEYWORDS)
            else:
                sample_reddit_data = []
                logging.warning("No Reddit credentials provided. Skipping Reddit data collection.")

        # Step 2: Create and preprocess dataset
        logging.info("Step 2: Creating and preprocessing dataset...")
        df = DataProcessor.create_dataset(sample_twitter_data, sample_reddit_data)

        # Save the clean dataset
        df.to_csv('cleaned_crisis_data.csv', index=False)
        logging.info(f"Cleaned dataset saved with {len(df)} entries.")

        # Step 3: Sentiment Analysis
        logging.info("Step 3: Applying sentiment analysis...")
        df = SentimentAnalyzer.analyze_sentiment(df)

        # Step 4: Risk Classification
        logging.info("Step 4: Classifying risk levels...")
        df = RiskClassifier.classify_risk_level(df)

        # Step 5: Visualize sentiment and risk
        logging.info("Step 5: Creating sentiment and risk visualizations...")
        sentiment_risk_table = Visualizer.visualize_sentiment_risk(df)
        logging.info("Sentiment and risk visualizations created.")

        # Step 6: Extract and geocode locations
        logging.info("Step 6: Extracting and geocoding locations...")
        df = Geolocator.extract_locations(df)
        df = Geolocator.geocode_locations(df)

        # Step 7: Create heatmap
        logging.info("Step 7: Creating crisis heatmap...")
        heatmap = Geolocator.create_heatmap(df)
        if heatmap:
            logging.info("Crisis heatmap created and saved as 'crisis_heatmap.html'.")

        # Step 8: Identify top crisis locations
        logging.info("Step 8: Identifying top crisis locations...")
        top_locations = Geolocator.top_crisis_locations(df)
        logging.info("Top crisis locations identified and visualization saved.")

        # Return the processed dataframe
        return df

# Example usage
if __name__ == "__main__":
    # Run the pipeline with sample data
    pipeline = CrisisAnalysisPipeline(use_sample_data=True)
    crisis_data = pipeline.run_pipeline()
    logging.info("Crisis analysis pipeline completed successfully.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='sentiment', palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='risk_level', palette='viridis')


In [None]:
import