In [11]:
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
import os
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from wordcloud import WordCloud
# Fetch the NLTK resources used by word_tokenize, stopwords and pos_tag.
# Both the legacy and the newer resource names are requested:
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the names looked up by
# newer NLTK releases, while 'punkt' and 'averaged_perceptron_tagger' are kept
# for older installs. quiet=True is applied to every call so the cell produces
# no download log.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource, quiet=True)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [24]:


class FOMOAnalyzer:
    """Keyword, sentiment, engagement and posting-volume analysis of a posts CSV.

    The input CSV is read without a header row and its columns are named
    ``timestamp``, ``source``, ``content``, ``author`` and ``engagement``.
    Figures and a plain-text report are written to
    ``<output_dir>/<report_prefix>/``.

    Attributes:
        sia: VADER ``SentimentIntensityAnalyzer`` used for scoring.
        blacklist: Extra words dropped by ``clean_text`` on top of the NLTK
            English stop words.
        df: The loaded posts; analysis methods add the columns ``sentiment``,
            ``detailed_sentiment``, ``quarter`` and ``year``.
        report_data: Dict of results accumulated by the analysis methods and
            consumed by ``save_report_data``.
    """

    # Penn Treebank tags kept by clean_text: nouns, adjectives and verbs.
    _CONTENT_POS_TAGS = frozenset({
        'NN', 'NNS', 'NNP', 'NNPS',
        'JJ', 'JJR', 'JJS',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    })

    # Compound-score cut-offs used for the Positive / Negative labels.
    _POSITIVE_THRESHOLD = 0.05
    _NEGATIVE_THRESHOLD = -0.05

    def __init__(self, input_file, output_dir=None, report_prefix=None):
        """Load the CSV, create the output directories and seed the report.

        Args:
            input_file: Path of the header-less CSV to analyse.
            output_dir: Root output directory; defaults to ``'analysis_output'``.
            report_prefix: Prefix for every generated file and name of the
                sub-directory they are written to; defaults to ``'fomo_report'``.

        Raises:
            FileNotFoundError: If ``input_file`` does not exist.
            Exception: If the file cannot be parsed (original error chained).
        """
        self.sia = SentimentIntensityAnalyzer()

        # Define blacklist words
        self.blacklist = {
            'http', 'https', 'www', 'com', 'html', 'htm',
            'amp', 'rt', 'url', 'href', 'src', 'png', 'jpg',
            'jpeg', 'gif', 'pdf', 'xml', 'php', 'asp', 'js',
            'css', 'img', 'pic', 'download', 'click', 'link',
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
            'you', 'your', 'yours', 'yourself', 'yourselves',
            'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
            'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
            'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
            'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
            'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
            'even', 'also', 'going', 'things'
        }

        # Set input and output parameters
        self.input_file = input_file
        self.output_dir = output_dir or 'analysis_output'
        self.report_prefix = report_prefix or 'fomo_report'
        # Use the defaulted prefix: the raw argument may be None, which would
        # otherwise produce a directory literally named "None".
        self.img_dir = f'{self.output_dir}/{self.report_prefix}'

        # Create output directories
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.img_dir, exist_ok=True)

        # Read and validate input data
        try:
            self.df = pd.read_csv(
                self.input_file,
                names=['timestamp', 'source', 'content', 'author', 'engagement'],
            )
            self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])
        except FileNotFoundError:
            raise FileNotFoundError(f"Input file not found: {self.input_file}")
        except Exception as e:
            # Chain the original error so the real traceback is not lost.
            raise Exception(f"Error reading input file: {str(e)}") from e

        # min()/max() return NaT when there are no valid timestamps, and
        # NaT.strftime raises, so fall back to a placeholder in that case.
        first_ts = self.df['timestamp'].min()
        last_ts = self.df['timestamp'].max()
        if pd.isna(first_ts) or pd.isna(last_ts):
            date_range = 'N/A'
        else:
            date_range = f"{first_ts.strftime('%Y-%m-%d')} to {last_ts.strftime('%Y-%m-%d')}"

        # Initialize report data
        self.report_data = {
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'input_file': self.input_file,
            'total_posts': len(self.df),
            'date_range': date_range,
            'engagement_stats': {},
            'sentiment_analysis': {},
            'keyword_frequency': {},
        }

    def clean_text(self, text):
        """Reduce ``text`` to a space-joined string of content words.

        The text is lower-cased, stripped of URLs and of every character that
        is not an ASCII letter or whitespace, tokenised and POS-tagged with
        NLTK. A token is kept when it is longer than two characters, is not an
        NLTK English stop word or blacklisted word, and is tagged as a noun,
        adjective or verb.

        Args:
            text: Any value; it is converted with ``str()`` first.

        Returns:
            The surviving tokens joined by single spaces (possibly ``''``).
        """
        # Convert to lowercase
        text = str(text).lower()

        # Remove URLs (the dot after "www" is escaped so it matches literally)
        text = re.sub(r'http\S+|www\.\S+', '', text)

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)

        # Tokenize using NLTK
        tokens = word_tokenize(text)

        # Loading the NLTK stop-word list on every call is wasteful, so the
        # merged set is cached. The cache is keyed on the blacklist contents so
        # later edits to self.blacklist are still honoured.
        blacklist_key = frozenset(self.blacklist)
        cached = getattr(self, '_stop_words_cache', None)
        if cached is None or cached[0] != blacklist_key:
            cached = (blacklist_key, set(stopwords.words('english')) | blacklist_key)
            self._stop_words_cache = cached
        all_stop_words = cached[1]

        # After the regex above every token is purely alphabetic and non-empty,
        # so only the length, stop-word and part-of-speech filters are needed.
        filtered_words = [
            word
            for word, pos in pos_tag(tokens)
            if len(word) > 2
            and word not in all_stop_words
            and pos in self._CONTENT_POS_TAGS
        ]

        return ' '.join(filtered_words)

    def analyze_filtered_content(self):
        """Count cleaned words, store the top 20 and save a word-cloud image.

        Writes ``report_data['keyword_frequency']``, saves
        ``<prefix>_wordcloud.png`` (skipped when no word survives filtering,
        because WordCloud cannot be built from an empty frequency table) and
        prints the filtering statistics.
        """
        # Get all words after cleaning
        all_words = []
        for content in self.df['content']:
            all_words.extend(self.clean_text(content).split())

        # Count word frequencies
        word_freq = Counter(all_words)

        # Store top words in report
        self.report_data['keyword_frequency'] = dict(word_freq.most_common(20))

        # Create wordcloud
        if word_freq:
            wordcloud = WordCloud(
                width=1600,
                height=800,
                background_color='white',
                max_words=40,
                random_state=42,
                prefer_horizontal=0.7
            ).generate_from_frequencies(dict(word_freq.most_common(40)))

            plt.figure(figsize=(20, 10))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud of Top 40 Keywords', fontsize=20, pad=20)
            plt.tight_layout(pad=0)
            plt.savefig(f'{self.img_dir}/{self.report_prefix}_wordcloud.png', dpi=300, bbox_inches='tight')
            plt.close()
        else:
            print("\nNo words left after filtering; skipping word cloud.")

        # Print filtering statistics
        print("\nFiltering Statistics:")
        print(f"Total words after filtering: {len(all_words)}")
        print(f"Unique words after filtering: {len(word_freq)}")
        print("\nTop 20 words after filtering:")
        for word, count in word_freq.most_common(20):
            print(f"{word}: {count}")

    def analyze_sentiment(self, text):
        """Score ``text`` with VADER and label it by its compound score.

        Returns:
            ``(scores, label)`` where ``scores`` is the dict returned by
            ``self.sia.polarity_scores`` and ``label`` is ``"Positive"``
            (compound >= 0.05), ``"Negative"`` (compound <= -0.05) or
            ``"Neutral"``.
        """
        sentiment_scores = self.sia.polarity_scores(text)
        compound = sentiment_scores['compound']

        # Classify sentiment based on compound score
        if compound >= self._POSITIVE_THRESHOLD:
            sentiment_label = "Positive"
        elif compound <= self._NEGATIVE_THRESHOLD:
            sentiment_label = "Negative"
        else:
            sentiment_label = "Neutral"

        return sentiment_scores, sentiment_label

    def perform_sentiment_analysis(self, use_raw_text=False):
        """Score every post, store the results and draw the sentiment figures.

        Adds the columns ``sentiment`` (compound score) and
        ``detailed_sentiment`` (per-post dict) to ``self.df`` and fills
        ``report_data['sentiment_analysis']``.

        Args:
            use_raw_text: When False (default, original behaviour) the text is
                passed through ``clean_text`` before scoring. When True the
                original post text is scored instead.
                NOTE(review): ``clean_text`` removes stop words (which include
                negations such as "not"), punctuation and capitalisation, all
                of which VADER is documented to use; scoring the raw text is
                probably more faithful, but it changes the reported numbers, so
                it is opt-in. Confirm which is intended.
        """
        sentiments = []
        detailed_sentiments = []
        label_counts = Counter()

        for content in self.df['content']:
            text = str(content) if use_raw_text else self.clean_text(content)
            sentiment_scores, sentiment_label = self.analyze_sentiment(text)

            sentiments.append(sentiment_scores['compound'])
            detailed_sentiments.append({
                'pos': sentiment_scores['pos'],
                'neg': sentiment_scores['neg'],
                'neu': sentiment_scores['neu'],
                'compound': sentiment_scores['compound'],
                'label': sentiment_label
            })
            label_counts[sentiment_label] += 1

        pos_count = label_counts['Positive']
        neg_count = label_counts['Negative']
        neu_count = label_counts['Neutral']
        total = len(sentiments)

        self.df['sentiment'] = sentiments
        self.df['detailed_sentiment'] = detailed_sentiments

        # Store sentiment analysis results. With no posts the shares are 0.0
        # and the mean is NaN instead of raising ZeroDivisionError.
        self.report_data['sentiment_analysis'] = {
            'mean_compound': float(np.mean(sentiments)) if total else float('nan'),
            'positive_posts': pos_count,
            'negative_posts': neg_count,
            'neutral_posts': neu_count,
            'sentiment_distribution': {
                'positive': pos_count / total if total else 0.0,
                'negative': neg_count / total if total else 0.0,
                'neutral': neu_count / total if total else 0.0
            }
        }

        # Create visualization (nothing to draw when there are no posts)
        if total:
            self.visualize_sentiment_analysis(sentiments, pos_count, neg_count, neu_count)

    def visualize_sentiment_analysis(self, sentiments, pos_count, neg_count, neu_count):
        """Save the histogram, pie chart and quarterly line chart of sentiment.

        Requires ``self.df['sentiment']`` to be populated already (done by
        ``perform_sentiment_analysis``) and adds a ``quarter`` column to
        ``self.df``.

        Args:
            sentiments: Compound scores, one per post.
            pos_count: Number of posts labelled Positive.
            neg_count: Number of posts labelled Negative.
            neu_count: Number of posts labelled Neutral.
        """
        # Compound score distribution
        plt.figure(figsize=(8, 6))
        sns.histplot(sentiments, bins=20, color='skyblue', edgecolor='black')
        plt.title('Distribution of Compound Sentiment Scores on Reddit')
        plt.xlabel('Compound Score')
        plt.ylabel('Count')
        plt.savefig(f'{self.img_dir}/{self.report_prefix}_sentiment_dist.png')
        plt.close()

        # Sentiment categories pie chart
        plt.figure(figsize=(8, 6))
        plt.pie([pos_count, neu_count, neg_count],
                labels=['Positive', 'Neutral', 'Negative'],
                colors=['green', 'gray', 'red'],
                autopct='%1.1f%%')
        plt.title('Overall Sentiment Distribution')
        plt.savefig(f'{self.img_dir}/{self.report_prefix}_sentiment_pie.png')
        plt.close()

        # Quarterly sentiment percentages with line graph and yearly labels
        self.df['quarter'] = self.df['timestamp'].dt.to_period('Q')

        score = self.df['sentiment']
        is_positive = score >= self._POSITIVE_THRESHOLD
        is_negative = score <= self._NEGATIVE_THRESHOLD
        is_neutral = (score > self._NEGATIVE_THRESHOLD) & (score < self._POSITIVE_THRESHOLD)

        # Calculate counts first. A quarter with no posts in some class is
        # absent from that class's groupby result and becomes NaN when the
        # columns are aligned; fill with 0 so the line shows 0% instead of a gap.
        quarterly_counts = pd.DataFrame({
            'Positive': self.df[is_positive].groupby('quarter').size(),
            'Neutral': self.df[is_neutral].groupby('quarter').size(),
            'Negative': self.df[is_negative].groupby('quarter').size()
        }).fillna(0)

        # Convert to percentages
        quarterly_percentages = quarterly_counts.div(quarterly_counts.sum(axis=1), axis=0) * 100

        plt.figure(figsize=(12, 6))
        for column in quarterly_percentages.columns:
            plt.plot(range(len(quarterly_percentages)), quarterly_percentages[column],
                     marker='o', label=column)

        # Set x-ticks to show only years: one tick at the first quarter seen
        # for each year (dict insertion order follows the sorted quarter index).
        first_position_by_year = {}
        for position, quarter in enumerate(quarterly_percentages.index):
            first_position_by_year.setdefault(str(quarter).split('Q')[0], position)

        plt.xticks(list(first_position_by_year.values()),
                   list(first_position_by_year.keys()),
                   rotation=0)
        plt.title('Quarterly Sentiment Distribution (%)')
        plt.xlabel('Year')
        plt.ylabel('Percentage')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.ylim(0, 100)  # Set y-axis from 0 to 100%
        plt.tight_layout()
        plt.savefig(f'{self.img_dir}/{self.report_prefix}_sentiment_quarterly.png')
        plt.close()

    def analyze_quarterly_counts(self):
        """Count posts per calendar quarter, plot them and record the counts.

        Saves ``<prefix>_quarterly_counts.png`` (skipped when there are no
        posts), stores the counts in ``report_data['quarterly_counts']`` keyed
        by quarter period, and prints them.
        """
        # Group posts by quarter and count
        quarterly_counts = self.df.groupby(self.df['timestamp'].dt.to_period('Q')).size()

        # Create visualization
        if not quarterly_counts.empty:
            plt.figure(figsize=(12, 6))
            quarterly_counts.plot(kind='bar')

            # Customize the plot
            plt.title('Quarterly Post Count Distribution')
            plt.xlabel('Quarter')
            plt.ylabel('Number of Posts')
            plt.xticks(rotation=45)
            plt.grid(True, linestyle='--', alpha=0.7)

            # Add value labels on top of each bar
            for i, v in enumerate(quarterly_counts):
                plt.text(i, v, str(v), ha='center', va='bottom')

            plt.tight_layout()
            plt.savefig(f'{self.img_dir}/{self.report_prefix}_quarterly_counts.png')
            plt.close()

        # Add quarterly counts to report data
        self.report_data['quarterly_counts'] = quarterly_counts.to_dict()

        # Print quarterly statistics
        print("\nQuarterly Post Counts:")
        for quarter, count in quarterly_counts.items():
            print(f"{quarter}: {count} posts")

    def analyze_yearly_top_keywords(self):
        """Find the three most frequent cleaned words for every calendar year.

        Adds a ``year`` column to ``self.df``, stores
        ``{year: {word: count}}`` in ``report_data['yearly_top_keywords']``
        and prints the result.
        """
        # Add year column to dataframe
        self.df['year'] = self.df['timestamp'].dt.year

        # Dictionary to store yearly keywords
        yearly_keywords = {}

        # Process each year
        for year in sorted(self.df['year'].unique()):
            year_contents = self.df.loc[self.df['year'] == year, 'content']

            # Count word frequencies for this year
            word_freq = Counter()
            for content in year_contents:
                word_freq.update(self.clean_text(content).split())

            # Store top 3 words
            yearly_keywords[year] = dict(word_freq.most_common(3))

        # Store in report data
        self.report_data['yearly_top_keywords'] = yearly_keywords

        # Print yearly statistics
        print("\nYearly Top 3 Keywords:")
        for year, keywords in yearly_keywords.items():
            print(f"\n{year}:")
            for word, count in keywords.items():
                print(f"  {word}: {count}")

    def run_analysis(self):
        """Run every analysis step in order and write the text report.

        Any exception is reported on stdout and then re-raised.
        """
        print(f"Starting analysis of: {self.input_file}")

        try:
            print("\nAnalyzing filtered content...")
            self.analyze_filtered_content()

            print("\nPerforming sentiment analysis...")
            self.perform_sentiment_analysis()

            print("\nAnalyzing engagement patterns...")
            self.analyze_engagement()

            print("\nAnalyzing quarterly post counts...")
            self.analyze_quarterly_counts()

            print("\nAnalyzing yearly top keywords...")
            self.analyze_yearly_top_keywords()

            print("\nSaving report...")
            self.save_report_data()

            print("\nAnalysis complete!")

        except Exception as e:
            print(f"Error during analysis: {str(e)}")
            raise

    def get_output_paths(self):
        """Return the output file paths, all located under ``self.img_dir``.

        Only ``'txt_report'`` and ``'engagement'`` are used by this class; the
        remaining keys are kept for callers that may rely on them.
        """
        return {
            'txt_report':  f'{self.img_dir}/{self.report_prefix}.txt',
            'time_series': f'{self.img_dir}/{self.report_prefix}_time_series.png',
            'sentiment': f'{self.img_dir}/{self.report_prefix}_sentiment_analysis.png',
            'engagement': f'{self.img_dir}/{self.report_prefix}_engagement_distribution.png',
            'keyword_freq': f'{self.img_dir}/{self.report_prefix}_keyword_frequency.png'
        }

    def analyze_engagement(self):
        """Plot the engagement histogram and record summary statistics.

        Saves the histogram to the ``'engagement'`` output path and fills
        ``report_data['engagement_stats']`` with mean, median, min and max of
        the numeric engagement values, so the report's "Engagement Statistics"
        section is no longer empty.
        """
        plt.figure(figsize=(10, 6))
        sns.histplot(data=self.df, x='engagement', bins=20)
        plt.title('Distribution of Engagement')
        plt.xlabel('Engagement Score')
        plt.ylabel('Number of Posts')
        plt.savefig(self.get_output_paths()['engagement'])
        plt.close()

        # Non-numeric entries are coerced to NaN and ignored for the summary.
        engagement = pd.to_numeric(self.df['engagement'], errors='coerce').dropna()
        if not engagement.empty:
            self.report_data['engagement_stats'] = {
                'mean': float(engagement.mean()),
                'median': float(engagement.median()),
                'min': float(engagement.min()),
                'max': float(engagement.max()),
            }

    def save_report_data(self):
        """Write ``report_data`` as a plain-text report to the ``'txt_report'`` path.

        Sections whose data has not been computed yet (or is empty) are
        omitted, so the report can be written after a partial run.
        """
        paths = self.get_output_paths()
        with open(paths['txt_report'], 'w', encoding='utf-8') as f:
            f.write("FOMO Analysis Report\n")
            f.write("===================\n\n")

            f.write(f"Analysis Timestamp: {self.report_data['timestamp']}\n")
            f.write(f"Input File: {self.report_data['input_file']}\n")
            f.write(f"Total Posts: {self.report_data['total_posts']}\n")
            f.write(f"Date Range: {self.report_data['date_range']}\n\n")

            sentiment_data = self.report_data.get('sentiment_analysis') or {}
            if sentiment_data:
                distribution = sentiment_data['sentiment_distribution']
                f.write("Sentiment Analysis Results\n")
                f.write("------------------------\n")
                f.write(f"Mean Compound Score: {sentiment_data['mean_compound']:.3f}\n")
                f.write(f"Positive Posts: {sentiment_data['positive_posts']} ")
                f.write(f"({distribution['positive']:.1%})\n")
                f.write(f"Neutral Posts: {sentiment_data['neutral_posts']} ")
                f.write(f"({distribution['neutral']:.1%})\n")
                f.write(f"Negative Posts: {sentiment_data['negative_posts']} ")
                f.write(f"({distribution['negative']:.1%})\n\n")

            engagement_stats = self.report_data.get('engagement_stats') or {}
            if engagement_stats:
                f.write("Engagement Statistics\n")
                f.write("--------------------\n")
                for stat, value in engagement_stats.items():
                    f.write(f"{stat.capitalize()}: {value:.2f}\n")
                f.write("\n")

            if 'quarterly_counts' in self.report_data:
                f.write("Quarterly Post Counts\n")
                f.write("-------------------\n")
                for quarter, count in self.report_data['quarterly_counts'].items():
                    f.write(f"{quarter}: {count} posts\n")
                f.write("\n")

            # The keyword results are collected by the analysis steps; include
            # them so the text report reflects everything in report_data.
            keyword_frequency = self.report_data.get('keyword_frequency') or {}
            if keyword_frequency:
                f.write("Top Keywords\n")
                f.write("------------\n")
                for word, count in keyword_frequency.items():
                    f.write(f"{word}: {count}\n")
                f.write("\n")

            yearly_keywords = self.report_data.get('yearly_top_keywords') or {}
            if yearly_keywords:
                f.write("Yearly Top Keywords\n")
                f.write("-------------------\n")
                for year, keywords in yearly_keywords.items():
                    top_words = ', '.join(f"{word} ({count})" for word, count in keywords.items())
                    f.write(f"{year}: {top_words}\n")
                f.write("\n")




    

if __name__ == "__main__":
    # Run the full pipeline on the v3 Reddit export using the analyzer's
    # built-in blacklist (no external blacklist file is passed here).
    # Output is written under FOMO_reports/v3_reddit_analysis/.
    analyzer_with_blacklist = FOMOAnalyzer(
        input_file='v3_reddit_fomo_data.csv',
        output_dir='FOMO_reports',
        report_prefix='v3_reddit_analysis',
    )
    analyzer_with_blacklist.run_analysis()

Starting analysis of: v3_reddit_fomo_data.csv

Analyzing filtered content...

Filtering Statistics:
Total words after filtering: 96466
Unique words after filtering: 8964

Top 20 words after filtering:
feel: 1431
people: 1261
anxiety: 1082
time: 972
get: 954
friends: 950
know: 902
life: 858
want: 768
fomo: 747
social: 697
think: 591
make: 504
something: 498
help: 464
feeling: 463
way: 440
much: 426
please: 419
see: 400

Performing sentiment analysis...

Analyzing engagement patterns...

Analyzing quarterly post counts...

Quarterly Post Counts:
2014Q3: 2 posts
2015Q1: 14 posts
2015Q2: 3 posts
2015Q3: 5 posts
2015Q4: 19 posts
2016Q1: 4 posts
2016Q2: 25 posts
2016Q3: 1 posts
2016Q4: 12 posts
2017Q1: 6 posts
2017Q3: 4 posts
2017Q4: 12 posts
2018Q1: 18 posts
2018Q2: 8 posts
2018Q3: 25 posts
2018Q4: 16 posts
2019Q1: 18 posts
2019Q2: 21 posts
2019Q3: 49 posts
2019Q4: 12 posts
2020Q1: 22 posts
2020Q2: 31 posts
2020Q3: 62 posts
2020Q4: 47 posts
2021Q1: 131 posts
2021Q2: 71 posts
2021Q3: 51 post