In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
import os

# Download the NLTK resources this notebook relies on.
# Recent NLTK releases load the word_tokenize models from the 'punkt_tab'
# package rather than the older pickled 'punkt' package, so both are fetched
# to keep tokenization working regardless of the installed NLTK version.
# 'stopwords' stays last so the cell's displayed result is unchanged.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [6]:
# Create output directory
# Every figure and the text report produced by the functions below are saved
# under this folder. exist_ok=True makes re-running the cell harmless when the
# folder is already there.
output_dir = "FOMO_reports/youtube"
os.makedirs(output_dir, exist_ok=True)

# Patterns are compiled once instead of being looked up again for every row.
_URL_PATTERN = re.compile(r'http\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def _clean_comment(raw):
    """Return one comment normalised to lower-case ASCII letters and whitespace."""
    if pd.isnull(raw):
        return ''
    text = _URL_PATTERN.sub('', str(raw))
    text = _MENTION_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)
    return text.lower().strip()


def clean_and_prepare_data(file_path):
    """Load a comments CSV and return the cleaned, length-filtered rows.

    The file is read as UTF-8 first; if that fails to decode, Latin-1 is
    tried, and CP1252 is the last resort. The ``content`` column is then
    normalised: URLs and @mentions are removed, every character that is not
    an ASCII letter or whitespace is dropped, and the text is lower-cased and
    trimmed. Only comments with 10 to 200 words (inclusive) after cleaning
    are kept.

    Args:
        file_path: Path to a CSV file containing a ``content`` column.

    Returns:
        An independent ``DataFrame`` (an explicit copy, not a slice of the
        loaded frame) holding the surviving rows with the cleaned ``content``.

    Raises:
        KeyError: If the CSV has no ``content`` column.
    """
    try:
        data = pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        try:
            data = pd.read_csv(file_path, encoding='latin1')
        except Exception:
            # Kept as a best-effort last attempt, as in the original flow.
            data = pd.read_csv(file_path, encoding='cp1252')

    if 'content' not in data.columns:
        raise KeyError(f"'content' column not found in {file_path!r}")

    # Single pass over the column; comments that end up empty become NaN.
    data['content'] = data['content'].map(_clean_comment).replace('', np.nan)

    # Missing comments count as zero words and are therefore filtered out.
    word_counts = data['content'].apply(
        lambda text: len(str(text).split()) if pd.notnull(text) else 0
    )
    kept = data[word_counts.between(10, 200)].dropna(subset=['content'])

    # Return a real copy so callers can add columns to the result safely.
    return kept.copy()

# Stop words: NLTK's English list plus a hand-picked list of words to ignore
# in the keyword analysis. A few entries repeat in the custom list; that is
# harmless because the list is folded into a set below.
nltk_stop_words = set(stopwords.words('english'))
custom_stop_words = [
    'video', 'ive', 'really', 'also', 'even', 'know', 'get', 'one',
    'even', 'jaiden', 'im', 'like', 'dont', 'want', 'go',
    # interjections and hesitation sounds
    'yeah', 'oh', 'well', 'okay', 'ya', 'yep', 'nah', 'nope',
    'hmm', 'huh', 'uh', 'um', 'err', 'ah', 'er', 'hey',
    'wow', 'oops', 'ooh', 'whoa',
    # chat abbreviations
    'haha', 'lol', 'lmao', 'rofl', 'omg', 'wtf',
    # expletives and their softened variants
    'damn', 'dang', 'shit', 'fuck', 'crap', 'ass', 'bitch', 'hell', 'damn',
    'freaking', 'bloody', 'freakin', 'frickin', 'fricking', 'friggin', 'friggen',
    'gosh', 'geez', 'darn', 'dang',
    # informal contractions
    'cuz', 'coz', 'cause', 'cos', 'dunno',
    'gonna', 'gotta', 'wanna', 'gonna', 'kinda', 'sorta',
    'coulda', 'shoulda', 'woulda', 'lemme', 'prolly',
    # forms of address
    'bruh', 'bro', 'dude', 'man', 'guys', 'mate',
    'mates', 'folks', 'peeps', 'peepz', 'peepol', 'ppl', 'yall',
]
all_stop_words = nltk_stop_words | set(custom_stop_words)

def process_text(text):
    """Tokenize ``text`` with NLTK and return the tokens worth counting.

    A token is kept, in lower case, when its lower-cased form is not in
    ``all_stop_words``, it is longer than two characters, and it consists
    only of alphabetic characters.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in all_stop_words:
            continue
        if len(token) <= 2:
            continue
        if not token.isalpha():
            continue
        kept.append(lowered)
    return kept

def analyze_sentiment(data):
    """Score every comment with VADER and save a histogram of the scores.

    Adds a ``vader_sentiment`` column to ``data`` in place, holding the VADER
    compound score of each ``content`` value, then writes
    ``sentiment_distribution.png`` into ``output_dir``. Returns nothing.
    """
    scorer = SentimentIntensityAnalyzer()

    def compound_score(text):
        return scorer.polarity_scores(text)['compound']

    data['vader_sentiment'] = data['content'].apply(compound_score)

    # Histogram of the compound scores, saved to disk and then closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(data['vader_sentiment'], bins=20, color='skyblue', edgecolor='black')
    ax.set_title('Distribution of Compound Sentiment Scores on Youtube')
    ax.set_xlabel('Sentiment Score')
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig(f'{output_dir}/sentiment_distribution.png')
    plt.close(fig)

def _plot_top_keywords(word_counts, top_n=20):
    """Save a bar chart of the ``top_n`` most frequent keywords to ``output_dir``."""
    top_words = word_counts.most_common(top_n)
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    # Size the x axis from the words actually available: a fixed range(20)
    # fails with a shape mismatch when fewer than 20 distinct keywords exist.
    positions = list(range(len(top_words)))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(positions, counts, color='skyblue')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_title("Top 20 Keyword Counts (NLTK Filtered)")
    ax.set_xlabel("Keyword")
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(f'{output_dir}/top_keywords.png')
    plt.close(fig)


def _plot_keyword_cloud(word_counts, max_words=40):
    """Save a word cloud of the ``max_words`` most frequent keywords to ``output_dir``."""
    frequencies = dict(word_counts.most_common(max_words))

    # No stopwords argument: the tokens were already filtered by process_text,
    # and WordCloud ignores stopwords when built from a frequency mapping.
    cloud = WordCloud(
        width=1600,
        height=800,
        background_color='white',
        max_words=max_words,
        random_state=42,
        prefer_horizontal=0.7
    ).generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Word Cloud of Top 40 Keywords', fontsize=20, pad=20)
    fig.tight_layout(pad=0)
    fig.savefig(f'{output_dir}/wordcloud.png', dpi=300, bbox_inches='tight')
    plt.close(fig)


def analyze_keywords(data):
    """Count keywords across all comments and save the keyword visualizations.

    Each value of ``data['content']`` is passed through ``process_text`` and
    the resulting tokens are tallied. When at least one keyword survives the
    filtering, ``top_keywords.png`` (bar chart of the 20 most frequent
    keywords) and ``wordcloud.png`` (cloud of the 40 most frequent keywords)
    are written to ``output_dir``.

    Args:
        data: DataFrame with a ``content`` column of cleaned comment text.

    Returns:
        A ``collections.Counter`` mapping each keyword to its frequency. It is
        empty, and no image is written, when no keyword survives filtering.
    """
    word_counts = Counter()
    for comment in data['content']:
        word_counts.update(process_text(comment))

    # Nothing to draw: both plots need at least one keyword.
    if not word_counts:
        print("No keywords left after filtering; skipping keyword plots.")
        return word_counts

    _plot_top_keywords(word_counts)
    _plot_keyword_cloud(word_counts)

    return word_counts

def save_analysis_report(data, word_counts):
    """Write the text report to ``analysis_report.txt`` inside ``output_dir``.

    The report is written as UTF-8. If that raises ``UnicodeEncodeError``,
    the file is reopened (and so truncated) and written again as CP1252.
    """
    report_path = f'{output_dir}/analysis_report.txt'

    def write_with(encoding):
        with open(report_path, 'w', encoding=encoding) as report_file:
            write_report_content(report_file, data, word_counts)

    try:
        write_with('utf-8')
    except UnicodeEncodeError:
        write_with('cp1252')

def write_report_content(f, data, word_counts):
    """Write the analysis report body to an open text file.

    Args:
        f: Writable text file-like object (anything with ``write``).
        data: DataFrame with a numeric ``vader_sentiment`` column.
        word_counts: ``collections.Counter`` of keyword frequencies; its 20
            most common entries are listed at the end of the report.

    An empty ``data`` frame is reported with 0.0% shares instead of failing
    with ``ZeroDivisionError``.
    """
    scores = data['vader_sentiment']
    total = len(data)

    def share(count):
        # Guard the percentage against an empty dataset.
        return count / total * 100 if total else 0.0

    # Each class is counted once and reused for both the count and the share.
    positive = int((scores > 0).sum())
    negative = int((scores < 0).sum())
    neutral = int((scores == 0).sum())

    f.write("YouTube Comments Analysis Report\n")
    f.write("==============================\n\n")

    f.write("Dataset Statistics:\n")
    f.write(f"Total comments analyzed: {total}\n")
    f.write(f"Average sentiment score: {scores.mean():.3f}\n")
    f.write(f"Positive comments: {positive} ({share(positive):.1f}%)\n")
    f.write(f"Negative comments: {negative} ({share(negative):.1f}%)\n")
    f.write(f"Neutral comments: {neutral} ({share(neutral):.1f}%)\n\n")

    f.write("Sentiment Statistics:\n")
    f.write(f"Maximum sentiment score: {scores.max():.3f}\n")
    f.write(f"Minimum sentiment score: {scores.min():.3f}\n")
    f.write(f"Median sentiment score: {scores.median():.3f}\n")
    f.write(f"Standard deviation: {scores.std():.3f}\n\n")

    f.write("Top 20 Keywords (NLTK Filtered):\n")
    for word, count in word_counts.most_common(20):
        f.write(f"{word}: {count}\n")
def main(file_path='v2_youtube_comments.csv'):
    """Run the whole pipeline: load, score sentiment, extract keywords, report.

    Args:
        file_path: CSV of comments to analyse. The default is the file this
            notebook was written for, so calling ``main()`` with no argument
            behaves exactly as before.
    """
    # Load and process data
    print("Loading and cleaning data...")
    data = clean_and_prepare_data(file_path)

    # Perform sentiment analysis (adds the 'vader_sentiment' column to data)
    print("Performing sentiment analysis...")
    analyze_sentiment(data)

    # Analyze keywords
    print("Analyzing keywords...")
    word_counts = analyze_keywords(data)

    # Save report
    print("Saving analysis report...")
    save_analysis_report(data, word_counts)

    print(f"Analysis complete. Results saved in {output_dir}/")

# Run the pipeline when this code is executed directly. The recorded cell
# output below shows that executing the notebook cell triggers this branch.
if __name__ == "__main__":
    main()



Loading and cleaning data...


  data = pd.read_csv(file_path, encoding='latin1')


Performing sentiment analysis...
Analyzing keywords...
Saving analysis report...
Analysis complete. Results saved in FOMO_reports/youtube/
