<a href="https://colab.research.google.com/github/TCU-DCDA/WRIT20833-2025/blob/main/notebooks/exercises/Review_07_Text_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WRIT 20833 Review 07: Text Analysis & Sentiment Analysis

Analyze cultural texts and measure emotional sentiment computationally.

**Make a copy:** File > Save a copy in Drive

## Exercise 1: Setting Up Text Analysis Tools
Install and import libraries for text and sentiment analysis.

In [None]:
# Install required packages (only run once in Colab)
# !pip install vaderSentiment textstat

# Import libraries
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt

# Import sentiment analysis tools
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except ImportError:
    # `sentiment_analyzer` is intentionally left unbound here: the code that
    # uses it catches the resulting NameError and calls simple_sentiment().
    print(" VADER not installed. Run: !pip install vaderSentiment")
else:
    sentiment_analyzer = SentimentIntensityAnalyzer()
    print(" VADER Sentiment Analyzer loaded successfully")


# Fallback simple sentiment function.
# Defined unconditionally so it is always available, whether or not VADER
# imported successfully.
def simple_sentiment(text):
    """Classify *text* with a tiny hand-written word list.

    The text is lower-cased and split into whole words, then the number of
    distinct positive and distinct negative lexicon words present is compared.

    Args:
        text: The string to score.

    Returns:
        A dict with two keys: ``'compound'`` (0.5, -0.5 or 0.0) and
        ``'sentiment'`` (``'positive'``, ``'negative'`` or ``'neutral'``).
    """
    positive_words = {'good', 'great', 'excellent', 'amazing', 'wonderful', 'love', 'beautiful', 'brilliant'}
    negative_words = {'bad', 'terrible', 'awful', 'horrible', 'hate', 'ugly', 'stupid', 'worst'}

    # Match whole words only: a plain substring test would count "badge" as
    # "bad" and "glove" as "love".
    tokens = set(re.findall(r"[a-z']+", text.lower()))
    pos_count = len(positive_words & tokens)
    neg_count = len(negative_words & tokens)

    if pos_count > neg_count:
        return {'compound': 0.5, 'sentiment': 'positive'}
    if neg_count > pos_count:
        return {'compound': -0.5, 'sentiment': 'negative'}
    return {'compound': 0.0, 'sentiment': 'neutral'}

# Test sentiment analysis
sample_texts = [
    "This book is absolutely wonderful and brilliant!",
    "I hate this terrible, awful story.",
    "The weather is nice today."
]

print("\nTesting sentiment analysis:")
for text in sample_texts:
    try:
        compound = sentiment_analyzer.polarity_scores(text)['compound']
    except NameError:
        # `sentiment_analyzer` was never bound (VADER import failed):
        # use the word-list fallback instead.
        fallback = simple_sentiment(text)
        sentiment = fallback['sentiment']
        compound = fallback['compound']
    else:
        # Same +/-0.05 cut-offs that analyze_sentiment() applies, so the
        # demo and the analysis function label a score the same way.
        if compound >= 0.05:
            sentiment = 'positive'
        elif compound <= -0.05:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'

    print(f"Text: \"{text}\"")
    print(f"Sentiment: {sentiment} (score: {compound:.2f})")
    print()

## Exercise 2: Basic Text Analysis Functions
Create functions to analyze text characteristics.

In [None]:
# Text analysis functions
def analyze_text_basics(text):
    """Compute basic length and vocabulary statistics for *text*.

    Words are whitespace-separated tokens; sentences are the non-blank pieces
    obtained by splitting on runs of ``.``, ``!`` or ``?``.

    Args:
        text: The string to analyze.

    Returns:
        A dict with the keys ``char_count``, ``word_count``,
        ``sentence_count``, ``avg_word_length`` (rounded to 2 places),
        ``avg_sentence_length`` (rounded to 2 places), ``unique_words`` and
        ``lexical_diversity`` (unique words / word count, rounded to 3
        places). Averages and diversity are 0 when the text has no words.
    """
    # Tokenize once and reuse for every statistic
    tokens = text.split()

    # Basic counts
    char_count = len(text)
    word_count = len(tokens)
    sentence_count = len([s for s in re.split(r'[.!?]+', text) if s.strip()])

    # Calculate averages (word length ignores surrounding punctuation)
    if word_count > 0:
        avg_word_length = sum(len(token.strip('.,!?;:')) for token in tokens) / word_count
    else:
        avg_word_length = 0
    # With no detectable sentence, fall back to the raw word count
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else word_count

    # Vocabulary richness (case-insensitive, punctuation stripped)
    unique_words = len({token.lower().strip('.,!?;:"') for token in tokens})
    lexical_diversity = unique_words / word_count if word_count > 0 else 0

    return {
        'char_count': char_count,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'avg_word_length': round(avg_word_length, 2),
        'avg_sentence_length': round(avg_sentence_length, 2),
        'unique_words': unique_words,
        'lexical_diversity': round(lexical_diversity, 3)
    }

def get_word_frequencies(text, top_n=10):
    """Return the most frequent content words in *text*.

    Tokens are lower-cased and stripped of surrounding punctuation; stopwords
    and tokens of two characters or fewer are discarded before counting.

    Args:
        text: The string to analyze.
        top_n: Maximum number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as produced
        by ``Counter.most_common``.
    """
    # Simple stopwords list
    stopwords = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must',
                 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
                 'my', 'your', 'his', 'its', 'our', 'their', 'this', 'that', 'these', 'those'}

    # Clean and count words
    words = (token.lower().strip('.,!?;:"()') for token in text.split())
    content_words = [word for word in words if len(word) > 2 and word not in stopwords]

    return Counter(content_words).most_common(top_n)

# Test with literary examples
shakespeare_quote = """To be, or not to be, that is the question: 
Whether 'tis nobler in the mind to suffer 
The slings and arrows of outrageous fortune, 
Or to take arms against a sea of troubles 
And by opposing end them."""

# Print every statistic with a human-readable label (e.g. "Word Count")
print("Text Analysis of Shakespeare's Hamlet:")
analysis = analyze_text_basics(shakespeare_quote)
for key, value in analysis.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

# Show the five most common non-stopword tokens
print("\nMost frequent content words:")
freq_words = get_word_frequencies(shakespeare_quote, 5)
for word, count in freq_words:
    print(f"{word}: {count}")

## Exercise 3: Sentiment Analysis of Cultural Texts
Analyze emotional sentiment in various cultural works.

In [None]:
# Function to analyze sentiment
def analyze_sentiment(text, title="Text"):
    """Analyze sentiment of text using VADER or the fallback method.

    Args:
        text: The string to score.
        title: Label stored in the result under ``'title'``.

    Returns:
        A dict with the keys ``title``, ``positive``, ``neutral``,
        ``negative``, ``compound`` and ``overall``. With VADER, the first
        three come from the analyzer's ``pos``/``neu``/``neg`` scores and
        ``overall`` is ``'Positive'`` (compound >= 0.05), ``'Negative'``
        (compound <= -0.05) or ``'Neutral'``. With the fallback, the matching
        category is set to 0.5 and the others to 0.0.
    """
    try:
        # Use VADER if available
        scores = sentiment_analyzer.polarity_scores(text)
    except NameError:
        # `sentiment_analyzer` is unbound when VADER could not be imported:
        # fall back to the word-list scorer.
        result = simple_sentiment(text)
        label = result['sentiment']
        return {
            'title': title,
            'positive': 0.5 if label == 'positive' else 0.0,
            'neutral': 0.5 if label == 'neutral' else 0.0,
            'negative': 0.5 if label == 'negative' else 0.0,
            'compound': result['compound'],
            'overall': label.title()
        }

    # Determine overall sentiment from the compound score
    compound = scores['compound']
    if compound >= 0.05:
        overall = 'Positive'
    elif compound <= -0.05:
        overall = 'Negative'
    else:
        overall = 'Neutral'

    return {
        'title': title,
        'positive': scores['pos'],
        'neutral': scores['neu'],
        'negative': scores['neg'],
        'compound': compound,
        'overall': overall
    }

# Cultural text samples for analysis
cultural_texts = {
    "MLK Dream": """I have a dream that one day this nation will rise up and live out the true meaning of its creed: 
    We hold these truths to be self-evident, that all men are created equal. I have a dream that one day on the red hills 
    of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the 
    table of brotherhood.""",
    
    "Poe Raven": """Once upon a midnight dreary, while I pondered, weak and weary, 
    Over many a quaint and curious volume of forgotten lore While I nodded, nearly napping, 
    suddenly there came a tapping, As of some one gently rapping, rapping at my chamber door. 
    'Tis some visitor,' I muttered, 'tapping at my chamber door Only this and nothing more.'""",
    
    "Austen Pride": """It is a truth universally acknowledged, that a single man in possession of a good fortune, 
    must be in want of a wife. However little known the feelings or views of such a man may be on his first 
    entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, 
    that he is considered the rightful property of some one or other of their daughters.""",
    
    "Orwell 1984": """It was a bright cold day in April, and the clocks were striking thirteen. 
    Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, 
    slipped quickly through the glass doors of Victory Mansions, though not quickly enough 
    to prevent a swirl of gritty dust from entering along with him."""
}

# Analyze sentiment for each text
print("SENTIMENT ANALYSIS OF CULTURAL TEXTS")
print("=" * 50)

sentiment_results = []
for title, text in cultural_texts.items():
    result = analyze_sentiment(text, title)
    sentiment_results.append(result)
    
    print(f"\n{title}:")
    print(f"  Overall Sentiment: {result['overall']}")
    print(f"  Compound Score: {result['compound']:.3f}")
    print(f"  Positive: {result['positive']:.2f} | Neutral: {result['neutral']:.2f} | Negative: {result['negative']:.2f}")

# Create summary DataFrame (one row per text)
sentiment_df = pd.DataFrame(sentiment_results)
print("\nSENTIMENT SUMMARY:")
print(sentiment_df[['title', 'overall', 'compound']].to_string(index=False))

# Sentiment distribution: how many texts fall under each overall label
sentiment_counts = sentiment_df['overall'].value_counts()
print("\nSentiment Distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count} texts")

## Exercise 4: Comparative Text Analysis
Compare multiple texts across different dimensions.

In [None]:
# Function for comprehensive text comparison
def comprehensive_text_analysis(texts_dict):
    """Perform comprehensive analysis on multiple texts.

    For every entry the basic statistics, sentiment and the three most
    frequent content words are computed and merged into one flat record.

    Args:
        texts_dict: Mapping of title -> text.

    Returns:
        A list with one dict per text, in the mapping's iteration order, with
        the keys ``title``, ``word_count``, ``sentence_count``,
        ``avg_word_length``, ``lexical_diversity``, ``sentiment_overall``,
        ``sentiment_score`` and ``top_words`` (a list of words).
    """
    results = []

    for title, text in texts_dict.items():
        basic_stats = analyze_text_basics(text)      # length / vocabulary statistics
        sentiment = analyze_sentiment(text, title)   # overall label + compound score
        top_words = get_word_frequencies(text, 3)    # (word, count) pairs

        # Combine all analysis into a single record
        results.append({
            'title': title,
            'word_count': basic_stats['word_count'],
            'sentence_count': basic_stats['sentence_count'],
            'avg_word_length': basic_stats['avg_word_length'],
            'lexical_diversity': basic_stats['lexical_diversity'],
            'sentiment_overall': sentiment['overall'],
            'sentiment_score': sentiment['compound'],
            'top_words': [word for word, _count in top_words]
        })

    return results

# Perform comprehensive analysis
comprehensive_results = comprehensive_text_analysis(cultural_texts)

# Convert to DataFrame for analysis (one row per text)
analysis_df = pd.DataFrame(comprehensive_results)

print("COMPREHENSIVE TEXT ANALYSIS")
print("=" * 50)
print(analysis_df[['title', 'word_count', 'avg_word_length', 'lexical_diversity', 'sentiment_overall']].to_string(index=False))

print("\nSTATISTICAL SUMMARY:")
print(f"Average word count: {analysis_df['word_count'].mean():.1f}")
print(f"Average word length: {analysis_df['avg_word_length'].mean():.2f} characters")
print(f"Average lexical diversity: {analysis_df['lexical_diversity'].mean():.3f}")
print(f"Average sentiment score: {analysis_df['sentiment_score'].mean():.3f}")

# Find extremes: idxmax()/idxmin() give the row label, .loc fetches that row
print("\nEXTREME VALUES:")
most_words = analysis_df.loc[analysis_df['word_count'].idxmax()]
print(f"Most words: {most_words['title']} ({most_words['word_count']} words)")

most_diverse = analysis_df.loc[analysis_df['lexical_diversity'].idxmax()]
print(f"Most lexically diverse: {most_diverse['title']} ({most_diverse['lexical_diversity']:.3f})")

most_positive = analysis_df.loc[analysis_df['sentiment_score'].idxmax()]
print(f"Most positive: {most_positive['title']} (score: {most_positive['sentiment_score']:.3f})")

most_negative = analysis_df.loc[analysis_df['sentiment_score'].idxmin()]
print(f"Most negative: {most_negative['title']} (score: {most_negative['sentiment_score']:.3f})")

# Group by sentiment: mean of each numeric feature per overall label
print("\nGROUPED BY SENTIMENT:")
sentiment_groups = analysis_df.groupby('sentiment_overall').agg({
    'word_count': 'mean',
    'avg_word_length': 'mean',
    'lexical_diversity': 'mean'
})
print(sentiment_groups.round(2))

## Exercise 5: Emotional Word Analysis
Identify and categorize emotional language in texts.

In [None]:
# Emotion word dictionaries (simplified)
emotion_words = {
    'joy': ['happy', 'joy', 'delight', 'cheerful', 'glad', 'pleased', 'elated', 'ecstatic', 'blissful', 'radiant'],
    'sadness': ['sad', 'sorrow', 'grief', 'melancholy', 'despair', 'gloom', 'misery', 'anguish', 'heartbreak'],
    'anger': ['angry', 'rage', 'fury', 'wrath', 'irritated', 'annoyed', 'furious', 'livid', 'incensed'],
    'fear': ['fear', 'afraid', 'scared', 'terrified', 'anxious', 'worried', 'nervous', 'frightened', 'alarmed'],
    'love': ['love', 'adore', 'cherish', 'affection', 'devotion', 'passion', 'romance', 'tender', 'beloved'],
    'hope': ['hope', 'optimism', 'faith', 'trust', 'confidence', 'aspiration', 'dream', 'wish', 'believe']
}

def analyze_emotions(text, title="Text"):
    """Analyze emotional content in text.

    The text is lower-cased and split into words, and each word is looked up
    in the ``emotion_words`` lexicon.

    Args:
        text: The string to analyze.
        title: Label stored in the result under ``'title'``.

    Returns:
        A dict with the keys ``title``, ``emotion_counts`` (emotion -> number
        of matching words), ``emotion_matches`` (emotion -> list of matched
        words, in text order), ``emotional_density`` (emotion words / total
        words, rounded to 4 places), ``dominant_emotion`` (the emotion with
        the highest count, or ``'neutral'`` when nothing matched) and
        ``total_emotion_words``.
    """
    # Clean text for analysis. The pattern needs single backslashes: in a raw
    # string, r'\\b' would look for a literal backslash and never match.
    words = re.findall(r'\b\w+\b', text.lower())

    # Count emotional words
    emotion_counts = {}
    emotion_matches = {}

    for emotion, word_list in emotion_words.items():
        matches = [word for word in words if word in word_list]
        emotion_counts[emotion] = len(matches)
        emotion_matches[emotion] = matches

    # Calculate emotional intensity
    total_emotion_words = sum(emotion_counts.values())
    total_words = len(words)
    emotional_density = total_emotion_words / total_words if total_words > 0 else 0

    # Find dominant emotion (ties go to the emotion listed first in the lexicon)
    dominant_emotion = max(emotion_counts, key=emotion_counts.get) if total_emotion_words > 0 else 'neutral'

    return {
        'title': title,
        'emotion_counts': emotion_counts,
        'emotion_matches': emotion_matches,
        'emotional_density': round(emotional_density, 4),
        'dominant_emotion': dominant_emotion,
        'total_emotion_words': total_emotion_words
    }

# Analyze emotions in cultural texts
print("EMOTIONAL WORD ANALYSIS")
print("=" * 40)

emotion_results = []
for title, text in cultural_texts.items():
    emotion_analysis = analyze_emotions(text, title)
    emotion_results.append(emotion_analysis)
    
    print(f"\n{title}:")
    print(f"  Dominant emotion: {emotion_analysis['dominant_emotion']}")
    print(f"  Emotional density: {emotion_analysis['emotional_density']:.1%}")
    print(f"  Total emotion words: {emotion_analysis['total_emotion_words']}")
    
    # Show emotion breakdown (only emotions that actually occurred)
    print("  Emotion breakdown:")
    for emotion, count in emotion_analysis['emotion_counts'].items():
        if count > 0:
            matches = emotion_analysis['emotion_matches'][emotion]
            print(f"    {emotion.title()}: {count} ({', '.join(matches)})")

# Summary statistics
print("\n\nEMOTIONAL SUMMARY:")
print("=" * 20)

# Aggregate emotion counts across all texts
total_emotions = {emotion: 0 for emotion in emotion_words}
for result in emotion_results:
    for emotion, count in result['emotion_counts'].items():
        total_emotions[emotion] += count

print("Total emotional words across all texts:")
for emotion, total in sorted(total_emotions.items(), key=lambda x: x[1], reverse=True):
    print(f"{emotion.title()}: {total}")

# Most emotionally dense text
most_emotional = max(emotion_results, key=lambda x: x['emotional_density'])
print(f"\nMost emotionally dense text: {most_emotional['title']} ({most_emotional['emotional_density']:.1%})")

least_emotional = min(emotion_results, key=lambda x: x['emotional_density'])
print(f"Least emotionally dense text: {least_emotional['title']} ({least_emotional['emotional_density']:.1%})")

## Exercise 6: Creating Your Own Text Analysis
Apply text analysis to your own cultural texts of interest.

In [None]:
# TODO: Add your own texts for analysis
# Consider: Song lyrics, poems, speeches, book excerpts, movie quotes, etc.

# Placeholder shipped with the exercise. It is kept in one place so the
# "has the student customized this yet?" check below compares against the
# exact same string that is stored in the dictionary.
PLACEHOLDER_TEXT = """Replace this with a text you want to analyze. 
    This could be song lyrics, a poem, a speech excerpt, or any cultural text 
    that interests you. Make sure it's long enough for meaningful analysis."""

your_texts = {
    # TODO: Replace these with texts that interest you
    "Sample Text 1": PLACEHOLDER_TEXT,
    
    "Sample Text 2": """Add another text here for comparison. 
    Consider choosing texts from different genres, time periods, or cultural contexts 
    to see how they differ in sentiment and emotional content.""",
    
    "Sample Text 3": """A third text allows for richer comparison. 
    You might want to include texts that you hypothesize will have different 
    emotional tones or writing styles."""
}

# TODO: Perform your analysis
print("YOUR TEXT ANALYSIS PROJECT")
print("=" * 40)

# Run the analysis only once the first entry is no longer the placeholder
if your_texts and next(iter(your_texts.values())) != PLACEHOLDER_TEXT:
    
    # Perform comprehensive analysis on your texts
    your_results = comprehensive_text_analysis(your_texts)
    your_df = pd.DataFrame(your_results)
    
    print("Basic Analysis Results:")
    print(your_df[['title', 'word_count', 'sentiment_overall', 'sentiment_score']].to_string(index=False))
    
    print("\nYour Analysis Summary:")
    print(f"Most positive text: {your_df.loc[your_df['sentiment_score'].idxmax(), 'title']}")
    print(f"Most negative text: {your_df.loc[your_df['sentiment_score'].idxmin(), 'title']}")
    print(f"Longest text: {your_df.loc[your_df['word_count'].idxmax(), 'title']}")
    print(f"Most diverse vocabulary: {your_df.loc[your_df['lexical_diversity'].idxmax(), 'title']}")
    
    # Emotional analysis of your texts
    print("\nEmotional Analysis of Your Texts:")
    for title, text in your_texts.items():
        emotion_result = analyze_emotions(text, title)
        print(f"\n{title}:")
        print(f"  Dominant emotion: {emotion_result['dominant_emotion']}")
        print(f"  Emotional density: {emotion_result['emotional_density']:.1%}")
        
        # Show top emotions (at most three, skipping emotions with no matches)
        sorted_emotions = sorted(emotion_result['emotion_counts'].items(), 
                                key=lambda x: x[1], reverse=True)[:3]
        for emotion, count in sorted_emotions:
            if count > 0:
                print(f"    {emotion.title()}: {count}")
else:
    print("Please customize the 'your_texts' dictionary above with your own cultural texts!")
    print("Consider analyzing:")
    print("- Song lyrics from different genres")
    print("- Poems from different time periods")
    print("- Political speeches")
    print("- Movie or book quotes")
    print("- Social media posts")
    print("- Historical documents")

# Research questions to explore
print("\n" + "=" * 50)
print("RESEARCH QUESTIONS TO EXPLORE:")
print("- How does sentiment vary across different cultural genres?")
print("- What emotional patterns distinguish different historical periods?")
print("- How do authors use emotional language to achieve different effects?")
print("- What can computational analysis reveal about cultural texts that traditional analysis might miss?")
print("- What are the limitations of automated sentiment analysis for cultural interpretation?")

## Summary

You explored:
- Setting up and using sentiment analysis tools (VADER)
- Creating functions for basic text analysis (word counts, lexical diversity)
- Analyzing sentiment in classic cultural texts
- Comparing multiple texts across various dimensions
- Identifying emotional language patterns
- Applying analysis to your own texts of interest
- Critical evaluation of computational text analysis

**Key Skills:**
- Text preprocessing and cleaning
- Sentiment analysis with compound scores
- Word frequency analysis and stopword removal
- Emotional content categorization
- Comparative text analysis
- Statistical summary of text features

**Key Insights:**
- Computational tools provide quantitative perspectives on cultural texts
- Sentiment analysis can reveal patterns across large collections
- Emotional language density varies significantly between genres and authors
- Automated analysis complements but doesn't replace careful human interpretation
- Cultural context and literary devices require careful consideration

**Next:** Review 08 will cover data visualization for cultural analysis.

---
 