# Twitter Sentiment Analysis

This notebook performs comprehensive sentiment analysis on Twitter data and creates visualizations showing the distribution of positive, negative, and neutral sentiments.

## Features:
- Uses both TextBlob and VADER sentiment analysis
- Creates multiple visualizations including bar charts and pie charts
- Analyzes engagement patterns by sentiment
- Exports results to CSV for further analysis

## 1. Setup and Package Installation

In [None]:
# Install required packages
!pip install textblob vaderSentiment matplotlib seaborn pandas

# Download NLTK data for TextBlob
import nltk
nltk.download('punkt')
nltk.download('brown')
nltk.download('vader_lexicon')

## 2. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings('ignore')

# Set up matplotlib for Google Colab: apply matplotlib's 'default' style sheet,
# then select seaborn's "husl" colour palette.
# NOTE(review): the palette is presumably set after the style sheet so that the
# style does not override it - confirm before reordering these two calls.
plt.style.use('default')
sns.set_palette("husl")

# Confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

## 3. Upload and Load Dataset

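Upload your `twitter_dataset.csv` file using the file upload feature in Colab. The file must contain a `Text` column holding the tweet text; the engagement analysis also uses the `Likes` and `Retweets` columns.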

In [None]:
from google.colab import files

# Upload the dataset through Colab's file picker.
print("Please upload your twitter_dataset.csv file:")
uploaded = files.upload()

# If nothing was uploaded (for example the dialog was dismissed) there is no
# filename to read; stop with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select your twitter_dataset.csv file."
    )

# Get the uploaded filename (the first one if several files were selected)
dataset_filename = next(iter(uploaded))
print(f"Dataset uploaded: {dataset_filename}")

In [None]:
# Load and examine the dataset
def load_twitter_data(file_path):
    """Read the Twitter dataset CSV into a DataFrame.

    Prints progress, the loaded shape and the column names.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None if reading the file raised an error
        (the error message is printed rather than re-raised).
    """
    print("Loading Twitter dataset...")
    try:
        tweets = pd.read_csv(file_path)
    except Exception as load_error:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading dataset: {load_error}")
        return None

    print(f"Dataset loaded successfully! Shape: {tweets.shape}")
    print(f"Columns: {list(tweets.columns)}")
    return tweets

# Load the dataset
df = load_twitter_data(dataset_filename)

# load_twitter_data signals failure by returning None. Stop here with a clear
# message instead of failing on df.head() with an AttributeError.
if df is None:
    raise RuntimeError(
        f"Could not load '{dataset_filename}'. See the error printed above, "
        "then upload a valid CSV file and re-run."
    )

# Display first few rows
print("\nFirst 5 rows of the dataset:")
df.head()

## 4. Sentiment Analysis Functions

In [None]:
def analyze_sentiment_textblob(text):
    """Classify a text as 'Positive', 'Negative' or 'Neutral' using TextBlob.

    Args:
        text: The text to score.

    Returns:
        'Positive' when the TextBlob polarity is above 0, 'Negative' when it is
        below 0, and 'Neutral' when it is exactly 0 or when scoring fails.
    """
    try:
        polarity = TextBlob(text).sentiment.polarity

        if polarity > 0:
            return 'Positive'
        if polarity < 0:
            return 'Negative'
        return 'Neutral'
    except Exception:
        # Best-effort: a text that cannot be scored is counted as neutral.
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt through, so a long .apply() run can be stopped.
        return 'Neutral'

# Shared VADER analyzer, created on first use by _get_vader_analyzer().
_vader_analyzer = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


def analyze_sentiment_vader(text):
    """Classify a text as 'Positive', 'Negative' or 'Neutral' using VADER.

    Args:
        text: The text to score.

    Returns:
        'Positive' when the compound score is >= 0.05, 'Negative' when it is
        <= -0.05, and 'Neutral' otherwise or when scoring fails.
    """
    # Reuse one analyzer: this function is applied to every row, and building
    # a new analyzer per call repeats the same setup work each time.
    analyzer = _get_vader_analyzer()
    try:
        compound = analyzer.polarity_scores(text)['compound']

        if compound >= 0.05:
            return 'Positive'
        if compound <= -0.05:
            return 'Negative'
        return 'Neutral'
    except Exception:
        # Best-effort: a text that cannot be scored is counted as neutral.
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt through, so a long .apply() run can be stopped.
        return 'Neutral'


print("Sentiment analysis functions defined!")

## 5. Perform Sentiment Analysis

In [None]:
# Perform sentiment analysis
print("Performing sentiment analysis...")

# Fail early with a readable message if the tweet text column is missing.
if 'Text' not in df.columns:
    raise KeyError(
        f"Expected a 'Text' column in the dataset; found columns: {list(df.columns)}"
    )

# Clean the text column. Missing values are replaced with '' first, because
# astype(str) alone would turn them into the literal text 'nan', which would
# then be analysed and exported as if it were a tweet.
df['Text'] = df['Text'].fillna('').astype(str)

# Apply sentiment analysis using both methods
print("Analyzing with TextBlob...")
df['sentiment_textblob'] = df['Text'].apply(analyze_sentiment_textblob)

print("Analyzing with VADER...")
df['sentiment_vader'] = df['Text'].apply(analyze_sentiment_vader)

# Create a combined sentiment (using VADER as primary)
df['sentiment'] = df['sentiment_vader']

print("Sentiment analysis completed!")
print(f"\nDataset now has {len(df.columns)} columns:")
print(list(df.columns))

## 6. Create Sentiment Visualizations

In [None]:
# Create comprehensive sentiment visualization
print("Creating sentiment visualization...")

# One colour per sentiment label. Colours are looked up by label rather than by
# position: value_counts() orders labels by frequency and groupby() orders them
# alphabetically, so a fixed colour list would paint the wrong bars.
sentiment_palette = {
    'Positive': '#2E8B57',  # Green
    'Negative': '#DC143C',  # Red
    'Neutral': '#FFD700',   # Gold
}
fallback_color = '#808080'  # Grey, for any label missing from the palette


def _colors_for(labels):
    """Return the palette colour of each label, in the order given."""
    return [sentiment_palette.get(label, fallback_color) for label in labels]


def _plot_sentiment_bars(ax, values, title, ylabel, format_label):
    """Draw one bar per sentiment on ax and write each bar's value above it.

    values is a Series indexed by sentiment label; format_label turns a bar
    height into the text shown above that bar. Returns the bar container.
    """
    bars = ax.bar(values.index, values.values, color=_colors_for(values.index))
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel(ylabel)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.annotate(format_label(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontweight='bold')
    return bars


# Create figure with subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Twitter Sentiment Analysis Dashboard', fontsize=16, fontweight='bold')

# 1. VADER Sentiment Distribution (Main Chart)
sentiment_counts = df['sentiment_vader'].value_counts()
bars1 = _plot_sentiment_bars(ax1, sentiment_counts,
                             'Sentiment Distribution (VADER)', 'Number of Tweets',
                             lambda height: f'{int(height)}')

# 2. TextBlob Sentiment Distribution
sentiment_counts_tb = df['sentiment_textblob'].value_counts()
bars2 = _plot_sentiment_bars(ax2, sentiment_counts_tb,
                             'Sentiment Distribution (TextBlob)', 'Number of Tweets',
                             lambda height: f'{int(height)}')

# 3. Percentage Distribution (Pie Chart)
sentiment_percentages = df['sentiment_vader'].value_counts(normalize=True) * 100
ax3.pie(sentiment_percentages.values, labels=sentiment_percentages.index, autopct='%1.1f%%',
        colors=_colors_for(sentiment_percentages.index), startangle=90)
ax3.set_title('Sentiment Percentage Distribution', fontweight='bold')

# 4. Sentiment by Engagement (Likes vs Sentiment)
sentiment_likes = df.groupby('sentiment_vader')['Likes'].mean()
bars4 = _plot_sentiment_bars(ax4, sentiment_likes,
                             'Average Likes by Sentiment', 'Average Likes',
                             lambda height: f'{height:.1f}')

plt.tight_layout()
plt.show()

## 7. Simple Bar Chart - Positive, Negative, and Neutral Sentiment

In [None]:
# Standalone bar chart: number of tweets per VADER sentiment label
plt.figure(figsize=(10, 6))

# Tweets per sentiment label, and the total used for the percentages
sentiment_counts = df['sentiment_vader'].value_counts()
total_tweets = len(df)

# Colour each bar by its label; grey for any label missing from the mapping
colors = {'Positive': '#2E8B57', 'Negative': '#DC143C', 'Neutral': '#FFD700'}
bar_colors = [colors[label] if label in colors else '#808080'
              for label in sentiment_counts.index]

bars = plt.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)

# Title and axis labels
plt.title('Twitter Sentiment Distribution', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Sentiment', fontsize=12, fontweight='bold')
plt.ylabel('Number of Tweets', fontsize=12, fontweight='bold')

# Tweet count just above each bar
for rect in bars:
    bar_top = rect.get_height()
    bar_center = rect.get_x() + rect.get_width() / 2
    plt.annotate(f'{int(bar_top)}',
                 xy=(bar_center, bar_top),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom',
                 fontsize=14, fontweight='bold')

# Share of all tweets per label, reused for the chart labels and the printout
percent_by_label = {
    label: (tweet_count / total_tweets) * 100
    for label, tweet_count in sentiment_counts.items()
}

# Percentage drawn 30 points below the top of each bar; bar i sits at x = i
for position, (label, tweet_count) in enumerate(sentiment_counts.items()):
    plt.annotate(f'({percent_by_label[label]:.1f}%)',
                 xy=(position, tweet_count),
                 xytext=(0, -30),
                 textcoords="offset points",
                 ha='center', va='top',
                 fontsize=12, fontweight='bold')

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Same numbers as text
print(f"\nSentiment Analysis Results:")
for label, tweet_count in sentiment_counts.items():
    print(f"{label}: {tweet_count} tweets ({percent_by_label[label]:.1f}%)")

## 8. Detailed Analysis Summary

In [None]:
# Print comprehensive analysis summary

def _print_sentiment_breakdown(counts, total):
    """Print one '<label>: <count> tweets (<share>%)' line per entry of counts."""
    for sentiment, count in counts.items():
        percentage = (count / total) * 100
        print(f"{sentiment}: {count:,} tweets ({percentage:.1f}%)")


def _tweet_preview(tweet, limit=80):
    """Return tweet cut to limit characters, adding '...' only if it was cut."""
    if len(tweet) <= limit:
        return tweet
    return f"{tweet[:limit]}..."


print("=" * 60)
print("TWITTER SENTIMENT ANALYSIS SUMMARY")
print("=" * 60)

total_tweets = len(df)
print(f"Total tweets analyzed: {total_tweets:,}")

print("\n--- VADER Sentiment Analysis ---")
vader_counts = df['sentiment_vader'].value_counts()
_print_sentiment_breakdown(vader_counts, total_tweets)

print("\n--- TextBlob Sentiment Analysis ---")
textblob_counts = df['sentiment_textblob'].value_counts()
_print_sentiment_breakdown(textblob_counts, total_tweets)

print("\n--- Engagement Analysis ---")
engagement_by_sentiment = df.groupby('sentiment_vader')[['Likes', 'Retweets']].mean()
print("Average engagement by sentiment (VADER):")
print(engagement_by_sentiment.round(2))

print("\n--- Sample Tweets by Sentiment ---")
for sentiment in ['Positive', 'Negative', 'Neutral']:
    # Skip labels that do not occur in this dataset
    if sentiment in df['sentiment_vader'].values:
        sample_tweets = df[df['sentiment_vader'] == sentiment]['Text'].head(2)
        print(f"\n{sentiment} tweet examples:")
        for i, tweet in enumerate(sample_tweets, 1):
            # The ellipsis is shown only when the tweet was actually shortened
            print(f"  {i}. \"{_tweet_preview(tweet)}\"")

## 9. Export Results

In [None]:
# Save results to CSV: write the full DataFrame, including the sentiment
# columns, to a file in the current working directory. index=False leaves the
# DataFrame index out of the file.
output_file = 'twitter_sentiment_results.csv'
df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Download the results file. `files` is the google.colab module imported in
# the upload cell, so this step only works when running in Colab.
files.download(output_file)

# Display final dataset structure; df.head() is the last expression so the
# notebook renders it as the cell output.
print(f"\nFinal dataset shape: {df.shape}")
print(f"New columns added: sentiment_textblob, sentiment_vader, sentiment")
df.head()

## 10. Advanced Analysis (Optional)

In [None]:
# Optional drill-down for a blockchain/FinTech context: VADER sentiment of the
# tweets whose text mentions at least one financial keyword.
financial_keywords = ['money', 'payment', 'bank', 'finance', 'crypto', 'bitcoin', 'blockchain', 'economic']
financial_pattern = '|'.join(financial_keywords)  # alternation: any keyword matches

# Flag matching tweets (case-insensitive; missing text counts as no match)
df['is_financial'] = df['Text'].str.contains(financial_pattern, case=False, na=False)
financial_tweets = df[df['is_financial']]
financial_total = len(financial_tweets)

if financial_total == 0:
    print("\nNo tweets found with financial keywords in this dataset.")
else:
    print(f"\nFound {financial_total} tweets with financial keywords")

    # Tweets per sentiment label within the financial subset
    fin_sentiment = financial_tweets['sentiment_vader'].value_counts()

    # Green for Positive, red for Negative, gold for anything else
    label_colors = {'Positive': '#2E8B57', 'Negative': '#DC143C'}
    fin_bar_colors = [label_colors.get(label, '#FFD700') for label in fin_sentiment.index]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(fin_sentiment.index, fin_sentiment.values, color=fin_bar_colors)

    plt.title('Sentiment Distribution - Financial/Crypto Related Tweets',
              fontsize=14, fontweight='bold')
    plt.xlabel('Sentiment')
    plt.ylabel('Number of Tweets')

    # Tweet count just above each bar
    for rect in bars:
        bar_top = rect.get_height()
        bar_center = rect.get_x() + rect.get_width() / 2
        plt.annotate(f'{int(bar_top)}',
                     xy=(bar_center, bar_top),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Same breakdown as text, as a share of the financial subset
    print("\nFinancial tweets sentiment breakdown:")
    for label, tweet_count in fin_sentiment.items():
        share = (tweet_count / financial_total) * 100
        print(f"{label}: {tweet_count} tweets ({share:.1f}%)")