In [3]:
import pandas as pd
import time

# Twitter Dataset - Simple Pandas Groupby Analysis
print("=== TWITTER DATASET - PANDAS GROUPBY ===")

# Ask for the CSV location interactively; surrounding whitespace is stripped.
csv_path = input("Enter path to your Twitter CSV file: ").strip()

# Load the dataset. On failure, report the error and stop with a non-zero
# status. `raise SystemExit(1)` replaces `exit()` because `exit()` reports
# success (status 0) when run as a script, and in an IPython/Jupyter kernel
# `exit` is IPython's own exit command, which asks the kernel to shut down
# instead of merely stopping this cell.
try:
    twitter_df = pd.read_csv(csv_path)
except Exception as e:
    print(f" Error loading CSV: {e}")
    raise SystemExit(1) from e
else:
    print(" Dataset loaded successfully!")

# The timer starts after the load, so it covers only the analysis that follows.
start_time = time.time()

print(f"Dataset: {len(twitter_df):,} tweets")
print(f"Grouping by: source ({twitter_df['source'].nunique()} unique accounts)")

# 1. TOP ACCOUNTS BY TOTAL LIKES
print("\n1. TOP 10 ACCOUNTS BY TOTAL LIKES")
print("=" * 60)

# Per-source summary of likeCount, ordered by total likes (largest first).
# `tweets` uses 'size' (rows in the group) rather than 'count' (non-null
# likeCount values), so the "Tweets" column is the number of tweets even when
# some like counts are missing. Values are rounded to whole numbers.
likes_by_source = (
    twitter_df.groupby('source')['likeCount']
    .agg(tweets='size', total_likes='sum', avg_likes='mean')
    .round(0)
    .sort_values('total_likes', ascending=False)
)

print(f"{'Account':<35} {'Tweets':<8} {'Total Likes':<12} {'Avg Likes':<10}")
print("-" * 67)
# itertuples() exposes the group key as `Index` and each column as an
# attribute, without building a Series per row as iterrows() does.
for row in likes_by_source.head(10).itertuples():
    print(f"{row.Index[:32]:<35} {row.tweets:<8.0f} {row.total_likes:<12,.0f} {row.avg_likes:<10.0f}")

# 2. TOP ACCOUNTS BY TOTAL RETWEETS
print("\n2. TOP 10 ACCOUNTS BY TOTAL RETWEETS")
print("=" * 60)

# Per-source summary of retweetCount, ordered by total retweets (largest
# first). `tweets` uses 'size' (rows in the group) rather than 'count'
# (non-null retweetCount values), so the "Tweets" column is the number of
# tweets even when some retweet counts are missing. Values are rounded to
# whole numbers.
retweets_by_source = (
    twitter_df.groupby('source')['retweetCount']
    .agg(tweets='size', total_retweets='sum', avg_retweets='mean')
    .round(0)
    .sort_values('total_retweets', ascending=False)
)

print(f"{'Account':<35} {'Tweets':<8} {'Total Retweets':<15} {'Avg Retweets':<12}")
print("-" * 72)
# itertuples() exposes the group key as `Index` and each column as an
# attribute, without building a Series per row as iterrows() does.
for row in retweets_by_source.head(10).itertuples():
    print(f"{row.Index[:32]:<35} {row.tweets:<8.0f} {row.total_retweets:<15,.0f} {row.avg_retweets:<12.0f}")

# 3. TOP ACCOUNTS BY TWEET COUNT
print("\n3. TOP 10 ACCOUNTS BY TWEET COUNT")
print("=" * 60)

# Number of rows per source, largest group first.
rows_per_source = twitter_df.groupby('source').size()
tweet_count = rows_per_source.sort_values(ascending=False)

print(f"{'Account':<35} {'Tweet Count':<12}")
print("-" * 49)
# Walk the ten largest groups as (label, value) pairs.
busiest_sources = tweet_count.head(10)
for account, n_tweets in zip(busiest_sources.index, busiest_sources):
    print(f"{account[:32]:<35} {n_tweets:<12}")

# 4. MONTHLY ACTIVITY
print("\n4. TOP 5 MONTHS BY TWEET VOLUME")
print("=" * 60)

# Number of rows per month_year value, busiest first.
rows_per_month = twitter_df.groupby('month_year').size()
monthly_tweets = rows_per_month.sort_values(ascending=False)

print(f"{'Month':<10} {'Tweets':<8}")
print("-" * 20)
# Walk the five largest groups as (label, value) pairs.
busiest_months = monthly_tweets.head(5)
for period, n_tweets in zip(busiest_months.index, busiest_months):
    print(f"{period:<10} {n_tweets:<8}")

# Execution summary
# Stop the clock first so the summary lookups below are not part of the timing.
end_time = time.time()
execution_time = end_time - start_time

account_total = twitter_df['source'].nunique()
tweet_total = len(twitter_df)

summary_lines = [
    "\n5. EXECUTION SUMMARY",
    "=" * 60,
    f"Pandas groupby time: {execution_time:.3f} seconds",
    f"Accounts analyzed: {account_total}",
    f"Total tweets: {tweet_total:,}",
    "\n Pandas groupby analysis complete!",
]
for line in summary_lines:
    print(line)

=== TWITTER DATASET - PANDAS GROUPBY ===


Enter path to your Twitter CSV file:  2024_tw_posts_president_scored_anon.csv


 Dataset loaded successfully!
Dataset: 27,304 tweets
Grouping by: source (14 unique accounts)

1. TOP 10 ACCOUNTS BY TOTAL LIKES
Account                             Tweets   Total Likes  Avg Likes 
-------------------------------------------------------------------
Twitter Web App                     14930    99,579,349   6670      
Sprout Social                       2933     53,302,601   18173     
Twitter for iPhone                  8494     31,784,204   3742      
Twitter Media Studio                499      3,691,549    7398      
Periscope                           103      255,786      2483      
TweetDeck Web App                   7        72,523       10360     
Twitter for iPad                    266      59,237       223       
Hootsuite Inc.                      47       17,899       381       
Twitter for Advertisers             7        5,289        756       
Loomly                              10       1,603        160       

2. TOP 10 ACCOUNTS BY TOTAL RETWEETS
Accoun

In [4]:
import polars as pl
import time

# Twitter Dataset - Simple Polars Groupby Analysis
print("=== TWITTER DATASET - POLARS GROUPBY ===")

# Ask for the CSV location interactively; surrounding whitespace is stripped.
csv_path = input("Enter path to your Twitter CSV file: ").strip()

# Load the dataset. On failure, report the error and stop with a non-zero
# status. `raise SystemExit(1)` replaces `exit()` because `exit()` reports
# success (status 0) when run as a script, and in an IPython/Jupyter kernel
# `exit` is IPython's own exit command, which asks the kernel to shut down
# instead of merely stopping this cell.
try:
    twitter_df = pl.read_csv(csv_path)
except Exception as e:
    print(f" Error loading CSV: {e}")
    raise SystemExit(1) from e
else:
    print(" Dataset loaded successfully!")

# The timer starts after the load, so it covers only the analysis that follows.
start_time = time.time()

print(f"Dataset: {twitter_df.height:,} tweets")
print(f"Grouping by: source ({twitter_df.select('source').n_unique():,} unique accounts)")

# 1. TOP ACCOUNTS BY TOTAL LIKES
print("\n1. TOP 10 ACCOUNTS BY TOTAL LIKES")
print("=" * 60)

# One row per source: group size plus sum and mean of likeCount,
# ordered by total likes (largest first).
likes_per_source = twitter_df.group_by('source').agg(
    pl.len().alias('tweets'),
    pl.col('likeCount').sum().alias('total_likes'),
    pl.col('likeCount').mean().alias('avg_likes'),
)
likes_by_source = likes_per_source.sort('total_likes', descending=True)

print(f"{'Account':<35} {'Tweets':<8} {'Total Likes':<12} {'Avg Likes':<10}")
print("-" * 67)
# head(10) yields at most ten rows; each row arrives as a column-name -> value dict.
for rec in likes_by_source.head(10).iter_rows(named=True):
    print(f"{rec['source'][:32]:<35} {rec['tweets']:<8} {rec['total_likes']:<12,.0f} {rec['avg_likes']:<10.0f}")

# 2. TOP ACCOUNTS BY TOTAL RETWEETS
print("\n2. TOP 10 ACCOUNTS BY TOTAL RETWEETS")
print("=" * 60)

# One row per source: group size plus sum and mean of retweetCount,
# ordered by total retweets (largest first).
retweets_per_source = twitter_df.group_by('source').agg(
    pl.len().alias('tweets'),
    pl.col('retweetCount').sum().alias('total_retweets'),
    pl.col('retweetCount').mean().alias('avg_retweets'),
)
retweets_by_source = retweets_per_source.sort('total_retweets', descending=True)

print(f"{'Account':<35} {'Tweets':<8} {'Total Retweets':<15} {'Avg Retweets':<12}")
print("-" * 72)
# head(10) yields at most ten rows; each row arrives as a column-name -> value dict.
for rec in retweets_by_source.head(10).iter_rows(named=True):
    print(f"{rec['source'][:32]:<35} {rec['tweets']:<8} {rec['total_retweets']:<15,.0f} {rec['avg_retweets']:<12.0f}")

# 3. TOP ACCOUNTS BY TWEET COUNT
print("\n3. TOP 10 ACCOUNTS BY TWEET COUNT")
print("=" * 60)

# One row per source with its group size, largest group first.
rows_per_source = twitter_df.group_by('source').agg(pl.len().alias('tweet_count'))
tweet_count = rows_per_source.sort('tweet_count', descending=True)

print(f"{'Account':<35} {'Tweet Count':<12}")
print("-" * 49)
# head(10) yields at most ten rows; each row arrives as a column-name -> value dict.
for rec in tweet_count.head(10).iter_rows(named=True):
    print(f"{rec['source'][:32]:<35} {rec['tweet_count']:<12}")

# 4. MONTHLY ACTIVITY
print("\n4. TOP 5 MONTHS BY TWEET VOLUME")
print("=" * 60)

# One row per month_year value with its group size, busiest first.
rows_per_month = twitter_df.group_by('month_year').agg(pl.len().alias('tweets'))
monthly_tweets = rows_per_month.sort('tweets', descending=True)

print(f"{'Month':<10} {'Tweets':<8}")
print("-" * 20)
# head(5) yields at most five rows; each row arrives as a column-name -> value dict.
for rec in monthly_tweets.head(5).iter_rows(named=True):
    print(f"{rec['month_year']:<10} {rec['tweets']:<8}")

# Execution summary
# Stop the clock first so the summary lookups below are not part of the timing.
end_time = time.time()
execution_time = end_time - start_time

account_total = twitter_df.select('source').n_unique()
tweet_total = twitter_df.height

summary_lines = [
    "\n5. EXECUTION SUMMARY",
    "=" * 60,
    f"Polars groupby time: {execution_time:.3f} seconds",
    f"Accounts analyzed: {account_total:,}",
    f"Total tweets: {tweet_total:,}",
    "\n Polars groupby analysis complete!",
]
for line in summary_lines:
    print(line)

=== TWITTER DATASET - POLARS GROUPBY ===


Enter path to your Twitter CSV file:  2024_tw_posts_president_scored_anon.csv


 Dataset loaded successfully!
Dataset: 27,304 tweets
Grouping by: source (14 unique accounts)

1. TOP 10 ACCOUNTS BY TOTAL LIKES
Account                             Tweets   Total Likes  Avg Likes 
-------------------------------------------------------------------
Twitter Web App                     14930    99,579,349   6670      
Sprout Social                       2933     53,302,601   18173     
Twitter for iPhone                  8494     31,784,204   3742      
Twitter Media Studio                499      3,691,549    7398      
Periscope                           103      255,786      2483      
TweetDeck Web App                   7        72,523       10360     
Twitter for iPad                    266      59,237       223       
Hootsuite Inc.                      47       17,899       381       
Twitter for Advertisers             7        5,289        756       
Loomly                              10       1,603        160       

2. TOP 10 ACCOUNTS BY TOTAL RETWEETS
Accoun

In [5]:
import csv
import time
from collections import defaultdict

# Twitter Dataset - Simple Pure Python Groupby Analysis
print("=== TWITTER DATASET - PURE PYTHON GROUPBY ===")

# Ask for the CSV location interactively; surrounding whitespace is stripped.
csv_path = input("Enter path to your Twitter CSV file: ").strip()

# Load every row into memory as a dict keyed by the CSV header.
# - newline='' hands newline handling to the csv module, as its documentation
#   requires, so quoted fields that contain line breaks are parsed correctly.
# - 'utf-8-sig' decodes plain UTF-8 and also skips a leading byte-order mark,
#   which would otherwise stay attached to the first column name.
# On failure, report the error and stop with a non-zero status.
# `raise SystemExit(1)` replaces `exit()` because `exit()` reports success
# (status 0) when run as a script, and in an IPython/Jupyter kernel `exit` is
# IPython's own exit command, which asks the kernel to shut down instead of
# merely stopping this cell.
try:
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        twitter_data = list(reader)
except Exception as e:
    print(f"❌ Error loading CSV: {e}")
    raise SystemExit(1) from e
else:
    print("✅ Dataset loaded successfully!")

# The timer starts after the load, so it covers only the analysis that follows.
start_time = time.time()

print(f"Dataset: {len(twitter_data):,} tweets")

# Manual groupby: map each distinct `source` value to the list of its rows.
# Rows that have no 'source' key are collected under 'Unknown'.
source_groups = defaultdict(list)
for tweet in twitter_data:
    source = tweet.get('source', 'Unknown')
    source_groups[source].append(tweet)

print(f"Grouping by: source ({len(source_groups)} unique accounts)")

def safe_int(value):
    """Convert a value to an int, returning 0 when that is not possible.

    Falsy inputs (``None``, ``""``, ``0``) yield 0. Integer-looking input is
    converted directly with ``int()`` so large values keep full precision;
    anything else goes through ``float()`` first and is truncated toward
    zero (``"3.7"`` -> 3). Non-numeric text, NaN and infinities yield 0.

    Args:
        value: The raw value to convert, typically a string read from a CSV.

    Returns:
        The converted integer, or 0 if the value is empty or not numeric.
    """
    try:
        if not value:
            return 0
        try:
            # Direct conversion avoids the float round-trip, which silently
            # loses precision for integers above 2**53.
            return int(value)
        except ValueError:
            # Not an integer literal (e.g. "3.7" or "1e3"): parse as float.
            return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no value"; unrelated
        # exceptions such as KeyboardInterrupt are allowed to propagate.
        return 0

# 1. TOP ACCOUNTS BY TOTAL LIKES
print("\n1. TOP 10 ACCOUNTS BY TOTAL LIKES")
print("=" * 60)

# Per-source tweet count, summed likeCount and mean likeCount.
likes_stats = {}
for account, group in source_groups.items():
    like_values = [safe_int(entry.get('likeCount', 0)) for entry in group]
    like_total = sum(like_values)
    if like_values:
        like_mean = like_total / len(like_values)
    else:
        like_mean = 0
    likes_stats[account] = {
        'tweets': len(group),
        'total_likes': like_total,
        'avg_likes': like_mean,
    }


def _total_likes_of(item):
    """Sort key: the 'total_likes' entry of a (source, stats) pair."""
    return item[1]['total_likes']


# Largest total first.
sorted_likes = sorted(likes_stats.items(), key=_total_likes_of, reverse=True)

print(f"{'Account':<35} {'Tweets':<8} {'Total Likes':<12} {'Avg Likes':<10}")
print("-" * 67)
for account, summary in sorted_likes[:10]:
    print(f"{account[:32]:<35} {summary['tweets']:<8} {summary['total_likes']:<12,.0f} {summary['avg_likes']:<10.0f}")

# 2. TOP ACCOUNTS BY TOTAL RETWEETS
print("\n2. TOP 10 ACCOUNTS BY TOTAL RETWEETS")
print("=" * 60)

# Per-source tweet count, summed retweetCount and mean retweetCount.
retweet_stats = {}
for account, group in source_groups.items():
    retweet_values = [safe_int(entry.get('retweetCount', 0)) for entry in group]
    retweet_total = sum(retweet_values)
    if retweet_values:
        retweet_mean = retweet_total / len(retweet_values)
    else:
        retweet_mean = 0
    retweet_stats[account] = {
        'tweets': len(group),
        'total_retweets': retweet_total,
        'avg_retweets': retweet_mean,
    }


def _total_retweets_of(item):
    """Sort key: the 'total_retweets' entry of a (source, stats) pair."""
    return item[1]['total_retweets']


# Largest total first.
sorted_retweets = sorted(retweet_stats.items(), key=_total_retweets_of, reverse=True)

print(f"{'Account':<35} {'Tweets':<8} {'Total Retweets':<15} {'Avg Retweets':<12}")
print("-" * 72)
for account, summary in sorted_retweets[:10]:
    print(f"{account[:32]:<35} {summary['tweets']:<8} {summary['total_retweets']:<15,.0f} {summary['avg_retweets']:<12.0f}")

# 3. TOP ACCOUNTS BY TWEET COUNT
print("\n3. TOP 10 ACCOUNTS BY TWEET COUNT")
print("=" * 60)

# Size of each source's row list.
tweet_counts = {}
for account, group in source_groups.items():
    tweet_counts[account] = len(group)

# Largest group first.
sorted_counts = list(tweet_counts.items())
sorted_counts.sort(key=lambda pair: pair[1], reverse=True)

print(f"{'Account':<35} {'Tweet Count':<12}")
print("-" * 49)
for account, n_tweets in sorted_counts[:10]:
    print(f"{account[:32]:<35} {n_tweets:<12}")

# 4. MONTHLY ACTIVITY
print("\n4. TOP 5 MONTHS BY TWEET VOLUME")
print("=" * 60)

# Tally rows per month_year value; rows without that key count as 'Unknown'.
month_groups = defaultdict(int)
for entry in twitter_data:
    month_groups[entry.get('month_year', 'Unknown')] += 1

# Busiest month first.
sorted_months = list(month_groups.items())
sorted_months.sort(key=lambda pair: pair[1], reverse=True)

print(f"{'Month':<10} {'Tweets':<8}")
print("-" * 20)
for period, n_tweets in sorted_months[:5]:
    print(f"{period:<10} {n_tweets:<8}")

# Execution summary
# Stop the clock first so the summary lookups below are not part of the timing.
end_time = time.time()
execution_time = end_time - start_time

account_total = len(source_groups)
tweet_total = len(twitter_data)

summary_lines = [
    "\n5. EXECUTION SUMMARY",
    "=" * 60,
    f"Pure Python groupby time: {execution_time:.3f} seconds",
    f"Accounts analyzed: {account_total}",
    f"Total tweets: {tweet_total:,}",
    "\n✅ Pure Python groupby analysis complete!",
]
for line in summary_lines:
    print(line)

=== TWITTER DATASET - PURE PYTHON GROUPBY ===


Enter path to your Twitter CSV file:  2024_tw_posts_president_scored_anon.csv


✅ Dataset loaded successfully!
Dataset: 27,304 tweets
Grouping by: source (14 unique accounts)

1. TOP 10 ACCOUNTS BY TOTAL LIKES
Account                             Tweets   Total Likes  Avg Likes 
-------------------------------------------------------------------
Twitter Web App                     14930    99,579,349   6670      
Sprout Social                       2933     53,302,601   18173     
Twitter for iPhone                  8494     31,784,204   3742      
Twitter Media Studio                499      3,691,549    7398      
Periscope                           103      255,786      2483      
TweetDeck Web App                   7        72,523       10360     
Twitter for iPad                    266      59,237       223       
Hootsuite Inc.                      47       17,899       381       
Twitter for Advertisers             7        5,289        756       
Loomly                              10       1,603        160       

2. TOP 10 ACCOUNTS BY TOTAL RETWEETS
Accou