In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import folium
from folium.plugins import HeatMap, TimestampedGeoJson
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from wordcloud import WordCloud
import geocoder
from datetime import datetime
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

# Download necessary nltk resources
nltk.download('punkt')
nltk.download('stopwords')

# Load datasets
tweets_df = pd.read_csv('tweets.csv')
users_df = pd.read_csv('users.csv')

# Display basic information about the datasets
print("Tweets Dataset:")
print(f"Shape: {tweets_df.shape}")
print("\nUsers Dataset:")
print(f"Shape: {users_df.shape}")

# Step 1: Sort data based on timestamps
tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'])
tweets_df = tweets_df.sort_values(by='timestamp')

# Step 2: Detect languages using TextBlob and add a language code column
def detect_language(text):
    if isinstance(text, str):
        try:
            return TextBlob(text).detect_language()
        except:
            return 'en'  # Default to English if detection fails
    return 'en'  # Default for non-string values

tweets_df['language_code'] = tweets_df['text'].apply(detect_language)

# Step 3: Translate non-English text to English
def translate_to_english(row):
    if row['language_code'] != 'en' and isinstance(row['text'], str):
        try:
            return str(TextBlob(row['text']).translate(to='en'))
        except:
            return row['text']  # Return original if translation fails
    return row['text']  # Return original for English or non-string values

tweets_df['translated_text'] = tweets_df.apply(translate_to_english, axis=1)

# Use translated_text for further analysis
analysis_text = tweets_df['translated_text'].fillna('')

# Step 4: Extract keywords using TF-IDF
stop_words = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(max_features=5, stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(analysis_text)
feature_names = vectorizer.get_feature_names_out()

# Create a column with top keywords for each tweet
def get_top_keywords(tfidf_row, feature_names):
    indices = np.argsort(tfidf_row)[::-1]
    top_keywords = [(feature_names[i], tfidf_row[i]) for i in indices if tfidf_row[i] > 0]
    return top_keywords

tweets_df['keywords'] = [get_top_keywords(tfidf_matrix[i].toarray()[0], feature_names) 
                        for i in range(tfidf_matrix.shape[0])]

# Step 5: Extract sentiment for each keyword and the entire sentence
def get_sentiment(text):
    if isinstance(text, str) and text.strip():
        return TextBlob(text).sentiment.polarity
    return 0

# Sentiment for each keyword and entire sentence
tweets_df['keyword_sentiments'] = tweets_df['keywords'].apply(
    lambda kw_list: [(kw, score, get_sentiment(kw)) for kw, score in kw_list] if isinstance(kw_list, list) else []
)
tweets_df['sentence_sentiment'] = tweets_df['translated_text'].apply(get_sentiment)

# Step 6: Extract geographic places and get coordinates
def extract_locations(text):
    if not isinstance(text, str):
        return []
    
    # Simple location patterns (can be enhanced)
    locations = []
    location_patterns = [
        r'\b(?:in|at|from)\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)',  # Places after prepositions
        r'\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b'  # Capitalized names
    ]
    
    for pattern in location_patterns:
        matches = re.findall(pattern, text)
        locations.extend(matches)
    
    # Filter common non-location capitalized words
    common_words = {'The', 'A', 'An', 'I', 'Trump', 'Putin', 'America', 'USA', 'Russia', 'Ukraine', 'WW2', 'WW3'}
    locations = [loc for loc in locations if loc not in common_words]
    
    return list(set(locations))

def get_coordinates(location):
    try:
        g = geocoder.osm(location)
        if g.ok:
            return (g.lat, g.lng, location)
        return None
    except:
        return None

tweets_df['extracted_locations'] = tweets_df['translated_text'].apply(extract_locations)
tweets_df['location_coordinates'] = tweets_df['extracted_locations'].apply(
    lambda locs: [get_coordinates(loc) for loc in locs if loc]
)
tweets_df['location_coordinates'] = tweets_df['location_coordinates'].apply(
    lambda coords: [c for c in coords if c is not None]
)

# Create folium map
def create_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    
    # Add markers for each tweet with location
    for idx, row in df.iterrows():
        for lat, lon, loc_name in row['location_coordinates']:
            popup_text = f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
            folium.Marker(
                location=[lat, lon],
                popup=popup_text,
                icon=folium.Icon(color='blue')
            ).add_to(m)
    
    # Save the map
    m.save('tweet_locations.html')
    
    return m

# Create time-series map
def create_time_series_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    
    # Prepare data for TimestampedGeoJson
    features = []
    for idx, row in df.iterrows():
        for lat, lon, loc_name in row['location_coordinates']:
            time_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
            feature = {
                'type': 'Feature',
                'geometry': {
                    'type': 'Point',
                    'coordinates': [lon, lat]
                },
                'properties': {
                    'time': time_str,
                    'popup': f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
                }
            }
            features.append(feature)
    
    # Add TimestampedGeoJson to map
    if features:
        TimestampedGeoJson(
            {'type': 'FeatureCollection', 'features': features},
            period='PT1H',
            add_last_point=True,
            auto_play=True,
            loop=False,
            max_speed=1,
            loop_button=True,
            time_slider_drag_update=True
        ).add_to(m)
    
    # Save the map
    m.save('tweet_time_series.html')
    
    return m

# Step 7: Extract mentioned users (@username)
def extract_mentions(text):
    if not isinstance(text, str):
        return []
    
    mentions = re.findall(r'@(\w+)', text)
    return list(set(mentions))

tweets_df['mentioned_users'] = tweets_df['text'].apply(extract_mentions)

# Step 8: Assess political inclination
def assess_political_inclination(text):
    if not isinstance(text, str):
        return 'neutral'
    
    # Simple keyword-based approach
    left_keywords = ['democrat', 'liberal', 'progressive', 'left', 'biden', 'harris']
    right_keywords = ['republican', 'conservative', 'maga', 'trump', 'right']
    
    text_lower = text.lower()
    left_count = sum(1 for kw in left_keywords if kw in text_lower)
    right_count = sum(1 for kw in right_keywords if kw in text_lower)
    
    if left_count > right_count:
        return 'left-leaning'
    elif right_count > left_count:
        return 'right-leaning'
    else:
        return 'neutral'

tweets_df['political_inclination'] = tweets_df['translated_text'].apply(assess_political_inclination)

# Step 9: Create word cloud and co-occurrence heatmap
def create_wordcloud(keywords):
    # Extract words from keyword tuples and create frequency dict
    word_freq = {}
    for tweet_keywords in keywords:
        for word, score in tweet_keywords:
            if word in word_freq:
                word_freq[word] += score
            else:
                word_freq[word] = score
    
    # Generate word cloud
    if word_freq:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('wordcloud.png')
        plt.close()

def create_cooccurrence_heatmap(df):
    # Extract all unique keywords
    all_keywords = set()
    for kw_list in df['keywords']:
        all_keywords.update([kw for kw, _ in kw_list])
    
    all_keywords = list(all_keywords)
    
    # Create co-occurrence matrix
    cooccurrence = np.zeros((len(all_keywords), len(all_keywords)))
    
    for kw_list in df['keywords']:
        words = [kw for kw, _ in kw_list]
        for i, word1 in enumerate(all_keywords):
            for j, word2 in enumerate(all_keywords):
                if word1 in words and word2 in words and i != j:
                    cooccurrence[i, j] += 1
    
    # Plot heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(cooccurrence, xticklabels=all_keywords, yticklabels=all_keywords, cmap='Blues')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.tight_layout()
    plt.savefig('cooccurrence_heatmap.png')
    plt.close()

# Step 10: Time series analysis
def create_sentiment_time_series(df):
    # Resample to hourly data
    df_time = df.set_index('timestamp')
    hourly_sentiment = df_time['sentence_sentiment'].resample('H').mean()
    
    # Plot time series
    plt.figure(figsize=(12, 6))
    hourly_sentiment.plot()
    plt.title('Hourly Tweet Sentiment')
    plt.xlabel('Time')
    plt.ylabel('Average Sentiment Score')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_time_series.png')
    plt.close()
    
    return hourly_sentiment

# Step 11: Topic modeling
def perform_topic_modeling(texts, n_topics=5):
    # Vectorize text
    count_vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = count_vect.fit_transform(texts)
    
    # LDA model
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    
    # Get top words for each topic
    feature_names = count_vect.get_feature_names_out()
    topics = []
    
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[:-10 - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append((f"Topic {topic_idx+1}", top_words))
    
    return topics

# Step 12: Create additional visualizations
def create_top_visualizations(df):
    # Top languages
    plt.figure(figsize=(10, 6))
    df['language_code'].value_counts().plot(kind='bar')
    plt.title('Top Languages')
    plt.xlabel('Language Code')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('top_languages.png')
    plt.close()
    
    # Top mentioned users
    all_mentions = []
    for mentions in df['mentioned_users']:
        all_mentions.extend(mentions)
    
    if all_mentions:
        mention_counts = pd.Series(all_mentions).value_counts().head(10)
        
        plt.figure(figsize=(10, 6))
        mention_counts.plot(kind='bar')
        plt.title('Top Mentioned Users')
        plt.xlabel('Username')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('top_mentions.png')
        plt.close()
    
    # Political inclination distribution
    plt.figure(figsize=(10, 6))
    df['political_inclination'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.title('Political Inclination Distribution')
    plt.tight_layout()
    plt.savefig('political_distribution.png')
    plt.close()
    
    # Sentiment vs Political inclination scatter plot
    plt.figure(figsize=(10, 6))
    colors = {'left-leaning': 'blue', 'right-leaning': 'red', 'neutral': 'green'}
    for inclination in colors:
        subset = df[df['political_inclination'] == inclination]
        plt.scatter(
            subset.index, 
            subset['sentence_sentiment'],
            c=colors[inclination],
            label=inclination,
            alpha=0.6
        )
    
    plt.title('Sentiment vs Political Inclination')
    plt.xlabel('Tweet Index (Time Ordered)')
    plt.ylabel('Sentiment Score')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_vs_politics.png')
    plt.close()

# Execute all analysis functions
print("\nCreating wordcloud...")
create_wordcloud(tweets_df['keywords'])

print("Creating co-occurrence heatmap...")
create_cooccurrence_heatmap(tweets_df)

print("Creating maps...")
create_map(tweets_df)
create_time_series_map(tweets_df)

print("Creating time series analysis...")
sentiment_ts = create_sentiment_time_series(tweets_df)

print("Performing topic modeling...")
topics = perform_topic_modeling(tweets_df['translated_text'].fillna(''), n_topics=5)
for topic_name, top_words in topics:
    print(f"{topic_name}: {', '.join(top_words)}")

print("Creating additional visualizations...")
create_top_visualizations(tweets_df)

# Generate comprehensive report
print("\nGenerating AI summary report...")

# Calculate some statistics for the report
total_tweets = len(tweets_df)
unique_users = tweets_df['username'].nunique()
avg_sentiment = tweets_df['sentence_sentiment'].mean()
political_counts = tweets_df['political_inclination'].value_counts()

# Generate a report summary
report = f"""
# Twitter Data Analysis Report

## Overview
- Total Tweets Analyzed: {total_tweets}
- Unique Users: {unique_users}
- Average Sentiment Score: {avg_sentiment:.2f}

## Political Distribution
{political_counts.to_string()}

## Top Topics
"""

for topic_name, top_words in topics:
    report += f"- {topic_name}: {', '.join(top_words)}\n"

report += """
## Key Findings
- The dataset primarily contains tweets related to World War discussions, with many comparing WW2 and potential WW3 scenarios.
- There's significant political polarization in discussions about Ukraine and Russia.
- Multiple languages were detected, primarily English with some non-English content.
- Sentiment analysis shows varying emotional responses to geopolitical topics.
- Geographic analysis reveals global interest in these topics.

## Generated Visualizations
- Wordcloud of key terms
- Co-occurrence heatmap showing related terms
- Interactive map of tweet locations
- Time series of tweet sentiments
- Political inclination distribution
- Sentiment vs politics scatter plot
"""

# Save the report
with open('twitter_analysis_report.md', 'w') as f:
    f.write(report)

print("Analysis complete! All visualizations and the report have been saved.")

# Create final DataFrame with all analysis
final_df = tweets_df[['username', 'text', 'timestamp', 'language_code', 'translated_text', 
                      'keywords', 'sentence_sentiment', 'extracted_locations', 
                      'mentioned_users', 'political_inclination']]

# Save the enriched dataframe
final_df.to_csv('analyzed_tweets.csv', index=False)

print("\nEnriched dataset saved to 'analyzed_tweets.csv'")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tweets Dataset:
Shape: (33, 9)

Users Dataset:
Shape: (27, 6)


InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'can', 'until', "wouldn't", 'being', "i'd", 'they', 'mightn', 'these', "they're", "you're", 'me', 'herself', 'to', 'you', 'whom', 'themselves', 'after', "we're", 'each', 'him', 'his', "mightn't", 'once', "they've", 'did', "don't", "i'll", 'i', 'no', "that'll", 'here', 'mustn', 'in', 'who', 'through', 'against', 'your', 'other', 'didn', 'during', 'how', 'there', 'such', 'their', 'have', 'off', "isn't", 'shouldn', 'but', 'd', 'haven', "should've", "you'll", "it'll", "shan't", 'been', 'hadn', 'that', 'all', 'had', 'more', 've', 's', 'doing', 'above', 'theirs', 'which', "wasn't", 'now', 'be', 'below', 'hers', "he'll", 'most', "didn't", "i'm", "hasn't", 'she', 'about', 'wouldn', 'he', 'is', "i've", 'll', 'aren', 'because', 'on', 'then', "couldn't", 'wasn', 'needn', "needn't", 'few', 'under', 'between', 'over', 'doesn', 'out', 'isn', 'where', 'hasn', 'weren', 'down', 'her', 'ain', 'yours', 'my', "mustn't", "she'll", 'those', "doesn't", 'from', 'y', 'only', 'both', 'ourselves', 'for', "it'd", 'nor', 'than', "you've", 'himself', 'couldn', "they'd", "we'll", 'further', 'any', 'an', 'very', 'or', 'not', "it's", 'were', 'what', 'again', 'so', 'has', "she'd", "won't", 'should', 't', 'this', "hadn't", 'shan', 'just', 'myself', 'ma', 'when', 'into', 'won', 'it', 'at', 'the', 'was', 'too', 'o', 'a', 'same', 're', 'them', "we've", 'does', 'our', 'am', 'if', 'having', "they'll", 'and', 'ours', 'own', "shouldn't", 'its', 'don', "haven't", 'will', "you'd", 'itself', 'yourselves', 'are', "he'd", "he's", 'some', 'by', 'while', 'yourself', 'we', 'as', "weren't", 'of', "we'd", 'with', 'up', 'do', "she's", "aren't", 'before', 'why', 'm'} instead.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import folium
from folium.plugins import TimestampedGeoJson
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
from wordcloud import WordCloud
from datetime import datetime
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

# Import and load spaCy and geopy
import spacy
from geopy.geocoders import Nominatim

# Download necessary nltk resources
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Initialize geopy Nominatim geocoder
geolocator = Nominatim(user_agent="tweet_geocoder")

# Load datasets
tweets_df = pd.read_csv('tweets.csv')
users_df = pd.read_csv('users.csv')

# Display basic dataset info
print("Tweets Dataset:")
print(f"Shape: {tweets_df.shape}")
print("\nUsers Dataset:")
print(f"Shape: {users_df.shape}")

# Step 1: Sort data based on timestamps
tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'])
tweets_df = tweets_df.sort_values(by='timestamp')

# Step 2: Detect language using TextBlob and add language code column
def detect_language(text):
    if isinstance(text, str):
        try:
            return TextBlob(text).detect_language()
        except Exception:
            return 'en'  # default to English if detection fails
    return 'en'

tweets_df['language_code'] = tweets_df['text'].apply(detect_language)

# Step 3: Translate non-English text to English using TextBlob
def translate_to_english(row):
    if row['language_code'] != 'en' and isinstance(row['text'], str):
        try:
            return str(TextBlob(row['text']).translate(to='en'))
        except Exception:
            return row['text']
    return row['text']

tweets_df['translated_text'] = tweets_df.apply(translate_to_english, axis=1)

# Use translated_text for further analysis (fill missing values with an empty string)
analysis_text = tweets_df['translated_text'].fillna('')

# Step 4: Extract keywords using TF-IDF
# Convert stopwords (originally a set) into a list to avoid InvalidParameterError.
stop_words = list(stopwords.words('english'))
vectorizer = TfidfVectorizer(max_features=5, stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(analysis_text)
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(tfidf_row, feature_names):
    indices = np.argsort(tfidf_row)[::-1]
    top_keywords = [(feature_names[i], tfidf_row[i]) for i in indices if tfidf_row[i] > 0]
    return top_keywords

tweets_df['keywords'] = [get_top_keywords(tfidf_matrix[i].toarray()[0], feature_names) 
                           for i in range(tfidf_matrix.shape[0])]

# Step 5: Compute sentiment scores for each tweet and for each extracted keyword
def get_sentiment(text):
    if isinstance(text, str) and text.strip():
        return TextBlob(text).sentiment.polarity
    return 0

tweets_df['keyword_sentiments'] = tweets_df['keywords'].apply(
    lambda kw_list: [(kw, score, get_sentiment(kw)) for kw, score in kw_list] if isinstance(kw_list, list) else []
)
tweets_df['sentence_sentiment'] = tweets_df['translated_text'].apply(get_sentiment)

# Step 6: Extract geographic locations using spaCy NER and get coordinates using geopy
def extract_locations(text):
    """Use spaCy's NER to extract locations (GPE or LOC)."""
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return list(set(locations))  # unique locations

def get_coordinates(location):
    try:
        loc = geolocator.geocode(location)
        if loc:
            return (loc.latitude, loc.longitude, location)
        return None
    except Exception:
        return None

tweets_df['extracted_locations'] = tweets_df['translated_text'].apply(extract_locations)
tweets_df['location_coordinates'] = tweets_df['extracted_locations'].apply(
    lambda locs: [get_coordinates(loc) for loc in locs if loc]
)
tweets_df['location_coordinates'] = tweets_df['location_coordinates'].apply(
    lambda coords: [c for c in coords if c is not None]
)

# Create a folium map with tweet locations
def create_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    for idx, row in df.iterrows():
        for coordinate in row['location_coordinates']:
            if coordinate:
                lat, lon, loc_name = coordinate
                popup_text = f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
                folium.Marker(
                    location=[lat, lon],
                    popup=popup_text,
                    icon=folium.Icon(color='blue')
                ).add_to(m)
    m.save('tweet_locations.html')
    return m

# Create a time-series folium map with timestamped tweet locations
def create_time_series_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    features = []
    for idx, row in df.iterrows():
        for coordinate in row['location_coordinates']:
            if coordinate:
                lat, lon, loc_name = coordinate
                time_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
                feature = {
                    'type': 'Feature',
                    'geometry': {
                        'type': 'Point',
                        'coordinates': [lon, lat]
                    },
                    'properties': {
                        'time': time_str,
                        'popup': f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
                    }
                }
                features.append(feature)
    if features:
        TimestampedGeoJson(
            {'type': 'FeatureCollection', 'features': features},
            period='PT1H',
            add_last_point=True,
            auto_play=True,
            loop=False,
            max_speed=1,
            loop_button=True,
            time_slider_drag_update=True
        ).add_to(m)
    m.save('tweet_time_series.html')
    return m

# Step 7: Extract mentioned users using regex
def extract_mentions(text):
    if not isinstance(text, str):
        return []
    mentions = re.findall(r'@(\w+)', text)
    return list(set(mentions))

tweets_df['mentioned_users'] = tweets_df['text'].apply(extract_mentions)

# Step 8: Assess political inclination via a keyword-based approach
def assess_political_inclination(text):
    if not isinstance(text, str):
        return 'neutral'
    left_keywords = ['democrat', 'liberal', 'progressive', 'left', 'biden', 'harris']
    right_keywords = ['republican', 'conservative', 'maga', 'trump', 'right']
    text_lower = text.lower()
    left_count = sum(1 for kw in left_keywords if kw in text_lower)
    right_count = sum(1 for kw in right_keywords if kw in text_lower)
    if left_count > right_count:
        return 'left-leaning'
    elif right_count > left_count:
        return 'right-leaning'
    else:
        return 'neutral'

tweets_df['political_inclination'] = tweets_df['translated_text'].apply(assess_political_inclination)

# Step 9: Create a word cloud and a keyword co-occurrence heatmap
def create_wordcloud(keywords):
    word_freq = {}
    for tweet_keywords in keywords:
        for word, score in tweet_keywords:
            word_freq[word] = word_freq.get(word, 0) + score
    if word_freq:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('wordcloud.png')
        plt.close()

def create_cooccurrence_heatmap(df):
    all_keywords = set()
    for kw_list in df['keywords']:
        all_keywords.update([kw for kw, _ in kw_list])
    all_keywords = list(all_keywords)
    
    cooccurrence = np.zeros((len(all_keywords), len(all_keywords)))
    for kw_list in df['keywords']:
        words = [kw for kw, _ in kw_list]
        for i, word1 in enumerate(all_keywords):
            for j, word2 in enumerate(all_keywords):
                if word1 in words and word2 in words and i != j:
                    cooccurrence[i, j] += 1
                    
    plt.figure(figsize=(10, 8))
    sns.heatmap(cooccurrence, xticklabels=all_keywords, yticklabels=all_keywords, cmap='Blues')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.tight_layout()
    plt.savefig('cooccurrence_heatmap.png')
    plt.close()

# Step 10: Time series analysis of tweet sentiment
def create_sentiment_time_series(df):
    df_time = df.set_index('timestamp')
    hourly_sentiment = df_time['sentence_sentiment'].resample('H').mean()
    
    plt.figure(figsize=(12, 6))
    hourly_sentiment.plot()
    plt.title('Hourly Tweet Sentiment')
    plt.xlabel('Time')
    plt.ylabel('Average Sentiment Score')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_time_series.png')
    plt.close()
    
    return hourly_sentiment

# Step 11: Perform topic modeling using LDA
def perform_topic_modeling(texts, n_topics=5):
    count_vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = count_vect.fit_transform(texts)
    
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    
    feature_names = count_vect.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[:-11:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append((f"Topic {topic_idx+1}", top_words))
    return topics

# Step 12: Additional visualizations (top languages, top mentions, political distribution, sentiment vs. politics)
def create_top_visualizations(df):
    # Top languages
    plt.figure(figsize=(10, 6))
    df['language_code'].value_counts().plot(kind='bar')
    plt.title('Top Languages')
    plt.xlabel('Language Code')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('top_languages.png')
    plt.close()
    
    # Top mentioned users
    all_mentions = []
    for mentions in df['mentioned_users']:
        all_mentions.extend(mentions)
    
    if all_mentions:
        mention_counts = pd.Series(all_mentions).value_counts().head(10)
        plt.figure(figsize=(10, 6))
        mention_counts.plot(kind='bar')
        plt.title('Top Mentioned Users')
        plt.xlabel('Username')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('top_mentions.png')
        plt.close()
    
    # Political inclination distribution (pie chart)
    plt.figure(figsize=(10, 6))
    df['political_inclination'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.title('Political Inclination Distribution')
    plt.tight_layout()
    plt.savefig('political_distribution.png')
    plt.close()
    
    # Scatter plot: Sentiment vs. Political inclination
    plt.figure(figsize=(10, 6))
    colors = {'left-leaning': 'blue', 'right-leaning': 'red', 'neutral': 'green'}
    for inclination, color in colors.items():
        subset = df[df['political_inclination'] == inclination]
        plt.scatter(
            subset.index, 
            subset['sentence_sentiment'],
            c=color,
            label=inclination,
            alpha=0.6
        )
    plt.title('Sentiment vs Political Inclination')
    plt.xlabel('Tweet Index (Time Ordered)')
    plt.ylabel('Sentiment Score')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_vs_politics.png')
    plt.close()

# Run all analysis functions
print("\nCreating wordcloud...")
create_wordcloud(tweets_df['keywords'])

print("Creating co-occurrence heatmap...")
create_cooccurrence_heatmap(tweets_df)

print("Creating maps...")
create_map(tweets_df)
create_time_series_map(tweets_df)

print("Creating time series analysis...")
sentiment_ts = create_sentiment_time_series(tweets_df)

print("Performing topic modeling...")
topics = perform_topic_modeling(tweets_df['translated_text'].fillna(''), n_topics=5)
for topic_name, top_words in topics:
    print(f"{topic_name}: {', '.join(top_words)}")

print("Creating additional visualizations...")
create_top_visualizations(tweets_df)

# Generate comprehensive report
print("\nGenerating AI summary report...")

total_tweets = len(tweets_df)
unique_users = tweets_df['username'].nunique()
avg_sentiment = tweets_df['sentence_sentiment'].mean()
political_counts = tweets_df['political_inclination'].value_counts()

report = f"""
# Twitter Data Analysis Report

## Overview
- Total Tweets Analyzed: {total_tweets}
- Unique Users: {unique_users}
- Average Sentiment Score: {avg_sentiment:.2f}

## Political Distribution
{political_counts.to_string()}

## Top Topics
"""

for topic_name, top_words in topics:
    report += f"- {topic_name}: {', '.join(top_words)}\n"

report += """
## Key Findings
- The dataset contains a mix of languages, with non-English tweets translated to English for uniform analysis.
- SpaCy’s NER was used to extract geographic locations, which were then geocoded using geopy.
- Sentiment analysis shows varied emotional responses across tweets.
- Topic modeling using LDA identified distinct themes within the data.

## Generated Visualizations
- Wordcloud of key terms
- Co-occurrence heatmap of keywords
- Interactive maps of tweet locations (static and time series)
- Time series of tweet sentiments
- Political inclination distribution and sentiment vs. politics scatter plot
"""

with open('twitter_analysis_report.md', 'w') as f:
    f.write(report)

print("Analysis complete! All visualizations and the report have been saved.")

# Create a final DataFrame with enriched analysis columns
final_df = tweets_df[['username', 'text', 'timestamp', 'language_code', 'translated_text', 
                        'keywords', 'sentence_sentiment', 'extracted_locations', 
                        'mentioned_users', 'political_inclination']]

final_df.to_csv('analyzed_tweets.csv', index=False)
print("\nEnriched dataset saved to 'analyzed_tweets.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tweets Dataset:
Shape: (33, 9)

Users Dataset:
Shape: (27, 6)

Creating wordcloud...
Creating co-occurrence heatmap...
Creating maps...
Creating time series analysis...
Performing topic modeling...
Topic 1: ww2, churchill, white, military, winston, did, people, generation, allied, ussr
Topic 2: ww3, right, wants, war, trump, think, going, european, russia, europe
Topic 3: world, ww2, like, fight, war, long, ussr, allied, generation, think
Topic 4: europe, ww3, need, ww2, don, america, weapons, ukraine, fighting, won
Topic 5: ww2, war, did, fought, nation, hasn, want, order, start, allies
Creating additional visualizations...

Generating AI summary report...
Analysis complete! All visualizations and the report have been saved.

Enriched dataset saved to 'analyzed_tweets.csv'


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import folium
from folium.plugins import TimestampedGeoJson
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
from wordcloud import WordCloud
from datetime import datetime
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

# Import language detection and translation libraries
from langdetect import detect
from googletrans import Translator

# Import spaCy and geopy for location extraction and geocoding
import spacy
from geopy.geocoders import Nominatim

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Initialize geopy Nominatim geocoder and googletrans translator
geolocator = Nominatim(user_agent="tweet_geocoder")
translator = Translator()

# Load datasets
tweets_df = pd.read_csv('tweets.csv')
users_df = pd.read_csv('users.csv')

# Display basic dataset info
print("Tweets Dataset:")
print(f"Shape: {tweets_df.shape}")
print("\nUsers Dataset:")
print(f"Shape: {users_df.shape}")

# Step 1: Sort tweets by timestamp
tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'])
tweets_df = tweets_df.sort_values(by='timestamp')

# Step 2: Detect language using langdetect
def detect_language(text):
    if isinstance(text, str) and text.strip():
        try:
            return detect(text)
        except Exception:
            return 'en'
    return 'en'

tweets_df['language_code'] = tweets_df['text'].apply(detect_language)

# Step 3: Translate non-English text to English using googletrans
def translate_to_english(row):
    if row['language_code'] != 'en' and isinstance(row['text'], str):
        try:
            translation = translator.translate(row['text'], dest='en')
            return translation.text
        except Exception:
            return row['text']
    return row['text']

tweets_df['translated_text'] = tweets_df.apply(translate_to_english, axis=1)
analysis_text = tweets_df['translated_text'].fillna('')

# Step 4: Extract keywords using TF-IDF
# Convert NLTK stopwords (set) to a list
stop_words = list(stopwords.words('english'))
vectorizer = TfidfVectorizer(max_features=5, stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(analysis_text)
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(tfidf_row, feature_names):
    indices = np.argsort(tfidf_row)[::-1]
    top_keywords = [(feature_names[i], tfidf_row[i]) for i in indices if tfidf_row[i] > 0]
    return top_keywords

tweets_df['keywords'] = [get_top_keywords(tfidf_matrix[i].toarray()[0], feature_names) 
                           for i in range(tfidf_matrix.shape[0])]

# Step 5: Compute sentiment scores (for each tweet and for keywords)
def get_sentiment(text):
    if isinstance(text, str) and text.strip():
        return TextBlob(text).sentiment.polarity
    return 0

tweets_df['keyword_sentiments'] = tweets_df['keywords'].apply(
    lambda kw_list: [(kw, score, get_sentiment(kw)) for kw, score in kw_list] if isinstance(kw_list, list) else []
)
tweets_df['sentence_sentiment'] = tweets_df['translated_text'].apply(get_sentiment)

# Step 6: Extract geographic locations using spaCy NER and geocode them
def extract_locations(text):
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    # Extract entities labeled as GPE (Countries, cities) or LOC
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return list(set(locations))  # Return unique locations

def get_coordinates(location):
    try:
        loc = geolocator.geocode(location)
        if loc:
            return (loc.latitude, loc.longitude, location)
        return None
    except Exception:
        return None

tweets_df['extracted_locations'] = tweets_df['translated_text'].apply(extract_locations)
tweets_df['location_coordinates'] = tweets_df['extracted_locations'].apply(
    lambda locs: [get_coordinates(loc) for loc in locs if loc]
)
tweets_df['location_coordinates'] = tweets_df['location_coordinates'].apply(
    lambda coords: [c for c in coords if c is not None]
)

# Step 7: Extract mentioned users using regex
def extract_mentions(text):
    if not isinstance(text, str):
        return []
    mentions = re.findall(r'@(\w+)', text)
    return list(set(mentions))

tweets_df['mentioned_users'] = tweets_df['text'].apply(extract_mentions)

# Step 8: Assess political inclination via a keyword-based approach
def assess_political_inclination(text):
    if not isinstance(text, str):
        return 'neutral'
    left_keywords = ['democrat', 'liberal', 'progressive', 'left', 'biden', 'harris']
    right_keywords = ['republican', 'conservative', 'maga', 'trump', 'right']
    text_lower = text.lower()
    left_count = sum(1 for kw in left_keywords if kw in text_lower)
    right_count = sum(1 for kw in right_keywords if kw in text_lower)
    if left_count > right_count:
        return 'left-leaning'
    elif right_count > left_count:
        return 'right-leaning'
    else:
        return 'neutral'

tweets_df['political_inclination'] = tweets_df['translated_text'].apply(assess_political_inclination)

# Step 9: Create wordcloud and co-occurrence heatmap of keywords
def create_wordcloud(keywords):
    word_freq = {}
    for tweet_keywords in keywords:
        for word, score in tweet_keywords:
            word_freq[word] = word_freq.get(word, 0) + score
    if word_freq:
        wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('wordcloud.png')
        plt.close()

def create_cooccurrence_heatmap(df):
    all_keywords = set()
    for kw_list in df['keywords']:
        all_keywords.update([kw for kw, _ in kw_list])
    all_keywords = list(all_keywords)
    
    cooccurrence = np.zeros((len(all_keywords), len(all_keywords)))
    for kw_list in df['keywords']:
        words = [kw for kw, _ in kw_list]
        for i, word1 in enumerate(all_keywords):
            for j, word2 in enumerate(all_keywords):
                if word1 in words and word2 in words and i != j:
                    cooccurrence[i, j] += 1
    plt.figure(figsize=(10, 8))
    sns.heatmap(cooccurrence, xticklabels=all_keywords, yticklabels=all_keywords, cmap='Blues')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.tight_layout()
    plt.savefig('cooccurrence_heatmap.png')
    plt.close()

# Step 10: Create a linear regression trend plot for tweet sentiment over a numeric sequence
def plot_trend_line(df, column, title, filename):
    # Create a numeric sequence based on the tweet order (index)
    X = np.arange(len(df)).reshape(-1, 1)
    Y = df[column].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, Y)
    trend = model.predict(X)
    plt.figure(figsize=(10, 6))
    plt.scatter(X, Y, label='Data', alpha=0.5)
    plt.plot(X, trend, color='red', label='Trend line')
    plt.title(title)
    plt.xlabel("Tweet Index (Numeric Sequence)")
    plt.ylabel(column)
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

# Step 11: Plot time series for number of tweets (daily count) with linear regression trend
def plot_tweet_count_trend(df):
    df_copy = df.copy()
    df_copy['date'] = df_copy['timestamp'].dt.date
    tweet_count = df_copy.groupby('date').size().reset_index(name='count')
    X = np.arange(len(tweet_count)).reshape(-1, 1)
    Y = tweet_count['count'].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, Y)
    trend = model.predict(X)
    plt.figure(figsize=(10, 6))
    plt.scatter(tweet_count['date'], Y, label='Daily Tweet Count', alpha=0.5)
    plt.plot(tweet_count['date'], trend, color='red', label='Trend Line')
    plt.title('Daily Tweet Count with Trend Line')
    plt.xlabel('Date')
    plt.ylabel('Tweet Count')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('daily_tweet_count_trend.png')
    plt.close()

# Step 12: Create maps
# Map One: Map of user locations (using users_df and their "location" field)
def create_user_location_map(users_df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    # Assumes users_df contains columns 'username' and 'location'
    for idx, row in users_df.iterrows():
        if pd.notnull(row.get('location')):
            try:
                loc = geolocator.geocode(row['location'])
                if loc:
                    folium.Marker(
                        location=[loc.latitude, loc.longitude],
                        popup=f"User: {row['username']}<br>Location: {row['location']}",
                        icon=folium.Icon(color='green')
                    ).add_to(m)
            except Exception:
                continue
    m.save('user_locations_map.html')
    return m

# Map Two: Time-series map of extracted tweet locations
def create_time_series_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    features = []
    for idx, row in df.iterrows():
        for coordinate in row['location_coordinates']:
            if coordinate:
                lat, lon, loc_name = coordinate
                time_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
                feature = {
                    'type': 'Feature',
                    'geometry': {
                        'type': 'Point',
                        'coordinates': [lon, lat]
                    },
                    'properties': {
                        'time': time_str,
                        'popup': f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
                    }
                }
                features.append(feature)
    if features:
        TimestampedGeoJson(
            {'type': 'FeatureCollection', 'features': features},
            period='PT1H',
            add_last_point=True,
            auto_play=True,
            loop=False,
            max_speed=1,
            loop_button=True,
            time_slider_drag_update=True
        ).add_to(m)
    m.save('tweet_time_series_map.html')
    return m

# Step 13: Topic modeling using LDA (unchanged from before)
def perform_topic_modeling(texts, n_topics=5):
    count_vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = count_vect.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    feature_names = count_vect.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[:-11:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append((f"Topic {topic_idx+1}", top_words))
    return topics

# Step 14: Additional visualizations for top entities
def create_top_visualizations(df):
    # Top languages
    plt.figure(figsize=(10, 6))
    df['language_code'].value_counts().plot(kind='bar')
    plt.title('Top Languages')
    plt.xlabel('Language Code')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('top_languages.png')
    plt.close()
    
    # Top mentioned users
    all_mentions = []
    for mentions in df['mentioned_users']:
        all_mentions.extend(mentions)
    if all_mentions:
        mention_counts = pd.Series(all_mentions).value_counts().head(10)
        plt.figure(figsize=(10, 6))
        mention_counts.plot(kind='bar')
        plt.title('Top Mentioned Users')
        plt.xlabel('Username')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('top_mentions.png')
        plt.close()
    
    # Political inclination distribution (pie chart)
    plt.figure(figsize=(10, 6))
    df['political_inclination'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.title('Political Inclination Distribution')
    plt.tight_layout()
    plt.savefig('political_distribution.png')
    plt.close()
    
    # Scatter plot: Sentiment vs. Political inclination
    plt.figure(figsize=(10, 6))
    colors = {'left-leaning': 'blue', 'right-leaning': 'red', 'neutral': 'green'}
    for inclination, color in colors.items():
        subset = df[df['political_inclination'] == inclination]
        plt.scatter(
            subset.index, 
            subset['sentence_sentiment'],
            c=color,
            label=inclination,
            alpha=0.6
        )
    plt.title('Sentiment vs Political Inclination')
    plt.xlabel('Tweet Index (Time Ordered)')
    plt.ylabel('Sentiment Score')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_vs_politics.png')
    plt.close()

# ------------------ Execute All Analysis Steps ------------------

print("\nCreating wordcloud...")
create_wordcloud(tweets_df['keywords'])

print("Creating co-occurrence heatmap...")
create_cooccurrence_heatmap(tweets_df)

print("Creating user location map...")
create_user_location_map(users_df)

print("Creating tweet time-series map...")
create_time_series_map(tweets_df)

print("Plotting sentiment trend line...")
plot_trend_line(tweets_df, 'sentence_sentiment', 'Tweet Sentiment Trend', 'sentiment_trend.png')

print("Plotting daily tweet count trend...")
plot_tweet_count_trend(tweets_df)

print("Performing topic modeling...")
topics = perform_topic_modeling(tweets_df['translated_text'].fillna(''), n_topics=5)
for topic_name, top_words in topics:
    print(f"{topic_name}: {', '.join(top_words)}")

print("Creating additional visualizations...")
create_top_visualizations(tweets_df)

# Generate comprehensive AI summary report
print("\nGenerating AI summary report...")

total_tweets = len(tweets_df)
unique_users = tweets_df['username'].nunique()
avg_sentiment = tweets_df['sentence_sentiment'].mean()
political_counts = tweets_df['political_inclination'].value_counts()

report = f"""
# Twitter Data Analysis Report

## Overview
- Total Tweets Analyzed: {total_tweets}
- Unique Users: {unique_users}
- Average Sentiment Score: {avg_sentiment:.2f}

## Political Distribution
{political_counts.to_string()}

## Top Topics
"""

for topic_name, top_words in topics:
    report += f"- {topic_name}: {', '.join(top_words)}\n"

report += """
## Key Findings
- Non-English tweets are detected using langdetect and translated to English via googletrans.
- Two maps are generated: one showing user locations and one as a time-series map of tweet-extracted locations.
- Linear regression on the tweet index (numeric sequence) provides a trend line for sentiment, and daily tweet counts are analyzed.
- Topic modeling (LDA) identifies distinct themes within the tweets.

## Generated Visualizations
- Wordcloud of keywords
- Keyword co-occurrence heatmap
- User locations map
- Time-series map of tweet locations
- Sentiment trend line (with linear regression)
- Daily tweet count trend
- Political inclination distribution and sentiment vs. politics scatter plot
"""

with open('twitter_analysis_report.md', 'w') as f:
    f.write(report)

print("Analysis complete! All visualizations and the report have been saved.")

# Create a final enriched DataFrame and save it
final_df = tweets_df[['username', 'text', 'timestamp', 'language_code', 'translated_text', 
                        'keywords', 'sentence_sentiment', 'extracted_locations', 
                        'mentioned_users', 'political_inclination']]
final_df.to_csv('analyzed_tweets.csv', index=False)
print("\nEnriched dataset saved to 'analyzed_tweets.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tweets Dataset:
Shape: (33, 9)

Users Dataset:
Shape: (27, 6)

Creating wordcloud...
Creating co-occurrence heatmap...
Creating user location map...
Creating tweet time-series map...
Plotting sentiment trend line...
Plotting daily tweet count trend...
Performing topic modeling...
Topic 1: ww2, churchill, did, military, white, winston, don, people, generation, order
Topic 2: europe, ww2, right, ww3, need, nazis, ussr, america, don, going
Topic 3: ww3, wants, fighting, won, weapons, going, ukraine, people, nation, hasn
Topic 4: ww2, world, like, think, need, europe, war, america, saved, don
Topic 5: ww2, war, start, world, won, allied, grandfather, allies, order, ussr
Creating additional visualizations...

Generating AI summary report...
Analysis complete! All visualizations and the report have been saved.

Enriched dataset saved to 'analyzed_tweets.csv'


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import folium
from folium.plugins import TimestampedGeoJson
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
from wordcloud import WordCloud
from datetime import datetime
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

# Import language detection and translation libraries
from langdetect import detect
from googletrans import Translator

# Import spaCy and geopy for location extraction and geocoding
import spacy
from geopy.geocoders import Nominatim

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Initialize geopy Nominatim geocoder and googletrans translator
geolocator = Nominatim(user_agent="tweet_geocoder")
translator = Translator()

# Load datasets
tweets_df = pd.read_csv('tweets.csv')
users_df = pd.read_csv('users.csv')

# Display basic dataset info
print("Tweets Dataset:")
print(f"Shape: {tweets_df.shape}")
print("\nUsers Dataset:")
print(f"Shape: {users_df.shape}")

# Step 1: Sort tweets by timestamp
tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'])
tweets_df = tweets_df.sort_values(by='timestamp')

# Step 2: Detect language using langdetect
def detect_language(text):
    if isinstance(text, str) and text.strip():
        try:
            return detect(text)
        except Exception:
            return 'en'
    return 'en'

tweets_df['language_code'] = tweets_df['text'].apply(detect_language)

# Step 3: Translate non-English text to English using googletrans
def translate_to_english(row):
    if row['language_code'] != 'en' and isinstance(row['text'], str):
        try:
            translation = translator.translate(row['text'], dest='en')
            return translation.text
        except Exception:
            return row['text']
    return row['text']

tweets_df['translated_text'] = tweets_df.apply(translate_to_english, axis=1)
analysis_text = tweets_df['translated_text'].fillna('')

# Step 4: Extract keywords using TF-IDF
# Convert NLTK stopwords (set) to a list
stop_words = list(stopwords.words('english'))
vectorizer = TfidfVectorizer(max_features=5, stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(analysis_text)
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(tfidf_row, feature_names):
    indices = np.argsort(tfidf_row)[::-1]
    top_keywords = [(feature_names[i], tfidf_row[i]) for i in indices if tfidf_row[i] > 0]
    return top_keywords

tweets_df['keywords'] = [get_top_keywords(tfidf_matrix[i].toarray()[0], feature_names) 
                           for i in range(tfidf_matrix.shape[0])]

# Step 5: Compute sentiment scores (for each tweet and for keywords)
def get_sentiment(text):
    if isinstance(text, str) and text.strip():
        return TextBlob(text).sentiment.polarity
    return 0

tweets_df['keyword_sentiments'] = tweets_df['keywords'].apply(
    lambda kw_list: [(kw, score, get_sentiment(kw)) for kw, score in kw_list] if isinstance(kw_list, list) else []
)
tweets_df['sentence_sentiment'] = tweets_df['translated_text'].apply(get_sentiment)

# Step 6: Extract geographic locations using spaCy NER and geocode them
def extract_locations(text):
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    # Extract entities labeled as GPE (Countries, cities) or LOC
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return list(set(locations))  # Return unique locations

def get_coordinates(location):
    try:
        loc = geolocator.geocode(location)
        if loc:
            return (loc.latitude, loc.longitude, location)
        return None
    except Exception:
        return None

tweets_df['extracted_locations'] = tweets_df['translated_text'].apply(extract_locations)
tweets_df['location_coordinates'] = tweets_df['extracted_locations'].apply(
    lambda locs: [get_coordinates(loc) for loc in locs if loc]
)
tweets_df['location_coordinates'] = tweets_df['location_coordinates'].apply(
    lambda coords: [c for c in coords if c is not None]
)

# Step 7: Extract mentioned users using regex
def extract_mentions(text):
    if not isinstance(text, str):
        return []
    mentions = re.findall(r'@(\w+)', text)
    return list(set(mentions))

tweets_df['mentioned_users'] = tweets_df['text'].apply(extract_mentions)

# Step 8: Assess political inclination via a keyword-based approach
def assess_political_inclination(text):
    if not isinstance(text, str):
        return 'neutral'
    left_keywords = ['democrat', 'liberal', 'progressive', 'left', 'biden', 'harris']
    right_keywords = ['republican', 'conservative', 'maga', 'trump', 'right']
    text_lower = text.lower()
    left_count = sum(1 for kw in left_keywords if kw in text_lower)
    right_count = sum(1 for kw in right_keywords if kw in text_lower)
    if left_count > right_count:
        return 'left-leaning'
    elif right_count > left_count:
        return 'right-leaning'
    else:
        return 'neutral'

tweets_df['political_inclination'] = tweets_df['translated_text'].apply(assess_political_inclination)

# Step 9: Create wordcloud and co-occurrence heatmap of keywords
def create_wordcloud(keywords):
    word_freq = {}
    for tweet_keywords in keywords:
        for word, score in tweet_keywords:
            word_freq[word] = word_freq.get(word, 0) + score
    if word_freq:
        wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('wordcloud.png')
        plt.close()

def create_cooccurrence_heatmap(df):
    all_keywords = set()
    for kw_list in df['keywords']:
        all_keywords.update([kw for kw, _ in kw_list])
    all_keywords = list(all_keywords)
    
    cooccurrence = np.zeros((len(all_keywords), len(all_keywords)))
    for kw_list in df['keywords']:
        words = [kw for kw, _ in kw_list]
        for i, word1 in enumerate(all_keywords):
            for j, word2 in enumerate(all_keywords):
                if word1 in words and word2 in words and i != j:
                    cooccurrence[i, j] += 1
    plt.figure(figsize=(10, 8))
    sns.heatmap(cooccurrence, xticklabels=all_keywords, yticklabels=all_keywords, cmap='Blues')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.tight_layout()
    plt.savefig('cooccurrence_heatmap.png')
    plt.close()

# Step 10: Create a linear regression trend plot for tweet sentiment over a numeric sequence
def plot_trend_line(df, column, title, filename):
    # Create a numeric sequence based on the tweet order (index)
    X = np.arange(len(df)).reshape(-1, 1)
    Y = df[column].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, Y)
    trend = model.predict(X)
    plt.figure(figsize=(10, 6))
    plt.scatter(X, Y, label='Data', alpha=0.5)
    plt.plot(X, trend, color='red', label='Trend line')
    plt.title(title)
    plt.xlabel("Tweet Index (Numeric Sequence)")
    plt.ylabel(column)
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

# Step 11: Plot time series for number of tweets (daily count) with linear regression trend
def plot_tweet_count_trend(df):
    df_copy = df.copy()
    df_copy['date'] = df_copy['timestamp'].dt.date
    tweet_count = df_copy.groupby('date').size().reset_index(name='count')
    X = np.arange(len(tweet_count)).reshape(-1, 1)
    Y = tweet_count['count'].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, Y)
    trend = model.predict(X)
    plt.figure(figsize=(10, 6))
    plt.scatter(tweet_count['date'], Y, label='Daily Tweet Count', alpha=0.5)
    plt.plot(tweet_count['date'], trend, color='red', label='Trend Line')
    plt.title('Daily Tweet Count with Trend Line')
    plt.xlabel('Date')
    plt.ylabel('Tweet Count')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('daily_tweet_count_trend.png')
    plt.close()

# Step 12: Create maps
# Map One: Map of user locations (using users_df and their "location" field)
def create_user_location_map(users_df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    # Assumes users_df contains columns 'username' and 'location'
    for idx, row in users_df.iterrows():
        if pd.notnull(row.get('location')):
            try:
                loc = geolocator.geocode(row['location'])
                if loc:
                    folium.Marker(
                        location=[loc.latitude, loc.longitude],
                        popup=f"User: {row['username']}<br>Location: {row['location']}",
                        icon=folium.Icon(color='green')
                    ).add_to(m)
            except Exception:
                continue
    m.save('user_locations_map.html')
    return m

# Map Two: Time-series map of extracted tweet locations
def create_time_series_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    features = []
    for idx, row in df.iterrows():
        for coordinate in row['location_coordinates']:
            if coordinate:
                lat, lon, loc_name = coordinate
                time_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
                feature = {
                    'type': 'Feature',
                    'geometry': {
                        'type': 'Point',
                        'coordinates': [lon, lat]
                    },
                    'properties': {
                        'time': time_str,
                        'popup': f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
                    }
                }
                features.append(feature)
    if features:
        TimestampedGeoJson(
            {'type': 'FeatureCollection', 'features': features},
            period='PT1H',
            add_last_point=True,
            auto_play=True,
            loop=False,
            max_speed=1,
            loop_button=True,
            time_slider_drag_update=True
        ).add_to(m)
    m.save('tweet_time_series_map.html')
    return m

# Step 13: Topic modeling using LDA (unchanged from before)
def perform_topic_modeling(texts, n_topics=5):
    count_vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = count_vect.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    feature_names = count_vect.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[:-11:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append((f"Topic {topic_idx+1}", top_words))
    return topics

# Step 14: Additional visualizations for top entities
def create_top_visualizations(df):
    # Top languages
    plt.figure(figsize=(10, 6))
    df['language_code'].value_counts().plot(kind='bar')
    plt.title('Top Languages')
    plt.xlabel('Language Code')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('top_languages.png')
    plt.close()
    
    # Top mentioned users
    all_mentions = []
    for mentions in df['mentioned_users']:
        all_mentions.extend(mentions)
    if all_mentions:
        mention_counts = pd.Series(all_mentions).value_counts().head(10)
        plt.figure(figsize=(10, 6))
        mention_counts.plot(kind='bar')
        plt.title('Top Mentioned Users')
        plt.xlabel('Username')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('top_mentions.png')
        plt.close()
    
    # Political inclination distribution (pie chart)
    plt.figure(figsize=(10, 6))
    df['political_inclination'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.title('Political Inclination Distribution')
    plt.tight_layout()
    plt.savefig('political_distribution.png')
    plt.close()
    
    # Scatter plot: Sentiment vs. Political inclination
    plt.figure(figsize=(10, 6))
    colors = {'left-leaning': 'blue', 'right-leaning': 'red', 'neutral': 'green'}
    for inclination, color in colors.items():
        subset = df[df['political_inclination'] == inclination]
        plt.scatter(
            subset.index, 
            subset['sentence_sentiment'],
            c=color,
            label=inclination,
            alpha=0.6
        )
    plt.title('Sentiment vs Political Inclination')
    plt.xlabel('Tweet Index (Time Ordered)')
    plt.ylabel('Sentiment Score')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_vs_politics.png')
    plt.close()

# ------------------ Execute All Analysis Steps ------------------

print("\nCreating wordcloud...")
create_wordcloud(tweets_df['keywords'])

print("Creating co-occurrence heatmap...")
create_cooccurrence_heatmap(tweets_df)

print("Creating user location map...")
create_user_location_map(users_df)

print("Creating tweet time-series map...")
create_time_series_map(tweets_df)

print("Plotting sentiment trend line...")
plot_trend_line(tweets_df, 'sentence_sentiment', 'Tweet Sentiment Trend', 'sentiment_trend.png')

print("Plotting daily tweet count trend...")
plot_tweet_count_trend(tweets_df)

print("Performing topic modeling...")
topics = perform_topic_modeling(tweets_df['translated_text'].fillna(''), n_topics=5)
for topic_name, top_words in topics:
    print(f"{topic_name}: {', '.join(top_words)}")

print("Creating additional visualizations...")
create_top_visualizations(tweets_df)

# Generate comprehensive AI summary report
print("\nGenerating AI summary report...")

total_tweets = len(tweets_df)
unique_users = tweets_df['username'].nunique()
avg_sentiment = tweets_df['sentence_sentiment'].mean()
political_counts = tweets_df['political_inclination'].value_counts()

report = f"""
# Twitter Data Analysis Report

## Overview
- Total Tweets Analyzed: {total_tweets}
- Unique Users: {unique_users}
- Average Sentiment Score: {avg_sentiment:.2f}

## Political Distribution
{political_counts.to_string()}

## Top Topics
"""

for topic_name, top_words in topics:
    report += f"- {topic_name}: {', '.join(top_words)}\n"

report += """
## Key Findings
- Non-English tweets are detected using langdetect and translated to English via googletrans.
- Two maps are generated: one showing user locations and one as a time-series map of tweet-extracted locations.
- Linear regression on the tweet index (numeric sequence) provides a trend line for sentiment, and daily tweet counts are analyzed.
- Topic modeling (LDA) identifies distinct themes within the tweets.

## Generated Visualizations
- Wordcloud of keywords
- Keyword co-occurrence heatmap
- User locations map
- Time-series map of tweet locations
- Sentiment trend line (with linear regression)
- Daily tweet count trend
- Political inclination distribution and sentiment vs. politics scatter plot
"""

with open('twitter_analysis_report.md', 'w') as f:
    f.write(report)

print("Analysis complete! All visualizations and the report have been saved.")

# Create a final enriched DataFrame and save it
final_df = tweets_df[['username', 'text', 'timestamp', 'language_code', 'translated_text', 
                        'keywords', 'sentence_sentiment', 'extracted_locations', 
                        'mentioned_users', 'political_inclination']]
final_df.to_csv('analyzed_tweets.csv', index=False)
print("\nEnriched dataset saved to 'analyzed_tweets.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tweets Dataset:
Shape: (33, 9)

Users Dataset:
Shape: (27, 6)

Creating wordcloud...
Creating co-occurrence heatmap...
Creating user location map...
Creating tweet time-series map...
Plotting sentiment trend line...
Plotting daily tweet count trend...
Performing topic modeling...
Topic 1: ww2, churchill, did, military, white, winston, don, people, generation, order
Topic 2: europe, ww2, right, ww3, need, nazis, ussr, america, don, going
Topic 3: ww3, wants, fighting, won, weapons, going, ukraine, people, nation, hasn
Topic 4: ww2, world, like, think, need, europe, war, america, saved, don
Topic 5: ww2, war, start, world, won, allied, grandfather, allies, order, ussr
Creating additional visualizations...

Generating AI summary report...
Analysis complete! All visualizations and the report have been saved.

Enriched dataset saved to 'analyzed_tweets.csv'


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import folium
from folium.plugins import TimestampedGeoJson
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
from wordcloud import WordCloud
from datetime import datetime
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

# Import language detection and translation libraries
from langdetect import detect
from googletrans import Translator

# Import spaCy and geopy for location extraction and geocoding
import spacy
from geopy.geocoders import Nominatim

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Initialize geopy Nominatim geocoder and googletrans translator
geolocator = Nominatim(user_agent="tweet_geocoder")
translator = Translator()

# Load datasets
tweets_df = pd.read_csv('tweets.csv')
users_df = pd.read_csv('users.csv')

# Display basic dataset info
print("Tweets Dataset:")
print(f"Shape: {tweets_df.shape}")
print("\nUsers Dataset:")
print(f"Shape: {users_df.shape}")

# Step 1: Sort tweets by timestamp
tweets_df['timestamp'] = pd.to_datetime(tweets_df['timestamp'])
tweets_df = tweets_df.sort_values(by='timestamp')

# Step 2: Detect language using langdetect
def detect_language(text):
    if isinstance(text, str) and text.strip():
        try:
            return detect(text)
        except Exception:
            return 'en'
    return 'en'

tweets_df['language_code'] = tweets_df['text'].apply(detect_language)

# Step 3: Translate non-English text to English using googletrans
def translate_to_english(row):
    if row['language_code'] != 'en' and isinstance(row['text'], str):
        try:
            translation = translator.translate(row['text'], dest='en')
            return translation.text
        except Exception:
            return row['text']
    return row['text']

tweets_df['translated_text'] = tweets_df.apply(translate_to_english, axis=1)
analysis_text = tweets_df['translated_text'].fillna('')

# Step 4: Extract keywords using TF-IDF
stop_words = list(stopwords.words('english'))  # convert stopwords to list
vectorizer = TfidfVectorizer(max_features=5, stop_words=stop_words)
tfidf_matrix = vectorizer.fit_transform(analysis_text)
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(tfidf_row, feature_names):
    indices = np.argsort(tfidf_row)[::-1]
    top_keywords = [(feature_names[i], tfidf_row[i]) for i in indices if tfidf_row[i] > 0]
    return top_keywords

tweets_df['keywords'] = [get_top_keywords(tfidf_matrix[i].toarray()[0], feature_names) 
                           for i in range(tfidf_matrix.shape[0])]

# Step 5: Compute sentiment scores (for each tweet and for keywords)
def get_sentiment(text):
    if isinstance(text, str) and text.strip():
        return TextBlob(text).sentiment.polarity
    return 0

tweets_df['keyword_sentiments'] = tweets_df['keywords'].apply(
    lambda kw_list: [(kw, score, get_sentiment(kw)) for kw, score in kw_list] if isinstance(kw_list, list) else []
)
tweets_df['sentence_sentiment'] = tweets_df['translated_text'].apply(get_sentiment)

# Step 6: Extract geographic locations from tweets using spaCy NER and geocode them
def extract_locations(text):
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    # Extract entities labeled as GPE (Countries, cities) or LOC
    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
    return list(set(locations))  # Return unique locations

def get_coordinates(location):
    try:
        loc = geolocator.geocode(location)
        if loc:
            return (loc.latitude, loc.longitude, location)
        return None
    except Exception:
        return None

tweets_df['extracted_locations'] = tweets_df['translated_text'].apply(extract_locations)
tweets_df['location_coordinates'] = tweets_df['extracted_locations'].apply(
    lambda locs: [get_coordinates(loc) for loc in locs if loc]
)
tweets_df['location_coordinates'] = tweets_df['location_coordinates'].apply(
    lambda coords: [c for c in coords if c is not None]
)

# Step 7: Extract mentioned users using regex
def extract_mentions(text):
    if not isinstance(text, str):
        return []
    mentions = re.findall(r'@(\w+)', text)
    return list(set(mentions))

tweets_df['mentioned_users'] = tweets_df['text'].apply(extract_mentions)

# Step 8: Assess political inclination via a keyword-based approach
def assess_political_inclination(text):
    if not isinstance(text, str):
        return 'neutral'
    left_keywords = ['democrat', 'liberal', 'progressive', 'left', 'biden', 'harris']
    right_keywords = ['republican', 'conservative', 'maga', 'trump', 'right']
    text_lower = text.lower()
    left_count = sum(1 for kw in left_keywords if kw in text_lower)
    right_count = sum(1 for kw in right_keywords if kw in text_lower)
    if left_count > right_count:
        return 'left-leaning'
    elif right_count > left_count:
        return 'right-leaning'
    else:
        return 'neutral'

tweets_df['political_inclination'] = tweets_df['translated_text'].apply(assess_political_inclination)

# Step 9: Create wordcloud and co-occurrence heatmap of keywords
def create_wordcloud(keywords):
    word_freq = {}
    for tweet_keywords in keywords:
        for word, score in tweet_keywords:
            word_freq[word] = word_freq.get(word, 0) + score
    if word_freq:
        wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('wordcloud.png')
        plt.close()

def create_cooccurrence_heatmap(df):
    all_keywords = set()
    for kw_list in df['keywords']:
        all_keywords.update([kw for kw, _ in kw_list])
    all_keywords = list(all_keywords)
    
    cooccurrence = np.zeros((len(all_keywords), len(all_keywords)))
    for kw_list in df['keywords']:
        words = [kw for kw, _ in kw_list]
        for i, word1 in enumerate(all_keywords):
            for j, word2 in enumerate(all_keywords):
                if word1 in words and word2 in words and i != j:
                    cooccurrence[i, j] += 1
    plt.figure(figsize=(10, 8))
    sns.heatmap(cooccurrence, xticklabels=all_keywords, yticklabels=all_keywords, cmap='Blues')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.tight_layout()
    plt.savefig('cooccurrence_heatmap.png')
    plt.close()

# Step 10: Plot linear regression trend for tweet sentiment over a numeric sequence
def plot_trend_line(df, column, title, filename):
    X = np.arange(len(df)).reshape(-1, 1)  # Numeric tweet index
    Y = df[column].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, Y)
    trend = model.predict(X)
    plt.figure(figsize=(10, 6))
    plt.scatter(X, Y, label='Data', alpha=0.5)
    plt.plot(X, trend, color='red', label='Trend line')
    plt.title(title)
    plt.xlabel("Tweet Index (Numeric Sequence)")
    plt.ylabel(column)
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

# Step 11: Plot time series for daily tweet count with linear regression trend
def plot_tweet_count_trend(df):
    df_copy = df.copy()
    df_copy['date'] = df_copy['timestamp'].dt.date
    tweet_count = df_copy.groupby('date').size().reset_index(name='count')
    X = np.arange(len(tweet_count)).reshape(-1, 1)
    Y = tweet_count['count'].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, Y)
    trend = model.predict(X)
    plt.figure(figsize=(10, 6))
    plt.scatter(tweet_count['date'], Y, label='Daily Tweet Count', alpha=0.5)
    plt.plot(tweet_count['date'], trend, color='red', label='Trend Line')
    plt.title('Daily Tweet Count with Trend Line')
    plt.xlabel('Date')
    plt.ylabel('Tweet Count')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('daily_tweet_count_trend.png')
    plt.close()

# Step 12: Create maps
# Map One: Map of user locations (using users_df and their "location" field)
def create_user_location_map(users_df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    # Assumes users_df contains columns 'username' and 'location'
    for idx, row in users_df.iterrows():
        if pd.notnull(row.get('location')):
            try:
                loc = geolocator.geocode(row['location'])
                if loc:
                    folium.Marker(
                        location=[loc.latitude, loc.longitude],
                        popup=f"User: {row['username']}<br>Location: {row['location']}",
                        icon=folium.Icon(color='green')
                    ).add_to(m)
            except Exception:
                continue
    m.save('user_locations_map.html')
    return m

# Map Two: Time-series map of tweet-extracted locations (using geocoded locations from tweet texts)
def create_tweet_time_series_map(df):
    m = folium.Map(location=[20, 0], zoom_start=2)
    features = []
    for idx, row in df.iterrows():
        # Use only the geocoded tweet locations (extracted from tweet text)
        for coordinate in row['location_coordinates']:
            if coordinate:
                lat, lon, loc_name = coordinate
                time_str = row['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
                feature = {
                    'type': 'Feature',
                    'geometry': {
                        'type': 'Point',
                        'coordinates': [lon, lat]
                    },
                    'properties': {
                        'time': time_str,
                        'popup': f"User: {row['username']}<br>Tweet: {row['text'][:100]}...<br>Location: {loc_name}"
                    }
                }
                features.append(feature)
    if features:
        TimestampedGeoJson(
            {'type': 'FeatureCollection', 'features': features},
            period='PT1H',
            add_last_point=True,
            auto_play=True,
            loop=False,
            max_speed=1,
            loop_button=True,
            time_slider_drag_update=True
        ).add_to(m)
    m.save('tweet_time_series_map.html')
    return m

# Step 13: Topic modeling using LDA
def perform_topic_modeling(texts, n_topics=5):
    count_vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = count_vect.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    feature_names = count_vect.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[:-11:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append((f"Topic {topic_idx+1}", top_words))
    return topics

# Step 14: Additional visualizations for top entities
def create_top_visualizations(df):
    # Top languages
    plt.figure(figsize=(10, 6))
    df['language_code'].value_counts().plot(kind='bar')
    plt.title('Top Languages')
    plt.xlabel('Language Code')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('top_languages.png')
    plt.close()
    
    # Top mentioned users
    all_mentions = []
    for mentions in df['mentioned_users']:
        all_mentions.extend(mentions)
    if all_mentions:
        mention_counts = pd.Series(all_mentions).value_counts().head(10)
        plt.figure(figsize=(10, 6))
        mention_counts.plot(kind='bar')
        plt.title('Top Mentioned Users')
        plt.xlabel('Username')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('top_mentions.png')
        plt.close()
    
    # Political inclination distribution (pie chart)
    plt.figure(figsize=(10, 6))
    df['political_inclination'].value_counts().plot(kind='pie', autopct='%1.1f%%')
    plt.title('Political Inclination Distribution')
    plt.tight_layout()
    plt.savefig('political_distribution.png')
    plt.close()
    
    # Scatter plot: Sentiment vs. Political inclination
    plt.figure(figsize=(10, 6))
    colors = {'left-leaning': 'blue', 'right-leaning': 'red', 'neutral': 'green'}
    for inclination, color in colors.items():
        subset = df[df['political_inclination'] == inclination]
        plt.scatter(
            subset.index, 
            subset['sentence_sentiment'],
            c=color,
            label=inclination,
            alpha=0.6
        )
    plt.title('Sentiment vs Political Inclination')
    plt.xlabel('Tweet Index (Time Ordered)')
    plt.ylabel('Sentiment Score')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('sentiment_vs_politics.png')
    plt.close()

# ------------------ Execute Analysis Steps ------------------

print("\nCreating wordcloud...")
create_wordcloud(tweets_df['keywords'])

print("Creating co-occurrence heatmap...")
create_cooccurrence_heatmap(tweets_df)

print("Creating user location map...")
create_user_location_map(users_df)

print("Creating tweet time-series map...")
create_tweet_time_series_map(tweets_df)

print("Plotting sentiment trend line...")
plot_trend_line(tweets_df, 'sentence_sentiment', 'Tweet Sentiment Trend', 'sentiment_trend.png')

print("Plotting daily tweet count trend...")
plot_tweet_count_trend(tweets_df)

print("Performing topic modeling...")
topics = perform_topic_modeling(tweets_df['translated_text'].fillna(''), n_topics=5)
for topic_name, top_words in topics:
    print(f"{topic_name}: {', '.join(top_words)}")

print("Creating additional visualizations...")
create_top_visualizations(tweets_df)

# Generate comprehensive AI summary report
print("\nGenerating AI summary report...")

total_tweets = len(tweets_df)
unique_users = tweets_df['username'].nunique()
avg_sentiment = tweets_df['sentence_sentiment'].mean()
political_counts = tweets_df['political_inclination'].value_counts()

report = f"""
# Twitter Data Analysis Report

## Overview
- Total Tweets Analyzed: {total_tweets}
- Unique Users: {unique_users}
- Average Sentiment Score: {avg_sentiment:.2f}

## Political Distribution
{political_counts.to_string()}

## Top Topics
"""

for topic_name, top_words in topics:
    report += f"- {topic_name}: {', '.join(top_words)}\n"

report += """
## Key Findings
- Non-English tweets are detected using langdetect and translated to English via googletrans.
- Two maps are generated: one showing user locations and one (time-series) showing tweet-extracted geolocations.
- Linear regression on tweet index provides a trend line for sentiment, and daily tweet counts are analyzed.
- Topic modeling (LDA) identifies distinct themes within the tweets.

## Generated Visualizations
- Wordcloud of keywords
- Keyword co-occurrence heatmap
- User locations map
- Time-series map of tweet-extracted locations
- Sentiment trend line (with linear regression)
- Daily tweet count trend
- Political inclination distribution and sentiment vs. politics scatter plot
"""

with open('twitter_analysis_report.md', 'w') as f:
    f.write(report)

print("Analysis complete! All visualizations and the report have been saved.")

# Create a final enriched DataFrame and save it
final_df = tweets_df[['username', 'text', 'timestamp', 'language_code', 'translated_text', 
                        'keywords', 'sentence_sentiment', 'extracted_locations', 
                        'mentioned_users', 'political_inclination']]
final_df.to_csv('analyzed_tweets.csv', index=False)
print("\nEnriched dataset saved to 'analyzed_tweets.csv'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tweets Dataset:
Shape: (33, 9)

Users Dataset:
Shape: (27, 6)

Creating wordcloud...
Creating co-occurrence heatmap...
Creating user location map...
Creating tweet time-series map...
Plotting sentiment trend line...
Plotting daily tweet count trend...
Performing topic modeling...
Topic 1: ww2, churchill, did, military, white, winston, don, people, generation, order
Topic 2: europe, ww2, right, ww3, need, nazis, ussr, america, don, going
Topic 3: ww3, wants, fighting, won, weapons, going, ukraine, people, nation, hasn
Topic 4: ww2, world, like, think, need, europe, war, america, saved, don
Topic 5: ww2, war, start, world, won, allied, grandfather, allies, order, ussr
Creating additional visualizations...

Generating AI summary report...
Analysis complete! All visualizations and the report have been saved.

Enriched dataset saved to 'analyzed_tweets.csv'


In [8]:
import pandas as pd
import networkx as nx
import google.generativeai as genai
from pyvis.network import Network
import webbrowser
import json
import re
from collections import Counter

# Set your API key
genai.configure(api_key="AIzaSyBQjwl1U4208zTqqoOvAhjo98ypbCs8Pk4")

# Define the model
model = genai.GenerativeModel("gemini-2.0-flash-exp")

def preprocess_text(text):
    """
    Preprocess tweet text to extract meaningful terms.
    """
    if not isinstance(text, str):
        return []
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs, mentions, and special characters
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize
    tokens = text.split()
    
    # Remove stop words (simplified version)
    stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'if', 'in', 'on', 'at', 'to', 'for', 'with', 
                 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 
                 'does', 'did', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'this', 'that', 'of', 
                 'from', 'by', 'my', 'your', 'his', 'her', 'its', 'our', 'their'}
    
    # Only keep tokens that are not stop words and have length > 2
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    
    return tokens

def extract_key_terms(tweets_df, min_count=2, max_terms=500):
    """
    Extract key terms from all tweets based on frequency.
    """
    all_terms = []
    for text in tweets_df['text']:
        all_terms.extend(preprocess_text(text))
    
    # Count term frequencies
    term_counts = Counter(all_terms)
    
    # Filter by minimum count
    key_terms = {term: count for term, count in term_counts.items() if count >= min_count}
    
    # Limit to max number of terms (take most frequent)
    if len(key_terms) > max_terms:
        key_terms = dict(sorted(key_terms.items(), key=lambda x: x[1], reverse=True)[:max_terms])
    
    return key_terms

def visualize_term_cooccurrence(df, min_term_count=2, max_terms=200, min_edge_weight=2, output_file="term_cooccurrence_graph.html"):
    """
    Creates and visualizes a term co-occurrence network from tweets.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing a 'text' column with tweet content.
    min_term_count (int): Minimum frequency for a term to be included.
    max_terms (int): Maximum number of terms to include.
    min_edge_weight (int): Minimum co-occurrence weight for an edge to be included.
    output_file (str): Name of the output HTML file.
    """
    # Extract key terms
    key_terms = extract_key_terms(df, min_count=min_term_count, max_terms=max_terms)
    print(f"Identified {len(key_terms)} key terms")
    
    # Create a graph
    G = nx.Graph()
    
    # Process each tweet
    for _, row in df.iterrows():
        if not isinstance(row['text'], str):
            continue
        
        # Extract terms from tweet
        tweet_terms = preprocess_text(row['text'])
        
        # Filter to only include key terms
        tweet_terms = [term for term in tweet_terms if term in key_terms]
        
        # Add co-occurrences to graph
        for i in range(len(tweet_terms)):
            for j in range(i+1, len(tweet_terms)):
                term1, term2 = tweet_terms[i], tweet_terms[j]
                
                # Add nodes if they don't exist
                if not G.has_node(term1):
                    G.add_node(term1, count=key_terms[term1])
                if not G.has_node(term2):
                    G.add_node(term2, count=key_terms[term2])
                
                # Add or update edge weight
                if G.has_edge(term1, term2):
                    G[term1][term2]['weight'] += 1
                else:
                    G.add_edge(term1, term2, weight=1)
    
    # Remove edges with weight below threshold
    edges_to_remove = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] < min_edge_weight]
    G.remove_edges_from(edges_to_remove)
    
    # Remove isolated nodes
    isolated_nodes = list(nx.isolates(G))
    G.remove_nodes_from(isolated_nodes)
    
    if len(G.nodes()) == 0:
        print("No significant co-occurrences found with current parameters.")
        return None, None
    
    # Identify the main node (highest degree)
    main_node, main_degree = max(G.degree(), key=lambda x: x[1])
    print(f"Main term: '{main_node}' with {main_degree} connections")
    
    # Create JSON variables for nodes and their degree counts
    nodes_count_json = json.dumps({node: data.get('count', 0) for node, data in G.nodes(data=True)}, indent=4)
    
    # Get loosely linked nodes from the main node
    loose_threshold = 2
    loosely_linked = [n for n in G.neighbors(main_node) if G.degree(n) <= loose_threshold]
    loosely_linked_json = json.dumps({node: G.degree(node) for node in loosely_linked}, indent=4)
    
    # Create Pyvis Network
    net = Network(height="800px", width="100%", bgcolor="#ffffff", font_color="black", notebook=False, select_menu=True)
    net.barnes_hut(gravity=-20000, central_gravity=0.3, spring_length=250, spring_strength=0.001)
    
    # Add nodes with attributes
    for node, attr in G.nodes(data=True):
        count = attr.get('count', 0)
        net.add_node(node, title=f"Term: {node}<br>Frequency: {count}<br>Connections: {G.degree(node)}", value=count)
    
    # Add edges with weights
    for u, v, attr in G.edges(data=True):
        weight = attr.get('weight', 1)
        net.add_edge(u, v, value=weight, title=f"Co-occurrence: {weight}")
    
    # Set node sizes based on term frequency
    max_count = max([data.get('count', 1) for _, data in G.nodes(data=True)])
    min_size, max_size = 10, 50
    
    for node in net.nodes:
        node_id = node.get('id')
        if node_id in G.nodes:
            count = G.nodes[node_id].get('count', 1)
            size = min_size + (count / max_count) * (max_size - min_size)
            node.update({"size": size})
    
    # Set styling options
    net.set_options("""
    var options = {
      "nodes": {
        "borderWidth": 2,
        "scaling": {"min": 10, "max": 50},
        "color": {"border": "#2B7CE9", "background": "#97C2FC"},
        "font": {"size": 16, "face": "arial", "color": "#343434", "align": "center"}
      },
      "edges": {
        "color": {"color": "#848484", "inherit": false},
        "smooth": {"enabled": true, "type": "dynamic"},
        "width": 0.5
      },
      "physics": {
        "barnesHut": {"gravitationalConstant": -20000, "centralGravity": 0.3, "springLength": 250, "springConstant": 0.001},
        "minVelocity": 0.75
      }
    }
    """)
    
    # Generate AI analysis of the network
    prompt = f'''
    I have created a graph showing how terms co-occur in tweets from Twitter data.
    The main term is '{main_node}' which has {main_degree} connections.
    
    These are my terms and their frequencies:
    {nodes_count_json}
    
    These are terms loosely linked to my main term:
    {loosely_linked_json}
    
    Give me an analysis of the key topics and themes in this conversation network.
    Identify any clusters of terms that might represent distinct narratives or topics.
    
    Output in JSON format with:
    - key_themes: list of main themes/topics identified
    - topic_clusters: object with cluster names as keys and relevant terms as values
    - interesting_insights: list of 3-5 observations about the data
    - summary_report: brief analysis of what this term network reveals
    '''
    
    try:
        response = model.generate_content(prompt)
        analysis_text = response.text
        
        # Try to clean up the response to get valid JSON
        if '```json' in analysis_text:
            analysis_text = analysis_text.split('```json')[1].split('```')[0].strip()
        elif '```' in analysis_text:
            analysis_text = analysis_text.split('```')[1].split('```')[0].strip()
        
        analysis_json = json.loads(analysis_text)
    except Exception as e:
        print(f"Error generating AI analysis: {str(e)}")
        analysis_json = {
            "key_themes": ["Analysis not available"],
            "summary_report": "Could not generate analysis due to an error."
        }
    
    # Save and open visualization
    net.write_html(output_file)
    webbrowser.open(output_file)
    print(f"Visualization saved to {output_file}")
    
    return G, analysis_json

def analyze_tweets_cooccurrence(tweets_file, min_term_count=2, max_terms=200, min_edge_weight=2):
    """
    Main function to load tweets and analyze co-occurrences.
    """
    # Load tweets
    df = pd.read_csv(tweets_file)
    print(f"Loaded {len(df)} tweets")
    
    # Run visualization and analysis
    graph, analysis = visualize_term_cooccurrence(
        df, 
        min_term_count=min_term_count,
        max_terms=max_terms,
        min_edge_weight=min_edge_weight
    )
    
    if analysis:
        print("\n--- ANALYSIS SUMMARY ---")
        print("Key themes:")
        for theme in analysis.get('key_themes', []):
            print(f"- {theme}")
            
        print("\nSummary:")
        print(analysis.get('summary_report', 'No summary available'))
    
    return graph, analysis

# Run the analysis on the tweets
if __name__ == "__main__":
    graph, analysis = analyze_tweets_cooccurrence(
        "tweets.csv",
        min_term_count=2,  # Terms must appear at least twice
        max_terms=200,     # Include up to 200 terms
        min_edge_weight=2  # Terms must co-occur at least twice
    )
    
    # Save analysis to file
    if analysis:
        with open("tweet_cooccurrence_analysis.json", "w") as f:
            json.dump(analysis, indent=4, fp=f)
        print("Analysis saved to tweet_cooccurrence_analysis.json")

Loaded 33 tweets
Identified 76 key terms
Main term: 'ww2' with 46 connections
Visualization saved to term_cooccurrence_graph.html

--- ANALYSIS SUMMARY ---
Key themes:
- Historical Comparisons: Drawing parallels between WW2 and current geopolitical situations (particularly Ukraine).
- Escalation Concerns: Anxiety about the potential for WW3 and the use of nuclear weapons.
- Geopolitical Involvement: Discussions regarding the roles of Europe, Russia, USA, and other nations in current conflicts and historical events.
- Political Commentary: Mentions of figures like Trump, Putin, and Churchill, and their potential influence on world events.
- Ideological and Generational Perspectives: Exploring different viewpoints, potentially related to generational experiences and political ideologies.

Summary:
This term network centered around 'ww2' reveals a complex interplay between historical reflection and contemporary concerns. The high frequency of terms related to WW3 and nuclear weapons under