In [34]:
from sklearn.metrics import silhouette_score
import numpy as np
import emoji
from math import sqrt
import pandas as pd
import re
import networkx as nx
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
from langdetect import detect
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from collections import Counter
import operator
from textblob import TextBlob
import matplotlib.patches as mpatches
from matplotlib.patches import FancyArrowPatch
from sklearn.metrics import pairwise_distances
from collections import defaultdict
from scipy.optimize import linear_sum_assignment
import emojis
from PIL import Image, ImageDraw, ImageFont


In [35]:
import import_ipynb
from tweet_prepossesssing_and_clustering import is_english,extract_hashtag,remove_stop_words,lemmatize,tokenize,generate_bow,count_dict,tf_idf_vector,one_hot_encoding_vector,cosine_similarity,jaccard_similarity,dice_similarity,k_means_clustering,most_frequent_words,generate_word_cloud,iterative_clustering

# Cluster filtration

In [36]:
def calculate_silhouette_scores(tfidf_vectors, cluster_assignments):
    """Compute a centroid-based, silhouette-style quality score for each cluster.

    For every cluster label (in ascending order):

    * ``a`` is the mean Euclidean distance from the cluster's points to the
      cluster's own centroid;
    * ``b`` is the mean Euclidean distance from those same points to the
      centroids of *all* other clusters (every point / other-centroid pair
      weighs equally);
    * the score is ``(b - a) / max(a, b)``, or ``0`` when both are zero.

    NOTE(review): this differs from the textbook silhouette coefficient,
    which uses point-to-point distances and only the *nearest* other cluster.
    The formula is kept as-is so existing thresholds stay meaningful.

    Args:
        tfidf_vectors: 2-D array-like of shape (n_samples, n_features). Objects
            exposing ``toarray()`` are densified first.
        cluster_assignments: Sequence of n_samples cluster labels.

    Returns:
        list: One score per distinct label, ordered by ascending label.
    """
    if hasattr(tfidf_vectors, "toarray"):
        tfidf_vectors = tfidf_vectors.toarray()
    vectors = np.asarray(tfidf_vectors, dtype=float)
    labels = np.asarray(cluster_assignments)
    cluster_labels = sorted(set(labels.tolist()))

    # Centroids are loop-invariant: compute each one exactly once.
    member_rows = {label: np.flatnonzero(labels == label) for label in cluster_labels}
    centroids = {label: vectors[rows].mean(axis=0) for label, rows in member_rows.items()}

    silhouette_scores = []
    for current_cluster in cluster_labels:
        points = vectors[member_rows[current_cluster]]

        # Cohesion: mean distance to the cluster's own centroid.
        a_avg = np.linalg.norm(points - centroids[current_cluster], axis=1).mean()

        # Separation: mean distance to every other cluster's centroid.
        distances_to_others = [
            np.linalg.norm(points - centroids[other_cluster], axis=1)
            for other_cluster in cluster_labels
            if other_cluster != current_cluster
        ]
        # With a single cluster there is nothing to separate from, so b is 0.
        b_avg = np.concatenate(distances_to_others).mean() if distances_to_others else 0

        denominator = max(a_avg, b_avg)
        silhouette_scores.append((b_avg - a_avg) / denominator if denominator > 0 else 0)

    return silhouette_scores

In [37]:
def calculate_cluster_weights(tfidf_vectors, cluster_assignments):
    """Compute one weight per cluster from its summed TF-IDF vector.

    The rows belonging to a cluster are summed column-wise; the weight is the
    total of that summed vector divided by the number of its non-zero entries
    (or by 1 if every entry is zero).

    Args:
        tfidf_vectors: 2-D array-like of shape (n_samples, n_features).
        cluster_assignments: Sequence of n_samples cluster labels. Plain
            Python lists are accepted; they are converted to an array so the
            per-label comparison is element-wise rather than a single
            list-vs-scalar ``False``.

    Returns:
        list: One weight per distinct label, in ``np.unique`` (ascending) order.
    """
    # Leave objects exposing ``toarray`` (sparse matrices) untouched; coerce
    # everything else so row indexing with an index array works.
    vectors = tfidf_vectors if hasattr(tfidf_vectors, "toarray") else np.asarray(tfidf_vectors)
    labels = np.asarray(cluster_assignments)

    cluster_weights = []
    for cluster in np.unique(labels):
        cluster_indices = np.flatnonzero(labels == cluster)
        cluster_tfidf = np.sum(vectors[cluster_indices], axis=0)
        # max(..., 1) avoids dividing by zero for an all-zero cluster vector.
        weight = np.sum(cluster_tfidf) / max(np.count_nonzero(cluster_tfidf), 1)
        cluster_weights.append(weight)
    return cluster_weights

In [38]:
def event_clusters_filter(cluster_quality, cluster_weights, quality_threshold=0.1, weight_threshold=4.0):
    """Select the clusters whose blended quality/weight score reaches the cut-off.

    Quality and weight are combined with equal weights (0.5 each). A cluster
    is kept when its blended score is at least the blended value of the two
    thresholds.

    Args:
        cluster_quality: Sequence of per-cluster quality scores.
        cluster_weights: Sequence of per-cluster weights, indexed like
            ``cluster_quality``.
        quality_threshold: Quality component of the cut-off.
        weight_threshold: Weight component of the cut-off.

    Returns:
        list[int]: Positions (into ``cluster_quality``) of the kept clusters.
    """
    alpha = 0.5
    beta = 0.5
    cutoff = alpha * quality_threshold + beta * weight_threshold
    return [
        position
        for position in range(len(cluster_quality))
        if cluster_quality[position] * alpha + cluster_weights[position] * beta >= cutoff
    ]

In [39]:
def generate_clusters(tweet_list, cluster_assignment, event_clusters, hashtag_lists):
    """Group tweets and hashtag counts by cluster, keeping only event clusters.

    Args:
        tweet_list: Tweets, one per sample.
        cluster_assignment: Cluster label of each sample.
        event_clusters: Labels of the clusters to keep.
        hashtag_lists: For each sample, an iterable of its hashtags.

    Returns:
        tuple[list, list]: ``(cluster_list, hashtag_counts_list)``. Both lists
        are ordered by ascending cluster label and are index-aligned:
        ``hashtag_counts_list[k]`` maps hashtag -> occurrence count for the
        tweets in ``cluster_list[k]``. A kept cluster whose tweets carry no
        hashtags contributes an empty dict, so the alignment never shifts.
    """
    selected = set(event_clusters)
    # Keyed by label so labels need not be the contiguous range 0..k-1.
    tweets_by_cluster = {}
    hashtags_by_cluster = {}

    for tweet, cluster, hashtags in zip(tweet_list, cluster_assignment, hashtag_lists):
        if cluster not in selected:
            continue
        tweets_by_cluster.setdefault(cluster, []).append(tweet)
        counts = hashtags_by_cluster.setdefault(cluster, {})
        for hashtag in hashtags:
            counts[hashtag] = counts.get(hashtag, 0) + 1

    ordered_labels = sorted(tweets_by_cluster)
    cluster_list = [tweets_by_cluster[label] for label in ordered_labels]
    hashtag_counts_list = [hashtags_by_cluster[label] for label in ordered_labels]
    return cluster_list, hashtag_counts_list

In [40]:
def write_clusters_to_text_and_hashtags(event_clusters, hashtags_dicts, text_file_path, hashtags_file_path):
    """Dump clusters and their hashtag counts to two UTF-8 text files.

    Clusters and hashtag dicts are paired positionally and numbered from 1.

    Text file layout, per cluster: a ``Cluster N:`` header, then for every
    tweet each of its items on its own line followed by a blank line, then two
    extra blank lines.

    Hashtag file layout, per cluster: a ``Cluster N Hashtags:`` header, one
    ``hashtag: frequency`` line per dict entry, then two blank lines.

    Args:
        event_clusters: Iterable of clusters; each cluster is an iterable of
            tweets and each tweet is itself iterated item by item.
        hashtags_dicts: Iterable of ``{hashtag: frequency}`` dicts.
        text_file_path: Destination of the tweet dump.
        hashtags_file_path: Destination of the hashtag dump.
    """
    with open(text_file_path, 'w', encoding='utf-8') as tweets_out:
        with open(hashtags_file_path, 'w', encoding='utf-8') as tags_out:
            numbered_pairs = enumerate(zip(event_clusters, hashtags_dicts), start=1)
            for cluster_number, (tweets, tag_counts) in numbered_pairs:
                # Tweets: one item per line, blank line closes each tweet.
                tweets_out.write(f"Cluster {cluster_number}:\n")
                for tweet_items in tweets:
                    tweets_out.writelines(f"{item}\n" for item in tweet_items)
                    tweets_out.write("\n")

                # Hashtags: one "tag: count" line per entry.
                tags_out.write(f"Cluster {cluster_number} Hashtags:\n")
                tags_out.writelines(f"{tag}: {count}\n" for tag, count in tag_counts.items())

                # Two blank lines separate consecutive clusters in both files.
                tweets_out.write("\n\n")
                tags_out.write("\n\n")


# Cluster chaining

In [41]:
def read_clusters_from_file(file_path):
    """Parse a cluster dump into a nested list ``clusters[cluster][tweet][line]``.

    Expected layout (UTF-8): a line starting with ``Cluster`` opens a new
    cluster; the non-blank lines that follow are collected into the current
    tweet; a blank line closes the current tweet.

    A tweet that is still open when the next ``Cluster`` header or the end of
    the file is reached is closed there too, so a missing blank line neither
    drops the tweet nor leaks its lines into the next cluster.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list: Non-empty clusters in file order; each cluster is a list of
        tweets and each tweet is a list of stripped lines.
    """
    clusters = []
    current_cluster = []
    current_tweet = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line.startswith("Cluster"):
                # Close whatever is still open before starting a new cluster.
                if current_tweet:
                    current_cluster.append(current_tweet)
                    current_tweet = []
                if current_cluster:
                    clusters.append(current_cluster)
                current_cluster = []
            elif not line:
                if current_tweet:
                    current_cluster.append(current_tweet)
                    current_tweet = []
            else:
                current_tweet.append(line)

    # Flush the tail: the file may end without a trailing blank line.
    if current_tweet:
        current_cluster.append(current_tweet)
    if current_cluster:
        clusters.append(current_cluster)

    return clusters


In [42]:
def read_hashtags_from_file(file_path):
    """Parse a hashtag dump into one ``{hashtag: count}`` dict per cluster.

    Expected layout (UTF-8), matching what
    ``write_clusters_to_text_and_hashtags`` emits: a header line starting with
    ``Cluster`` whose second whitespace-separated token is the cluster number,
    followed by ``hashtag: frequency`` lines.

    Lines that do not end in ``: <integer>`` are treated as a
    whitespace-separated list of hashtags, each occurrence counting once.

    Args:
        file_path: Path of the hashtag file to read.

    Returns:
        list[dict]: One dict per cluster header, in order of first appearance.
        A cluster with no hashtag lines yields an empty dict, so positions
        stay aligned with the cluster headers in the file.
    """
    counts_by_cluster = {}
    current_cluster_no = None

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith("Cluster"):
                current_cluster_no = int(line.split()[1])
                # Register the cluster right away so it is kept even if empty.
                counts_by_cluster.setdefault(current_cluster_no, {})
                continue

            counts = counts_by_cluster.setdefault(current_cluster_no, {})
            # Split on the LAST colon so hashtags containing ':' survive.
            tag, separator, frequency = line.rpartition(":")
            tag = tag.strip()
            frequency = frequency.strip()
            if separator and tag and frequency.isdecimal():
                counts[tag] = counts.get(tag, 0) + int(frequency)
            else:
                for token in line.split():
                    counts[token] = counts.get(token, 0) + 1

    return list(counts_by_cluster.values())

In [43]:
def centroid(cluster, bow):
    """Return the element-wise mean of the cluster's TF-IDF vectors.

    Args:
        cluster: Cluster passed straight through to ``tf_idf_vector``.
        bow: Bag of words passed straight through to ``tf_idf_vector``.

    Returns:
        numpy.ndarray: Mean over axis 0 of the array built from
        ``tf_idf_vector(cluster, bow)``.
    """
    vectors = tf_idf_vector(cluster, bow)
    return np.array(vectors).mean(axis=0)

def calculate_similarity(cluster1, cluster2):
    """Return the cosine similarity between the centroids of two clusters.

    Both centroids are built over a shared bag of words generated from the
    union of the two clusters, so their vectors are directly comparable.

    Neither input is modified. The merged list is a fresh object, which keeps
    ``cluster2`` out of ``cluster1``'s centroid and stops the caller's
    ``cluster1`` from growing on every call.

    Args:
        cluster1: First cluster (a list of tweets).
        cluster2: Second cluster (a list of tweets).

    Returns:
        Whatever ``cosine_similarity`` returns for the two centroids.
    """
    # Build a NEW list; ``merged = cluster1`` followed by extend() would
    # alias and mutate the caller's cluster.
    merged_cluster = list(cluster1) + list(cluster2)
    bow = generate_bow(merged_cluster)
    centroid1 = centroid(cluster1, bow)
    centroid2 = centroid(cluster2, bow)
    return cosine_similarity(centroid1, centroid2)

In [44]:
def textual_similarity(cluster1, cluster2):
    """Return the Jaccard similarity of the two clusters' flattened contents.

    Each cluster is flattened by concatenating the items of all its tweets
    (in order); the two flat lists are handed to ``jaccard_similarity``.

    Args:
        cluster1: First cluster, an iterable of iterable tweets.
        cluster2: Second cluster, an iterable of iterable tweets.

    Returns:
        Whatever ``jaccard_similarity`` returns for the two flat lists.
    """
    flattened = []
    for cluster in (cluster1, cluster2):
        items = []
        for tweet in cluster:
            items.extend(tweet)
        flattened.append(items)
    first_text, second_text = flattened
    return jaccard_similarity(first_text, second_text)

In [45]:
def create_bipartite_graph(current_clusters, current_hashtags, next_clusters, next_hashtags, threshold):
    """Build the similarity matrix between two consecutive sets of clusters.

    Entry ``[i, j]`` blends three scores for ``current_clusters[i]`` and
    ``next_clusters[j]``:

    * 0.4 x ``calculate_similarity`` (cosine of TF-IDF centroids),
    * 0.4 x ``textual_similarity`` (Jaccard of flattened contents),
    * 0.2 x hashtag overlap = sum of per-tag minimum counts over the shared
      tags, divided by the total of all counts in both dicts. The overlap is
      0 when neither cluster has any hashtags.

    NOTE(review): with this denominator two identical hashtag dicts score 0.5,
    not 1.0 -- confirm that scaling is intended.

    Args:
        current_clusters: Clusters of the earlier time window.
        current_hashtags: ``{hashtag: count}`` dicts, index-aligned with
            ``current_clusters``.
        next_clusters: Clusters of the later time window.
        next_hashtags: ``{hashtag: count}`` dicts, index-aligned with
            ``next_clusters``.
        threshold: Accepted for interface compatibility; not used here.

    Returns:
        numpy.ndarray: Matrix of shape
        ``(len(current_clusters), len(next_clusters))``.
    """
    similarity_matrix = np.zeros((len(current_clusters), len(next_clusters)))

    for i, current_cluster in enumerate(current_clusters):
        current_hashtag = current_hashtags[i]
        current_total = sum(current_hashtag.values())

        for j, next_cluster in enumerate(next_clusters):
            next_hashtag = next_hashtags[j]

            syntactic_similarity = calculate_similarity(current_cluster, next_cluster)
            semantic_similarity = textual_similarity(current_cluster, next_cluster)

            shared_count = sum(
                min(current_hashtag[key], next_hashtag[key])
                for key in set(current_hashtag) & set(next_hashtag)
            )
            total_count = current_total + sum(next_hashtag.values())
            # Two hashtag-free clusters have no overlap evidence: score 0
            # instead of dividing by zero.
            hashtag_similarity = shared_count / total_count if total_count else 0.0

            overall_similarity = 0.4 * syntactic_similarity + 0.4 * semantic_similarity + 0.2 * hashtag_similarity
            similarity_matrix[i, j] = overall_similarity

    return similarity_matrix

In [46]:
def apply_hungarian_method(similarity_matrix, num_clusters_first, num_clusters_second):
    """Greedily match rows (current clusters) to columns (next clusters).

    Despite the name this is not the Hungarian algorithm; it is a two-pass
    greedy heuristic:

    1. Columns are visited left to right. Each column goes to the most similar
       row that still holds fewer than two columns. If every row is already
       full, the column stays unassigned.
    2. Every row left without a column is given its single best column, even
       if that column is already assigned to another row.

    Args:
        similarity_matrix: 2-D array-like, rows = current clusters,
            columns = next clusters.
        num_clusters_first: Unused; kept for interface compatibility.
        num_clusters_second: Unused; kept for interface compatibility.

    Returns:
        dict[int, list[int]]: Row index -> list of assigned column indices.
        Empty when the matrix has no rows or no columns.
    """
    optimal_assignment = {}
    graph = np.array(similarity_matrix)

    # Nothing to match (e.g. a time window without event clusters); argmax on
    # an empty row would raise.
    if graph.ndim != 2 or graph.size == 0:
        return optimal_assignment

    num_rows, num_cols = graph.shape

    for col in range(num_cols):
        # Rows ordered from most to least similar to this column.
        sorted_indices = np.argsort(graph[:, col])[::-1]

        for row in sorted_indices:
            row = int(row)
            if len(optimal_assignment.get(row, [])) < 2:
                optimal_assignment.setdefault(row, []).append(col)
                break

    unassigned_rows = set(range(num_rows)) - set(optimal_assignment.keys())

    for row in unassigned_rows:
        optimal_assignment[row] = [int(np.argmax(graph[row, :]))]

    return optimal_assignment


In [47]:
def create_cluster_chains(timestamps, folder_path, threshold):
    """Link clusters of consecutive timestamps into chains.

    One chain is seeded per cluster of the first timestamp. For every pair of
    consecutive timestamps the cluster and hashtag files are read from
    ``<folder_path>/<timestamp>/``, a similarity matrix is built with
    ``create_bipartite_graph``, matched with ``apply_hungarian_method``, and
    each matched next-window cluster is appended to every chain that reached
    its matched current-window cluster.

    ``threshold`` is only forwarded to ``create_bipartite_graph``; no
    filtering on it happens in this function.

    Args:
        timestamps: Ordered sequence of timestamp folder names.
        folder_path: Directory holding one sub-folder per timestamp.
        threshold: Value forwarded to ``create_bipartite_graph``.

    Returns:
        list[set]: One set per first-window cluster, containing
        ``(timestamp, 1-based cluster number)`` tuples.
    """
    def window_file(timestamp, file_name):
        return os.path.join(folder_path, timestamp, file_name)

    first_timestamp = timestamps[0]
    first_clusters = read_clusters_from_file(window_file(first_timestamp, 'event_clusters.txt'))
    chains = [{(first_timestamp, number)} for number in range(1, len(first_clusters) + 1)]
    # clusters_map[c] = ids of the chains currently ending at cluster c.
    clusters_map = [[position] for position in range(len(first_clusters))]

    for step in range(1, len(timestamps)):
        current_timestamp = timestamps[step - 1]
        next_timestamp = timestamps[step]
        print(f'{current_timestamp} and {next_timestamp}')

        current_clusters = read_clusters_from_file(window_file(current_timestamp, 'event_clusters.txt'))
        current_hashtag = read_hashtags_from_file(window_file(current_timestamp, 'event_hashtags.txt'))
        next_hashtag = read_hashtags_from_file(window_file(next_timestamp, 'event_hashtags.txt'))
        next_clusters = read_clusters_from_file(window_file(next_timestamp, 'event_clusters.txt'))

        graph = create_bipartite_graph(current_clusters, current_hashtag, next_clusters, next_hashtag, threshold)
        optimal_assignment = apply_hungarian_method(graph, len(current_clusters), len(next_clusters))

        # Chains reaching each next-window cluster after this step.
        successors = [set() for _ in range(len(next_clusters))]
        for row, matched_columns in optimal_assignment.items():
            for column in matched_columns:
                for chain_id in clusters_map[row]:
                    successors[column].add(chain_id)
                    chains[chain_id].add((next_timestamp, column + 1))
        clusters_map = successors

    return chains

In [48]:
def create_clusters_dict(folder_path):
    """Load the event clusters of every timestamp folder under ``folder_path``.

    Each entry of ``folder_path`` that contains an ``event_clusters.txt`` file
    is read exactly once with ``read_clusters_from_file``. Entries without
    that file (stray files, unrelated output folders) are skipped. A directory
    with a single timestamp folder is handled like any other.

    Args:
        folder_path: Directory holding one sub-folder per timestamp.

    Returns:
        dict: Folder name -> parsed clusters, inserted in sorted name order.
    """
    clusters_dict = {}

    for folder in sorted(os.listdir(folder_path)):
        clusters_file = os.path.join(folder_path, folder, 'event_clusters.txt')
        if not os.path.isfile(clusters_file):
            continue
        clusters_dict[folder] = read_clusters_from_file(clusters_file)

    return clusters_dict

In [49]:
def write_cluster_chains(cluster_chains, output_folder_path, timestamp_clusters_dict):
    """Write every chain's clusters to ``<output>/cluster_chains/...`` files.

    Layout: ``cluster_chains/cluster_chain_<c>/cluster_<k>/<k>.txt`` where
    ``c`` is the 1-based chain number and ``k`` the 1-based position of the
    entry inside the chain. Each tweet is written on one line, its items
    separated (and followed) by a single space.

    Chain entries are processed in sorted ``(timestamp, cluster number)``
    order, so ``k`` is deterministic and follows the timestamps' sort order
    (sets have no stable iteration order of their own).

    Args:
        cluster_chains: Iterable of chains; each chain is a collection of
            ``(timestamp, 1-based cluster number)`` tuples.
        output_folder_path: Directory under which ``cluster_chains`` is
            created.
        timestamp_clusters_dict: Timestamp -> list of clusters, as produced by
            ``create_clusters_dict``.

    Raises:
        KeyError: If a chain references a timestamp missing from
            ``timestamp_clusters_dict``.
    """
    output_folder_path = os.path.join(output_folder_path, "cluster_chains")
    os.makedirs(output_folder_path, exist_ok=True)

    for chain_idx, chain in enumerate(cluster_chains):
        chain_folder_path = os.path.join(output_folder_path, f"cluster_chain_{chain_idx + 1}")
        os.makedirs(chain_folder_path, exist_ok=True)

        for index, (timestamp, cluster_idx) in enumerate(sorted(chain)):
            clusters = timestamp_clusters_dict[timestamp]
            # Cluster numbers are 1-based: 0 would silently wrap around to the
            # last cluster via clusters[-1], so it is rejected as well.
            if not 1 <= cluster_idx <= len(clusters):
                continue
            cluster = clusters[cluster_idx - 1]

            cluster_folder_path = os.path.join(chain_folder_path, f"cluster_{index + 1}")
            os.makedirs(cluster_folder_path, exist_ok=True)
            output_file_path = os.path.join(cluster_folder_path, f"{index+ 1}.txt")

            # UTF-8 like the other cluster files, so emoji and non-Latin text
            # never depend on the platform's default encoding.
            with open(output_file_path, "w", encoding="utf-8") as text_file:
                for tweet_list in cluster:
                    for tweet in tweet_list:
                        text_file.write(f"{tweet} ")
                    text_file.write("\n")

# Sentiment analysis and visualization

In [50]:
def get_folder_names(folder_path):
    """List the names of the immediate sub-directories of ``folder_path``.

    Args:
        folder_path: Directory to inspect.

    Returns:
        list[str]: Entry names (not full paths) that are directories, in the
        order ``os.listdir`` reports them.
    """
    folder_names = []
    for entry in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, entry)):
            folder_names.append(entry)
    return folder_names

In [51]:
def analyze_sentiment(cluster):
    """Count positive, neutral and negative tweets using VADER compound scores.

    A tweet is positive when its compound score is >= 0.05, neutral when the
    score lies strictly between -0.05 and 0.05, and negative otherwise.

    Args:
        cluster: Iterable of tweet texts.

    Returns:
        dict: Counts under the keys ``'positive'``, ``'neutral'`` and
        ``'negative'`` (in that order).
    """
    analyzer = SentimentIntensityAnalyzer()
    sentiments = dict.fromkeys(('positive', 'neutral', 'negative'), 0)

    for tweet in cluster:
        compound_score = analyzer.polarity_scores(tweet)['compound']

        if compound_score >= 0.05:
            label = 'positive'
        elif -0.05 < compound_score < 0.05:
            label = 'neutral'
        else:
            label = 'negative'
        sentiments[label] += 1

    return sentiments


In [52]:
def plot_pie_chart(sentiments, save_path):
    """Save a pie chart of the sentiment distribution to ``save_path``.

    Slice colours are bound to the sentiment label (positive = green,
    neutral = yellow, negative = red), so they stay correct whatever the key
    order of ``sentiments``. Labels outside that set fall back to the same
    three colours by position.

    The figure is always closed, even when drawing or saving raises, so
    repeated calls in a loop do not accumulate open figures.

    Args:
        sentiments: Mapping of sentiment label -> count.
        save_path: Destination image path.
    """
    label_colors = {'positive': 'green', 'neutral': 'yellow', 'negative': 'red'}
    positional_colors = ['green', 'yellow', 'red']

    labels = list(sentiments.keys())
    sizes = [sentiments[label] for label in labels]
    colors = [
        label_colors.get(label, positional_colors[position % len(positional_colors)])
        for position, label in enumerate(labels)
    ]

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)
        ax.set_title('Sentiment Analysis')
        fig.savefig(save_path)
    finally:
        plt.close(fig)

In [53]:
def generate_emoji_image(sentiments, save_path):
    """Save the emoji image matching the most frequent sentiment.

    The dominant sentiment is the label with the highest count (the first one
    in iteration order wins ties). Its image is loaded from
    ``<current working directory>/<label>/emoji.png`` and re-saved at
    ``save_path``.

    Args:
        sentiments: Mapping of sentiment label -> count.
        save_path: Destination image path.
    """
    dominant_label, _ = max(sentiments.items(), key=lambda pair: pair[1])

    folder = os.getcwd()
    emoji_mapping = {
        'positive': f'{folder}/positive/emoji.png',
        'neutral': f'{folder}/neutral/emoji.png',
        'negative': f'{folder}/negative/emoji.png'
    }

    Image.open(emoji_mapping.get(dominant_label)).save(save_path)

In [54]:
def create_word_cloud(cluster, save_path):
    """Render a word cloud of the cluster's text and save it as an image.

    The tweets are joined with single spaces, fed to ``WordCloud`` (800x400,
    white background, fixed random state) and drawn without axes on a 10x7
    inch figure that is saved to ``save_path`` and then closed.

    Args:
        cluster: Iterable of tweet strings.
        save_path: Destination image path.
    """
    text = ' '.join(cluster)
    cloud_options = dict(
        width=800,
        height=400,
        random_state=21,
        max_font_size=110,
        background_color='white',
    )
    wordcloud = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.savefig(save_path)
    plt.close()



In [55]:
def analyze_and_visualize_clusters(cluster_chains_folder, output_folder):
    """Produce sentiment and word-cloud images for every cluster of every chain.

    Expected input layout: ``<cluster_chains_folder>/<chain>/<cluster>/*.txt``
    with one tweet per line. For each cluster that contains at least one
    non-blank line, three images are written to
    ``<output_folder>/<chain>/<cluster>/``:

    * ``<cluster>_sentiment_pie_chart.png``
    * ``<cluster>_sentiment_emoji.png``
    * ``<cluster>_word_cloud.png``

    Entries that are not directories are ignored. Blank lines are dropped so
    they are not counted as tweets. Clusters without any tweets are skipped,
    because there is nothing to plot for them.

    Args:
        cluster_chains_folder: Root folder holding one sub-folder per chain.
        output_folder: Root folder for the generated images (created if
            missing).
    """
    os.makedirs(output_folder, exist_ok=True)

    for chain_name in sorted(os.listdir(cluster_chains_folder)):
        chain_folder = os.path.join(cluster_chains_folder, chain_name)
        if not os.path.isdir(chain_folder):
            continue

        output_chain_folder = os.path.join(output_folder, chain_name)
        os.makedirs(output_chain_folder, exist_ok=True)

        for cluster_name in sorted(os.listdir(chain_folder)):
            cluster_folder_path = os.path.join(chain_folder, cluster_name)
            if not os.path.isdir(cluster_folder_path):
                continue

            cluster_tweets = _read_cluster_tweets(cluster_folder_path)
            if not cluster_tweets:
                # Nothing to analyse; skip instead of letting one empty
                # cluster abort the whole batch in the plotting helpers.
                continue

            sentiments = analyze_sentiment(cluster_tweets)

            # All three images of a cluster share one output folder.
            cluster_output_folder = os.path.join(output_chain_folder, cluster_name)
            os.makedirs(cluster_output_folder, exist_ok=True)

            pie_chart_path = os.path.join(cluster_output_folder, f'{cluster_name}_sentiment_pie_chart.png')
            plot_pie_chart(sentiments, pie_chart_path)

            emoji_image_path = os.path.join(cluster_output_folder, f'{cluster_name}_sentiment_emoji.png')
            generate_emoji_image(sentiments, emoji_image_path)

            word_cloud_path = os.path.join(cluster_output_folder, f'{cluster_name}_word_cloud.png')
            create_word_cloud(cluster_tweets, word_cloud_path)


def _read_cluster_tweets(cluster_folder_path):
    """Return the non-blank, stripped lines of every readable .txt file in a folder."""
    tweets = []
    for file_name in sorted(os.listdir(cluster_folder_path)):
        file_path = os.path.join(cluster_folder_path, file_name)
        if file_name.endswith('.txt') and os.access(file_path, os.R_OK):
            # Decode as UTF-8; undecodable bytes are replaced so files written
            # with a legacy locale encoding still load instead of crashing.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as cluster_file:
                for line in cluster_file:
                    stripped = line.strip()
                    if stripped:
                        tweets.append(stripped)
    return tweets
