In [18]:
import numpy as np
from math import sqrt
import pandas as pd
import re
import os
from nltk.corpus import stopwords
from nltk import pos_tag
import string
from langdetect import detect
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances

In [19]:
import nltk
# Download the 'averaged_perceptron_tagger' resource (nltk skips the download
# when the package is already up to date). Presumably this is the model that
# `pos_tag`, imported in the first cell, loads at tagging time — TODO confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Prabha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Removing non-English tweets

In [20]:
def detect_lang(text):
    """Return the language code reported by ``detect`` for ``text``.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        The value returned by ``detect(text)``, or the string ``'unknown'``
        when ``detect`` raises an error.
    """
    try:
        return detect(text)
    except Exception:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate (e.g. interrupting a long `.apply`).
        return 'unknown'
# Compiled once instead of on every call. The allowed set is ASCII letters,
# digits, whitespace and the punctuation listed inside the brackets.
# NOTE(review): the set contains neither the apostrophe (') nor the slash (/),
# so text such as "don't" or any URL is reported as non-English — confirm
# that this is intended.
_ENGLISH_PATTERN = re.compile(r'^[a-zA-Z0-9!@#$%^&*()_+=\-[\]{}|;:",.<>?`~\s]+$')


def is_english(text):
    """Tell whether ``text`` consists only of the allowed ASCII characters.

    Parameters
    ----------
    text : object
        Value to check. Anything that is not a ``str`` (for example the
        ``NaN``/``None`` pandas uses for missing text) yields ``False``
        instead of raising ``TypeError``.

    Returns
    -------
    bool
        ``True`` when ``text`` is a non-empty string in which every character
        belongs to the allowed set, ``False`` otherwise.
    """
    if not isinstance(text, str):
        return False
    return _ENGLISH_PATTERN.match(text) is not None

# Hashtag Extraction

In [21]:
def extract_hashtag(df):
    """Add a ``'Hashtag'`` column holding the lower-cased hashtags of each row.

    For every value in ``df['Text']`` the pattern ``#(\\w+)`` is applied and
    the captured names (without the leading ``#``) are stored, lower-cased,
    as a list in ``df['Hashtag']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Text'`` column. It is modified in place; the
        ``'Text'`` column itself is left unchanged.

    Returns
    -------
    None
    """
    # Coerce on a temporary Series: missing values become '' and everything
    # else is stringified, so re.findall never receives NaN/None.
    texts = df['Text'].fillna('').astype(str)
    df['Hashtag'] = texts.apply(
        lambda text: [tag.lower() for tag in re.findall(r"#(\w+)", text)]
    )


# Tokenization and POS tagging

In [22]:
# Lemmatization helper
def lemmatize(word_list):
    """Return the lemma of every word in ``word_list``, preserving order.

    A new ``WordNetLemmatizer`` is created on each call and its
    ``lemmatize`` method is applied to each word with default arguments.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in word_list:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

# Defining the tokenize function
def tokenize(df):
    """Tokenize, lemmatize and POS-tag every tweet in ``df['Text']``.

    The frame is modified in place:

    * ``df['Text']`` is overwritten with missing values replaced by ``''``
      and every value cast to ``str``;
    * ``df['Tokenized_Tweets']`` receives, per row, the result of
      ``pos_tag`` applied to the lemmatized ``TweetTokenizer`` tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Text'`` column.

    Returns
    -------
    None
    """
    tokenizer = TweetTokenizer()
    # One lemmatizer shared by all rows, instead of building a new one per row.
    lemmatizer = WordNetLemmatizer()

    # Ensures all rows in 'Text' are strings, replacing NaN or invalid values
    df['Text'] = df['Text'].fillna('').astype(str)

    def _tokenize_lemmatize_tag(text):
        # Same three stages as before, done in a single pass over the column.
        tokens = tokenizer.tokenize(text)
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        return pos_tag(lemmas)

    df['Tokenized_Tweets'] = df['Text'].apply(_tokenize_lemmatize_tag)
    

# Stop words, punctuation, special word removal

In [23]:
def remove_stop_words(pos_tagged_tokens):
    """Keep the purely alphabetic, non-stop-word tokens, lower-cased.

    Parameters
    ----------
    pos_tagged_tokens : iterable of (str, str)
        ``(token, pos_tag)`` pairs; the tag is ignored.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order, for which
        ``token.isalpha()`` is true and whose lower-cased form is not in
        ``stopwords.words('english')``.
    """
    stop_words = set(stopwords.words('english'))
    kept = []
    for token, _pos in pos_tagged_tokens:
        lowered = token.lower()
        # isalpha() already rejects punctuation, digits, '#tag'/'@user' style
        # tokens and the empty string, so no separate punctuation pass is needed.
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept


# Vectorization

In [24]:
def generate_bow(sentences):
    """Build a bag of words: total occurrences of each word over all sentences.

    Parameters
    ----------
    sentences : iterable of iterable
        Tokenized sentences.

    Returns
    -------
    dict
        Maps each word to its occurrence count; keys appear in order of
        first occurrence.
    """
    every_word = (word for sentence in sentences for word in sentence)
    return dict(Counter(every_word))

In [25]:
def count_dict(bag_of_words,tweets_list):
    """Count, for each vocabulary word, how many tweets contain it.

    Parameters
    ----------
    bag_of_words : iterable
        Vocabulary words (iterating a dict yields its keys).
    tweets_list : iterable of iterable
        Tokenized tweets. Each tweet is treated as a collection of hashable
        tokens.

    Returns
    -------
    dict
        Maps every vocabulary word that occurs in at least one tweet to the
        number of tweets containing it, in vocabulary order. Words that
        occur in no tweet are omitted.
    """
    # One pass over the corpus: set() makes a word count once per tweet,
    # replacing a list scan of every tweet for every vocabulary word.
    document_frequency = Counter()
    for tweet in tweets_list:
        document_frequency.update(set(tweet))

    return {
        word: document_frequency[word]
        for word in bag_of_words
        if word in document_frequency
    }

In [26]:
def tf_idf_vector(tweets_list, bag_of_words):
    """Build one TF-IDF row per tweet over the vocabulary ``bag_of_words``.

    For a word ``w`` and tweet ``t`` the entry is ``tf * idf`` where ``tf``
    is the number of times ``w`` occurs in ``t`` and
    ``idf = log(N / (1 + df))`` with ``N`` the number of tweets and ``df``
    the number of tweets containing ``w`` (taken from ``count_dict``).
    Note that with this formula a word present in every tweet gets a
    negative idf, and one present in all but one tweet gets 0.

    Parameters
    ----------
    tweets_list : sequence of iterable
        Tokenized tweets.
    bag_of_words : iterable
        Vocabulary; its iteration order defines the column order.

    Returns
    -------
    list of list
        ``len(tweets_list)`` rows with one weight per vocabulary word.
        An empty ``tweets_list`` yields ``[]``.
    """
    N = len(tweets_list)
    if N == 0:
        return []

    doc_count = count_dict(bag_of_words, tweets_list)
    vocabulary = list(bag_of_words)
    # idf depends only on the word, so compute it once per word rather than
    # once per (tweet, word) pair.
    idf_per_word = [np.log(N / (1 + doc_count.get(word, 0))) for word in vocabulary]

    result_list = []
    for tweet in tweets_list:
        tf_vector = Counter(tweet)
        result_list.append(
            [tf_vector.get(word, 0) * idf for word, idf in zip(vocabulary, idf_per_word)]
        )
    return result_list

In [27]:
def one_hot_encoding_vector(tweets_list,bag_of_words):
    """Encode each tweet as a 0/1 presence vector over the vocabulary.

    Parameters
    ----------
    tweets_list : sequence of iterable
        Tokenized tweets.
    bag_of_words : collection
        Vocabulary; its iteration order defines the column order.

    Returns
    -------
    list of list of int
        One row per tweet; entry ``j`` is 1 when the ``j``-th vocabulary
        word occurs in the tweet and 0 otherwise.
    """
    encoded_rows = []
    for tweet in tweets_list:
        present = set(tweet)
        encoded_rows.append([int(word in present) for word in bag_of_words])
    return encoded_rows

# Similarity Computation

In [28]:
def cosine_similarity(array1, array2):
    """Cosine similarity ``dot(a, b) / (||a|| * ||b||)`` of two vectors.

    Parameters
    ----------
    array1, array2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine of the angle between the vectors. When either vector has
        zero magnitude the similarity is undefined; ``0.0`` is returned
        instead of dividing by zero (which produced NaN and a
        RuntimeWarning).
    """
    dot_product = np.dot(array1, array2)
    magnitude1 = np.linalg.norm(array1)
    magnitude2 = np.linalg.norm(array2)

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    return dot_product / (magnitude1 * magnitude2)

In [29]:
def jaccard_similarity(list1,list2):
    """Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of the two inputs as sets.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Collections to compare; duplicates are ignored.

    Returns
    -------
    float
        Value in ``[0, 1]``. When both inputs are empty the ratio is
        undefined; ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    set1 = set(list1)
    set2 = set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)
def jaccard_similarity_matrix(vector):
    """Pairwise Jaccard similarities of all items in ``vector``.

    Returns a list of rows where entry ``[i][j]`` is
    ``jaccard_similarity(vector[i], vector[j])``.
    """
    return [
        [jaccard_similarity(row_item, column_item) for column_item in vector]
        for row_item in vector
    ]

In [30]:
def dice_similarity(list1,list2):
    """Sørensen–Dice coefficient of two presence vectors.

    Each input is read as a vector whose non-zero (truthy) entries mark the
    features that are present, e.g. a one-hot row. The coefficient is
    ``2 * |A ∩ B| / (|A| + |B|)`` where ``|A|`` and ``|B|`` are the numbers
    of present features and ``|A ∩ B|`` is the number of positions present
    in both.

    The previous implementation counted positions where the entries were
    merely equal (including shared zeros) and divided by ``2 * len(list1)``,
    so two identical vectors scored 0.5 instead of 1.

    Parameters
    ----------
    list1, list2 : sequence
        Vectors of equal length.

    Returns
    -------
    float
        Value in ``[0, 1]``; ``0.0`` when neither vector has any present
        feature (including empty input), where the ratio is undefined.
    """
    shared = sum(1 for left, right in zip(list1, list2) if left and right)
    total = sum(1 for left in list1 if left) + sum(1 for right in list2 if right)
    if total == 0:
        return 0.0
    return 2 * shared / total
def dice_similarity_matrix(vector):
    """Pairwise Dice similarities of all items in ``vector``.

    Returns a list of rows where entry ``[i][j]`` is
    ``dice_similarity(vector[i], vector[j])``.
    """
    return [
        [dice_similarity(row_item, column_item) for column_item in vector]
        for row_item in vector
    ]

# Clustering

In [31]:
def k_means_clustering(vectors_list, K, max_iters=100):
    """Cluster row vectors with k-means using cosine similarity.

    Each vector is assigned to the centroid with the highest cosine
    similarity; each centroid is then replaced by the mean of its members.
    A centroid that loses all members is re-seeded with a randomly chosen
    vector. Iteration stops when the centroids no longer change or after
    ``max_iters`` rounds.

    The pairwise (centroid x vector) similarity matrix is computed here
    directly, because this needs a matrix of similarities rather than the
    similarity of a single pair of 1-D vectors.

    Parameters
    ----------
    vectors_list : array-like, shape (n_vectors, n_features)
        Vectors to cluster.
    K : int
        Number of clusters. The initial centroids are ``K`` distinct input
        rows drawn with ``np.random.choice`` (seed ``np.random`` beforehand
        for reproducible results).
    max_iters : int, optional
        Maximum number of update rounds.

    Returns
    -------
    centroids : numpy.ndarray, shape (K, n_features)
    cluster_assignments : numpy.ndarray, shape (n_vectors,)
        Index of the centroid each vector is assigned to.
    """
    vectors = np.asarray(vectors_list, dtype=float)
    num_vectors = vectors.shape[0]

    def _unit_rows(matrix):
        # Scale every row to unit length; all-zero rows stay zero so they get
        # similarity 0 with everything instead of NaN.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms == 0, 1.0, norms)

    unit_vectors = _unit_rows(vectors)  # does not change between iterations

    def _assign(current_centroids):
        # (K, n_vectors) matrix of cosine similarities, best centroid per column.
        similarity_matrix = np.dot(_unit_rows(current_centroids), unit_vectors.T)
        return np.argmax(similarity_matrix, axis=0)

    centroids = vectors[np.random.choice(num_vectors, K, replace=False)]

    for _ in range(max_iters):
        cluster_assignments = _assign(centroids)
        new_centroids = np.array([
            vectors[cluster_assignments == k].mean(axis=0)
            if np.any(cluster_assignments == k)
            else vectors[np.random.choice(num_vectors)]
            for k in range(K)
        ])
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids
    else:
        # Reached when the loop ran out of iterations (or never ran): make the
        # returned assignments match the returned centroids.
        cluster_assignments = _assign(centroids)

    return centroids, cluster_assignments

In [32]:
def euclidean_distance(a, b):
    """Return ``np.linalg.norm(a - b)``, the Euclidean distance for 1-D arrays."""
    difference = a - b
    return np.linalg.norm(difference)

def dbscan(data, eps, min_samples):
    """Density-based clustering (DBSCAN) with Euclidean distance.

    A point is a *core* point when at least ``min_samples`` points (itself
    included) lie within distance ``eps`` of it. Clusters are grown from core
    points; a non-core point within ``eps`` of a core point joins that
    cluster as a border point; everything else is noise.

    Differences from the earlier recursive version:

    * expansion uses an explicit stack, so large clusters no longer hit
      Python's recursion limit;
    * border points are assigned to the cluster that reaches them, and a
      point first marked as noise is reclaimed if a later cluster reaches it;
    * cluster ids are consecutive (a noise seed no longer consumes an id);
    * labels are integers rather than floats.

    Parameters
    ----------
    data : array-like, shape (n_points, n_features)
        Points to cluster.
    eps : float
        Neighbourhood radius (inclusive).
    min_samples : int
        Minimum neighbourhood size, counting the point itself, for a core
        point.

    Returns
    -------
    numpy.ndarray of int, shape (n_points,)
        Cluster id per point: ``1, 2, ...`` for clusters, ``-1`` for noise.
    """
    data = np.asarray(data, dtype=float)
    num_points, _ = data.shape

    UNVISITED, NOISE = 0, -1
    labels = np.full(num_points, UNVISITED, dtype=int)

    def range_query(point_index):
        # Indices of all points within eps of the given point (itself included).
        distances = np.linalg.norm(data - data[point_index], axis=1)
        return np.flatnonzero(distances <= eps)

    cluster_id = 0
    for point_index in range(num_points):
        if labels[point_index] != UNVISITED:
            continue

        neighbors = range_query(point_index)
        if len(neighbors) < min_samples:
            # Provisional: may be relabelled as a border point further down.
            labels[point_index] = NOISE
            continue

        cluster_id += 1
        labels[point_index] = cluster_id
        frontier = list(neighbors)
        while frontier:
            candidate = frontier.pop()
            if labels[candidate] == NOISE:
                # Known non-core point reachable from a core point: border.
                labels[candidate] = cluster_id
                continue
            if labels[candidate] != UNVISITED:
                continue

            labels[candidate] = cluster_id
            candidate_neighbors = range_query(candidate)
            # Only core points keep expanding the cluster.
            if len(candidate_neighbors) >= min_samples:
                frontier.extend(candidate_neighbors)

    return labels

In [33]:
def iterative_clustering(vectors, threshold=0.5):
    """Single-pass clustering by cosine similarity to cluster centroids.

    Vectors are processed in order. Each vector joins the first existing
    cluster whose centroid (mean of its members so far) has a cosine
    similarity greater than ``threshold`` with it; if there is none, the
    vector starts a new cluster.

    Parameters
    ----------
    vectors : iterable of numpy.ndarray
        1-D arrays of equal length. The input arrays are not modified.
    threshold : float, optional
        Similarity a centroid must exceed for the vector to join its cluster.

    Returns
    -------
    cluster_assignments : list of int
        Cluster index of each vector, in input order.
    num_clusters : int
        Number of clusters created.

    Raises
    ------
    AssertionError
        If any element is not a 1-D ``numpy.ndarray``.
    """
    # Materialise first so a generator is not exhausted by the validation below.
    vectors = list(vectors)
    assert all(isinstance(vector, np.ndarray) and vector.ndim == 1 for vector in vectors), "All vectors must be 1D arrays"

    # Running sum and size per cluster: the centroid is sum / size, so it does
    # not have to be recomputed from every member on each comparison.
    cluster_sums = []
    cluster_sizes = []
    cluster_assignments = []

    for vector in vectors:
        for i in range(len(cluster_sums)):
            centroid = cluster_sums[i] / cluster_sizes[i]
            if cosine_similarity(vector, centroid) > threshold:
                cluster_sums[i] += vector
                cluster_sizes[i] += 1
                cluster_assignments.append(i)
                break
        else:
            # No cluster was similar enough: start a new one. astype() copies,
            # so the caller's array is never mutated by the running sum.
            cluster_sums.append(vector.astype(float))
            cluster_sizes.append(1)
            cluster_assignments.append(len(cluster_sums) - 1)

    return cluster_assignments, len(cluster_sums)

In [34]:
def most_frequent_words(cluster_texts,k):
    """Count token frequencies separately for each cluster.

    Parameters
    ----------
    cluster_texts : iterable of iterable of iterable
        For each cluster, its tweets; each tweet is an iterable of tokens.
    k : int
        Number of clusters; the result always has ``k`` entries.

    Returns
    -------
    list of collections.Counter
        Entry ``i`` maps each token of cluster ``i`` to its number of
        occurrences. Clusters without tweets get an empty ``Counter``.
    """
    # Count straight into one Counter per cluster; no intermediate token
    # lists and no local variable shadowing the built-in `set`.
    cluster_counters = [Counter() for _ in range(k)]
    for i, cluster_tweets in enumerate(cluster_texts):
        for tweet in cluster_tweets:
            cluster_counters[i].update(tweet)
    return cluster_counters

In [35]:
def generate_word_cloud(word_freq_dict, cluster_number, save_path):
    """Render a word cloud for one cluster and save it as an image.

    Parameters
    ----------
    word_freq_dict : mapping
        Word -> frequency. Nothing is drawn or written when it is empty.
    cluster_number : int or str
        Identifier shown in the figure title.
    save_path : str or path-like
        Destination passed to ``Figure.savefig``.

    Returns
    -------
    None
    """
    if len(word_freq_dict) == 0:
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq_dict)

    # Explicit figure/axes handles instead of pyplot's implicit "current
    # figure", so exactly this figure is saved and closed.
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'Word Cloud for Cluster {cluster_number}')
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Release the figure even if drawing or saving fails (e.g. bad path).
        plt.close(fig)