BERT EMBEDDINGS CLUSTERING

In [None]:
# Number of KMeans clusters fitted over the BERT embeddings; the same value
# sizes the per-cluster keyword lists built by get_cluster_keywords.
NUMBER_OF_CLUSTERS = 100

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import joblib
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag, word_tokenize, ne_chunk
from nltk.corpus import stopwords
# Fetch every NLTK resource used by this notebook, in a fixed order.
for _resource in (
    'punkt',                       # tokenization
    'averaged_perceptron_tagger',  # POS-tagging
    'maxent_ne_chunker',           # NER
    'words',                       # NER
    'stopwords',                   # corpus behind nltk.corpus.stopwords
):
    nltk.download(_resource)

# Pre-trained uncased BERT: the tokenizer first, then the encoder itself.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Descriptions that get embedded and clustered below.
texts = np.load('data/tweet_desc.npy')

def train_and_save_cluster():
    """Fit KMeans on BERT embeddings of ``texts`` and persist the results.

    The embedding matrix is loaded from
    ``data/media_bert_embeddings_array.npy`` when that file is readable.
    Otherwise each entry of the module-level ``texts`` is encoded with the
    module-level ``tokenizer``/``model`` (first-token hidden state of the
    first model output), and the stacked matrix is written to that path so
    the next run can reuse it.

    NOTE: the cache is not invalidated when ``texts`` changes; delete the
    file to force a recompute.

    Side effects:
        Creates ``saved/`` if needed, then writes the fitted model to
        ``saved/kmeans_bert_model.pkl`` and the per-text cluster ids to
        ``saved/bert_clusters.npy``.

    Returns:
        The fitted ``KMeans`` instance.
    """
    import os  # local import: only this function needs directory handling

    embeddings_path = 'data/media_bert_embeddings_array.npy'

    try:
        embeddings_array = np.load(embeddings_path)  # load if already saved
    except (OSError, ValueError) as e:
        # Missing or unreadable cache: say so and recompute, instead of
        # silently swallowing every possible exception.
        print(f"Could not load cached embeddings ({e}); recomputing.")

        # Inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            embeddings_list = [
                model(
                    tokenizer.encode(
                        str(text),
                        return_tensors='pt',
                        truncation=True,  # BERT accepts at most 512 tokens
                        max_length=512,
                    )
                )[0][:, 0, :].numpy()
                for text in texts
            ]
        embeddings_array = np.concatenate(embeddings_list, axis=0)

        # Persist the embeddings so the load branch above can hit next time.
        os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
        np.save(embeddings_path, embeddings_array)

    # Apply KMeans clustering to obtain one cluster id per embedding row.
    kmeans = KMeans(n_clusters=NUMBER_OF_CLUSTERS, n_init=10, random_state=42)
    bert_clusters = kmeans.fit_predict(embeddings_array)

    os.makedirs('saved', exist_ok=True)  # dump/save fail if the folder is missing
    joblib.dump(kmeans, 'saved/kmeans_bert_model.pkl')  # save model
    np.save('saved/bert_clusters.npy', bert_clusters)  # save clusters

    return kmeans


In [None]:
def get_cluster_keywords(texts, clusters):
    """Collect named entities and verbs from each text, grouped by cluster.

    Args:
        texts: Iterable of strings.
        clusters: Iterable of integer cluster ids, paired with ``texts`` by
            position (pairing stops at the shorter of the two).

    Returns:
        A list of ``NUMBER_OF_CLUSTERS`` lists; entry ``i`` holds the
        keywords gathered from every text assigned to cluster ``i``, in
        input order. For each text the lower-cased named-entity strings
        come first, followed by the tokens whose POS tag starts with "VB".

    Raises:
        IndexError: If a cluster id is outside ``range(NUMBER_OF_CLUSTERS)``
            (negative ids wrap around like any list index).
    """
    cluster_keywords = [[] for _ in range(NUMBER_OF_CLUSTERS)]

    for text, cluster_id in zip(texts, clusters):
        # Tokenize and POS-tag the text.
        pos_tags = pos_tag(word_tokenize(text))

        # ne_chunk yields plain (word, tag) tuples for ordinary tokens and
        # labelled subtrees for named entities. Keep the entity *text*
        # (its leaves joined by spaces), not the entity type returned by
        # label(), which would only give names such as PERSON or GPE.
        named_entities = [
            " ".join(token for token, _ in chunk.leaves()).lower()
            for chunk in ne_chunk(pos_tags)
            if hasattr(chunk, 'label')
        ]

        # Every verb form (VB, VBD, VBG, ...) counts as an action verb.
        action_verbs = [word for (word, pos) in pos_tags if pos.startswith("VB")]

        cluster_keywords[cluster_id].extend(named_entities + action_verbs)

    return cluster_keywords

In [None]:
# Fit KMeans (also writes the model and cluster ids under saved/) and keep
# the fitted model for the lookups in later cells.
kmeans = train_and_save_cluster()

In [None]:
def get_keywords_from_media(kmeans, media_desc:str):
    """Return the keywords of the cluster that ``media_desc`` falls into.

    The description is embedded with the module-level ``tokenizer``/``model``
    (first-token hidden state of the first model output), assigned to a
    cluster by ``kmeans``, and the keywords of the corpus texts stored under
    that cluster id in ``saved/bert_clusters.npy`` are returned.

    Args:
        kmeans: Fitted clustering model exposing ``predict``.
        media_desc: Description text to look up.

    Returns:
        List of keyword strings for the predicted cluster (possibly empty).
    """
    # Inference only: no autograd. Truncate, since BERT takes at most 512 tokens.
    with torch.no_grad():
        token_ids = tokenizer.encode(
            media_desc, return_tensors='pt', truncation=True, max_length=512
        )
        embedding = model(token_ids)[0][:, 0, :].numpy()
    current_cluster_id = kmeans.predict(embedding)[0]

    clusters = np.load('saved/bert_clusters.npy')

    # Only texts of the predicted cluster contribute to the answer, so run
    # the costly tokenize/POS/NER pass on those alone rather than on the
    # whole corpus for every call. Order is kept, so the result is the same.
    member_texts = []
    member_ids = []
    for text, cluster_id in zip(texts, clusters):
        if cluster_id == current_cluster_id:
            member_texts.append(text)
            member_ids.append(cluster_id)

    cluster_keywords = get_cluster_keywords(member_texts, member_ids)
    return cluster_keywords[current_cluster_id]

COSINE SIMILARITY

In [None]:
# CountVectorizer lives in sklearn.feature_extraction.text; there is no
# `sklearn.vectorizers` module, so the previous import failed on load.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

def get_cosine_similarity(word1, word2):
    """Return the bag-of-words cosine similarity between two keywords.

    Both strings are turned into token-count vectors by a ``CountVectorizer``
    fitted on just the two inputs, and the cosine of the angle between those
    vectors is returned. (This is a lexical-overlap score; BERT embeddings
    are not involved.)

    Args:
        word1: First keyword or phrase.
        word2: Second keyword or phrase.

    Returns:
        The similarity as a float. ``0.0`` is returned when the vectorizer
        finds no usable token in either input (empty vocabulary).
    """
    try:
        counts = CountVectorizer().fit_transform([word1, word2])
    except ValueError:
        # Raised when neither input yields a token (e.g. empty strings):
        # there is nothing to compare, so treat the pair as dissimilar.
        return 0.0

    similarity_matrix = cosine_similarity(counts)

    # Entry [0, 1] compares word1 with word2; [0, 0] would only be word1's
    # similarity with itself.
    return float(similarity_matrix[0, 1])

def get_top_k_similar_words(list1, list2, k=1):
    """Rank every (list1 item, list2 item) pair by similarity and keep the best k.

    Args:
        list1: Keywords forming the first element of each pair.
        list2: Keywords forming the second element of each pair.
        k: How many of the highest-scoring pairs to return.

    Returns:
        A list of ``((word1, word2), score)`` tuples sorted by descending
        score; pairs with equal scores keep the order in which they were
        first scored.
    """
    # Score the full cross product; a repeated pair keeps its first position
    # and takes the most recently computed score.
    pair_scores = {
        (left, right): get_cosine_similarity(left, right)
        for left in list1
        for right in list2
    }

    ranked = list(pair_scores.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    return ranked[:k]


In [None]:
# Demo: compare two small keyword lists and report the best-matching pairs.
keywords_list1 = ["women", "blue shirt", "vests", "working", "working in factory", "factory"]
keywords_list2 = ["empowerment", "blue shirts", "workforce", "factory", "film production"]

# How many of the highest-scoring pairs to show.
top_k = 6

top_k_pairs = get_top_k_similar_words(keywords_list1, keywords_list2, k=top_k)

# One line per pair, best score first.
for (word1, word2), score in top_k_pairs:
    print(f"Similarity between '{word1}' and '{word2}': {score}")


LIKE MAPPING

In [13]:
def get_keywords_from_likes(likes, mapping_path='/content/drive/My Drive/Tech-Mid-Adobe/task2/likes_keywords_mapping.csv'):
    """Return the keywords recorded for the like count closest to ``likes``.

    Args:
        likes: Target number of likes.
        mapping_path: CSV file with a numeric ``likes`` column and a
            ``keywords`` column holding a Python literal (for example the
            text of a list of strings). Defaults to the original Google
            Drive location so existing calls keep working.

    Returns:
        The parsed ``keywords`` value of the row whose ``likes`` is nearest
        to the requested value (the first such row on ties).

    Raises:
        ValueError, SyntaxError: If the stored ``keywords`` text is not a
            plain Python literal.
    """
    import ast  # local import: only needed to parse the stored literal

    likes_keyword_mapping = pd.read_csv(mapping_path)
    closest_index = (likes_keyword_mapping['likes'] - likes).abs().idxmin()

    # SECURITY: this used to be eval(), which would execute arbitrary code
    # found in the CSV. literal_eval accepts only literals (lists, strings,
    # numbers, ...) and rejects everything else.
    return ast.literal_eval(likes_keyword_mapping.loc[closest_index, 'keywords'])

Check the results from this approach

In [12]:
# Prompt template. The {company}, {username}, {keywords} and {like}
# placeholders are filled in with str.format; the template itself is kept
# unformatted so it can be reused for any company.
prompt_given_company = "As the social media manager for '{company}' (Twitter: @{username}), create a tweet using the following keywords: {keywords}. Craft a message that aligns with our brand and is likely to receive at least {like} likes."

# Render the template with sample values so that a fresh run of this cell
# displays a concrete prompt rather than the raw placeholders.
example_prompt = prompt_given_company.format(
    company='CNN',
    username='CNN',
    keywords=['hey', 'mellow', 'tello'],
    like=100,
)

example_prompt

"As the social media manager for 'CNN' (Twitter: @CNN), create a tweet using the following keywords: ['hey', 'mellow', 'tello']. Craft a message that aligns with our brand and is likely to receive at least 100 likes."

In [None]:
# Accumulator for like/keyword mappings; nothing is appended to it in this cell.
like_mappings = []
# NOTE(review): `media_desc` is not assigned anywhere in the visible notebook;
# it must be defined before this cell runs, or the call raises NameError.
keywords = get_keywords_from_media(kmeans, media_desc)
