BERT EMBEDDINGS CLUSTERING

In [None]:
NUMBER_OF_CLUSTERS = 100

In [None]:
!pip3 install torchvision torchaudio

In [None]:
!pip install torch torchvision torchaudio
!pip install transformers
!pip install nltk
!pip install sklearn
!pip install numpy
!pip install joblib
!pip install pandas

In [None]:
!pip install spacy
!pip install -U scikit-learn

In [None]:
!pip install --upgrade -q google-api-python-client google-auth-httplib2 google-auth-oauthlib
!pip install google-generativeai
!python -m spacy download en_core_web_sm

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import joblib
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag, word_tokenize, ne_chunk
from nltk.corpus import stopwords
nltk.download('punkt')  # for tokenization
nltk.download('averaged_perceptron_tagger')  # for POS-tagging
nltk.download('maxent_ne_chunker')  # for NER
nltk.download('words')  # for NER
nltk.download('stopwords')  # stop-word corpus backing the nltk.corpus.stopwords import above

# Load pre-trained BERT model and tokenizer.
# NOTE: `tokenizer` and `model` are module-level names that the functions below
# read at call time; later cells in this notebook rebind `model`, so those
# functions must be called before that happens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the tweet records. Other cells index each record with keys such as
# 'media', 'company' and 'username'.
# allow_pickle=True unpickles arbitrary objects -- only load trusted files.
# for company
texts = np.load('../data/test_company_tweet_desc.npy', allow_pickle=True)

# for time
# texts = np.load('../data/test_time_tweet_desc.npy', allow_pickle=True)

def train_and_save_cluster():
    """Fit KMeans on BERT embeddings of the media descriptions and persist the result.

    Embeddings are loaded from ``../data/media_bert_embeddings_array.npy`` when
    that file is readable. Otherwise they are computed from the module-level
    ``texts`` using the module-level ``tokenizer`` and ``model``: for each text
    the vector at sequence position 0 of the model's first output is kept.

    Side effects:
        Writes the fitted model to ``../saved/kmeans_bert_model.pkl`` and the
        per-text cluster ids to ``../saved/bert_clusters.npy``.

    Returns:
        The fitted ``KMeans`` instance (``NUMBER_OF_CLUSTERS`` clusters).
    """
    try:
        embeddings_array = np.load('../data/media_bert_embeddings_array.npy')  # load if already saved
    except (OSError, ValueError, EOFError) as exc:
        # Missing or unreadable cache: say so instead of failing over silently.
        print(f"No usable cached embeddings ({exc}); computing BERT embeddings.")
        first_token_vectors = []
        # Inference only: no_grad avoids building an autograd graph per text.
        with torch.no_grad():
            for text in texts:
                # Records elsewhere in this notebook are indexed as text['media'];
                # plain strings are still accepted as-is.
                description = text if isinstance(text, str) else text['media']
                # truncation=True caps the input at the tokenizer's model maximum
                # so long descriptions do not overflow the position embeddings.
                token_ids = tokenizer.encode(description, return_tensors='pt', truncation=True)
                first_token_vectors.append(model(token_ids)[0][:, 0, :].numpy())
        embeddings_array = np.concatenate(first_token_vectors, axis=0)

    num_clusters = NUMBER_OF_CLUSTERS  # Adjust based on your optimal number of clusters
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    bert_clusters = kmeans.fit_predict(embeddings_array)

    joblib.dump(kmeans, '../saved/kmeans_bert_model.pkl')  # save model
    np.save('../saved/bert_clusters.npy', bert_clusters)  # save clusters

    return kmeans


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def extract_keywords(text):
    """Return lemmas of the content words in ``text`` followed by lemmas of its verbs.

    The text is parsed with the module-level spaCy pipeline ``nlp``. The first
    part of the result holds the lemma of every alphabetic, non-stop-word
    token; the second part holds the lemma of every token tagged ``VERB``.
    A token satisfying both conditions therefore appears in both parts.
    """
    content_lemmas = []
    verb_lemmas = []

    for tok in nlp(text):
        if tok.is_alpha and not tok.is_stop:
            content_lemmas.append(tok.lemma_)
        if tok.pos_ == "VERB":
            verb_lemmas.append(tok.lemma_)

    return content_lemmas + verb_lemmas

In [None]:
def get_cluster_keywords(texts, clusters):
    """Collect NLTK-derived keywords for every cluster.

    Args:
        texts: iterable of records; ``record['media']`` is the description string.
        clusters: cluster id of each record, aligned with ``texts``.

    Returns:
        A list indexed by cluster id. Each entry is the list of keywords
        gathered from the records assigned to that cluster, in input order:
        for each record, the lower-cased text of its named-entity chunks
        followed by its verbs (POS tags starting with ``VB``). The list has
        at least ``NUMBER_OF_CLUSTERS`` entries and grows if a larger id occurs.
    """
    cluster_ids = [int(cluster_id) for cluster_id in clusters]
    # Size by the largest id seen so a cluster file produced with a different
    # NUMBER_OF_CLUSTERS cannot cause an IndexError below.
    slot_count = max([NUMBER_OF_CLUSTERS] + [cluster_id + 1 for cluster_id in cluster_ids])
    cluster_keywords = [[] for _ in range(slot_count)]

    for text, cluster_id in zip(texts, cluster_ids):
        # Tokenize and POS-tag the text
        pos_tags = pos_tag(word_tokenize(text['media']))

        # Apply NER using NLTK. Entity chunks are subtrees (they have .label());
        # ordinary tokens are plain (word, tag) tuples.
        tree = ne_chunk(pos_tags)
        # Keep the entity *text* (its leaf words), not the entity type label.
        named_entities = [
            " ".join(word for word, _ in chunk.leaves()).lower()
            for chunk in tree
            if hasattr(chunk, 'label')
        ]

        # Extract action verbs
        action_verbs = [word for (word, pos) in pos_tags if pos.startswith("VB")]

        cluster_keywords[cluster_id].extend(named_entities + action_verbs)

    return cluster_keywords

In [None]:
kmeans = train_and_save_cluster()

In [None]:
def get_keywords_from_media(kmeans, media_desc:str):
    """Return the keywords of the cluster that ``media_desc`` falls into.

    The description is embedded with the module-level ``tokenizer`` and
    ``model`` (vector at sequence position 0 of the model's first output),
    assigned to a cluster with ``kmeans.predict``, and the keywords of the
    stored texts belonging to that cluster are returned.

    Args:
        kmeans: fitted clustering model exposing ``predict``.
        media_desc: media description text.

    Returns:
        List of keywords gathered from the module-level ``texts`` whose saved
        cluster id (``../saved/bert_clusters.npy``) equals the predicted one.
    """
    # Inference only; truncation keeps long descriptions within the model limit.
    with torch.no_grad():
        token_ids = tokenizer.encode(media_desc, return_tensors='pt', truncation=True)
        embedding = model(token_ids)[0][:, 0, :].numpy()
    current_cluster_id = kmeans.predict(embedding)[0]

    clusters = np.load('../saved/bert_clusters.npy')

    # Only the predicted cluster is needed, so tag just its members instead of
    # running tokenization/POS/NER over the whole corpus on every call.
    members = [
        text for text, cluster_id in zip(texts, clusters)
        if cluster_id == current_cluster_id
    ]
    cluster_keywords = get_cluster_keywords(members, [current_cluster_id] * len(members))

    return cluster_keywords[current_cluster_id]

LIKE MAPPING

In [None]:
def get_keywords_from_likes(likes, mapping_path='../data/likes_keywords_mapping.csv'):
    """Return the keyword list recorded for the like count closest to ``likes``.

    Args:
        likes: target number of likes.
        mapping_path: CSV file with a numeric ``likes`` column and a
            ``keywords`` column holding a Python literal (e.g. a list of
            strings) as text.

    Returns:
        The parsed ``keywords`` value of the row whose ``likes`` is nearest to
        the requested value (first such row on ties).

    Raises:
        ValueError / SyntaxError: if the stored ``keywords`` text is not a
            plain Python literal.
    """
    import ast  # local import: only this function needs it

    likes_keyword_mapping = pd.read_csv(mapping_path)
    closest_index = (likes_keyword_mapping['likes'] - likes).abs().idxmin()

    # SECURITY: the cell content comes from a data file, so it is parsed with
    # ast.literal_eval (literals only) rather than eval, which would execute
    # arbitrary expressions embedded in the CSV.
    return ast.literal_eval(likes_keyword_mapping.loc[closest_index, 'keywords'])

Check the results from this approach

In [None]:
# Smoke test: build one prompt from the first record in `texts`.
# keywords = ['hey', 'mellow', 'tello']
text = texts[0]
company = text['company']
username = text['username']
like = 100  # target like count used for this example

# `like_mappings` and `keywords` are also read by a later cell that trains the
# Word2Vec model, so this cell must run before it.
like_mappings = get_keywords_from_likes(like)
keywords = get_keywords_from_media(kmeans, text['media'])

# Earlier prompt wording, kept for reference:
# prompt_given_company = f"You are the social-media manager of company '{company}' having twitter username '{username}' and you have the following keywords {str(k)}, you have write a tweet in the same format as the previous tweets of your company using the given keywords  so that it gets atleast {like} likes"

prompt_given_company = f"As the social media manager for '{company}' (Twitter: @{username}), create a tweet using the following keywords: {str(keywords)}. Craft a message that aligns with our brand and is likely to receive at least {like} likes."

# Last expression of the cell: displays the assembled prompt.
prompt_given_company

In [None]:
!pip install gensim

In [None]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.rom_pretrained('bert-base-uncased')

def get_word2vec_model(sentences):
    """Train a Word2Vec model (100-dim vectors, window 5, min_count 1) on ``sentences``.

    Args:
        sentences: iterable of token lists.
    """
    return Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Keep the Word2Vec model under its own name. The name `model` already holds the
# BERT encoder and is rebound again further down (PaLM model / model name);
# sharing it made `model.wv` below raise AttributeError -- which `except KeyError`
# does not catch -- as soon as those later cells had run.
word2vec_model = get_word2vec_model([keywords, like_mappings])

def get_cosine_similarity(word1, word2):
    """Cosine similarity of two words under ``word2vec_model``.

    Returns:
        ``1.0`` when the words are equal ignoring case, ``0.0`` when either
        word is missing from the Word2Vec vocabulary, otherwise the cosine
        similarity of their vectors.
    """
    if word1.lower() == word2.lower():  # same words, similarity is 1
        return 1.0

    try:
        vec1 = word2vec_model.wv[word1]
        vec2 = word2vec_model.wv[word2]
    except KeyError:
        return 0.0  # Return 0 if either word is not in the vocabulary

    return cosine_similarity([vec1], [vec2])[0, 0]

def get_top_k_similar_words(list1, list2, k=1):
    """Return up to ``k`` most similar (word1, word2) pairs across two word lists.

    Similarity of every pair is computed with ``get_cosine_similarity``. The
    ``k`` highest-scoring pairs are selected, and pairs whose score is not
    below 1.0 (e.g. identical words) are then dropped, so fewer than ``k``
    pairs may be returned.

    Args:
        list1: words supplying the first element of each pair.
        list2: words supplying the second element of each pair.
        k: number of top-scoring pairs to select before filtering.

    Returns:
        List of ``(word1, word2, similarity)`` tuples in no particular order;
        empty when either list is empty or ``k`` is not positive.
    """
    # Create matrix of all pairwise similarities
    matrix = np.zeros((len(list1), len(list2)))
    for i, word1 in enumerate(list1):
        for j, word2 in enumerate(list2):
            matrix[i, j] = get_cosine_similarity(word1, word2)

    # argpartition raises when k exceeds the number of pairs, and k == 0 would
    # make the `[-k:]` slice select everything -- clamp and bail out early.
    k = min(k, matrix.size)
    if k <= 0:
        return []

    # Get the indices of the top-k similarities
    indices = np.argpartition(matrix, -k, axis=None)[-k:]
    top_k_indices = np.unravel_index(indices, matrix.shape)

    # Keep only pairs with similarity score less than 1.0
    return [
        (list1[i], list2[j], matrix[i, j])
        for i, j in zip(*top_k_indices)
        if matrix[i, j] < 1.00
    ]



TESTING

In [None]:
# Install the client library and import necessary modules.
# !pip install google-generativeai
import google.generativeai as palm
import base64
import json
import pprint

In [None]:
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ['https://www.googleapis.com/auth/generative-language.tuning']

def load_creds():
    """Build a credentials object from `oauth-client-id.json`, caching tokens on disk.

    Tokens are cached in `token.json` so the consent screen is only needed
    when no usable cached credentials exist.
    """
    token_path = 'token.json'

    # token.json holds the user's access and refresh tokens; it is written
    # below once an authorization or refresh has completed.
    cached = None
    if os.path.exists(token_path):
        cached = Credentials.from_authorized_user_file(token_path, SCOPES)

    # Cached credentials that are still valid can be used as they are.
    if cached and cached.valid:
        return cached

    creds = cached
    if creds and creds.expired and creds.refresh_token:
        # Expired but refreshable: renew without user interaction.
        creds.refresh(Request())
    else:
        # Nothing usable cached: let the user log in.
        auth_flow = InstalledAppFlow.from_client_secrets_file('oauth-client-id.json', SCOPES)
        creds = auth_flow.run_local_server(port=0)

    # Save the credentials for the next run
    with open(token_path, 'w') as token_file:
        token_file.write(creds.to_json())
    return creds

In [None]:
import pprint
import google.generativeai as palm

# Obtain OAuth credentials via load_creds() (which may start an interactive
# login when no valid cached token exists).
creds = load_creds()

# Hand the credentials to the generative-language client.
palm.configure(credentials=creds)

# Sanity check that the credentials work: list the models visible to this account.
print('Available base models:', [m.name for m in palm.list_models()])
print('My tuned models:', [m.name for m in palm.list_tuned_models()])

In [None]:
model_name = 'models/text-bison-001'
model = palm.get_model(model_name)

In [None]:
# Generation settings, bundled into `defaults` and expanded as keyword
# arguments to palm.generate_text() by get_response().
# NOTE: this rebinds the module-level name `model` to the model-name string,
# replacing whatever object `model` referred to before this cell ran.
model = model_name 
temperature = 0  # sampling temperature
candidate_count = 1  # number of candidates requested per prompt
top_k = 40  # top-k sampling setting
top_p = 0.95  # top-p sampling setting
max_output_tokens = 1024  # upper bound on generated tokens

defaults = {
    'model': model,
    'temperature': temperature,
    'candidate_count': candidate_count,
    'top_k': top_k,
    'top_p': top_p,
    'max_output_tokens': max_output_tokens,
}

In [None]:
def get_response(prompt, defaults):
    """Generate text for ``prompt`` and return the first candidate's output.

    ``defaults`` is expanded as keyword arguments to ``palm.generate_text``
    alongside ``prompt``. When the response carries no candidates, a single
    space ``' '`` is returned instead.
    """
    completion = palm.generate_text(**defaults, prompt=prompt)
    candidates = completion.candidates

    if len(candidates) != 0:
        return candidates[0]['output']
    return ' '

In [None]:
prompt_template = "As the social media manager for '{company}' (Twitter: @{username}), create a tweet using the following keywords: {keywords}. Craft a message that aligns with our brand and is likely to receive at least {like} likes."

In [None]:
# `generated_and_actual` only exists as a local variable inside run_test() and is
# never bound at notebook level, so evaluating it here raises NameError on a
# fresh kernel (Restart & Run All). Disabled until run_test exposes its results.
# generated_and_actual[1]

In [None]:
import numpy as np
import pandas as pd
test_tweet = np.load('../data/test_company_tweet_desc.npy', allow_pickle=True)

def run_test(test_type='time'):
    """Generate a tweet for each record in ``test_tweet`` and write the results to CSV.

    For every record, keywords are extracted from its media description and
    extended with the like-mapping words taken from the top 10 most similar
    (media keyword, like keyword) pairs. The prompt template is then filled in
    and sent to the text model. A record that raises any exception is reported
    on stdout with its index and skipped.

    Args:
        test_type: label inserted into the output file name
            ``../saved/test_{test_type}_final_submit_result.csv``.

    The CSV has one row per successfully processed record with the columns
    ``id`` and ``generated``.
    """
    rows = []

    for i, tweet in enumerate(test_tweet):
        try:
            like = tweet['likes']
            company = tweet['company']
            date = tweet['date']
            media_keywords = extract_keywords(tweet['media'])
            username = tweet['username']
            like_keywords = get_keywords_from_likes(like)

            similar_pairs = get_top_k_similar_words(media_keywords, like_keywords, k=10)

            # Second element of each pair is the word from the like mapping.
            all_keywords = media_keywords + [pair[1] for pair in similar_pairs]
            prompt = prompt_template.format(
                company=company,
                date=date,
                like=like,
                keywords=str(all_keywords),
                username=username,
            )
            generated = get_response(prompt, defaults)

            rows.append({'id': tweet['id'], 'generated': generated})

        except Exception as e:
            print(f"Error: {e}. index: {i}")

    results = pd.DataFrame(rows)
    results.to_csv(f'../saved/test_{test_type}_final_submit_result.csv', index=False)


In [None]:
run_test('company')