In [2]:
import pandas as pd
import numpy as np

text_inputs = [
    'hey i like really like coding',
    'i dont like cooking at all',
    'i woke up and choose violence',
    'im just bad using snapchat',
    'twitter is going to die',
    'elon aint even bad',
    'i hope the new avatar movies arent made too long'
]

text_inputs = [item for item in text_inputs if not isinstance(item, int)]

# Create a DataFrame with a single column named "post"
df = pd.DataFrame({'post': text_inputs})

# Rename the default index with a custom name
df.index = pd.Index(range(1, len(df) + 1), name='S.No')
# df.style.set_properties(subset=['post'], **{'text-align': 'left'})
df

Unnamed: 0_level_0,post
S.No,Unnamed: 1_level_1
1,hey i like really like coding
2,i dont like cooking at all
3,i woke up and choose violence
4,im just bad using snapchat
5,twitter is going to die
6,elon aint even bad
7,i hope the new avatar movies arent made too long


In [3]:
df['Tags'] = df['post'].apply(lambda x: [[word for word in x.split()]])

df['Tags'] = df['Tags'].apply(lambda x: x[0])

In [4]:
df

Unnamed: 0_level_0,post,Tags
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,hey i like really like coding,"[hey, i, like, really, like, coding]"
2,i dont like cooking at all,"[i, dont, like, cooking, at, all]"
3,i woke up and choose violence,"[i, woke, up, and, choose, violence]"
4,im just bad using snapchat,"[im, just, bad, using, snapchat]"
5,twitter is going to die,"[twitter, is, going, to, die]"
6,elon aint even bad,"[elon, aint, even, bad]"
7,i hope the new avatar movies arent made too long,"[i, hope, the, new, avatar, movies, arent, mad..."


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

input_text = input("Enter your text: ")
vectorizer = CountVectorizer(stop_words = list(ENGLISH_STOP_WORDS))
    
X = vectorizer.fit_transform([input_text])

feature_names = vectorizer.get_feature_names_out()

print(f'feature_names: {feature_names}')

top_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1][:10]]
print(f'\n top_tags: {top_keywords}')


Enter your text: i like drinking coffee
feature_names: ['coffee' 'drinking' 'like']

 top_tags: [array([['like', 'drinking', 'coffee']], dtype=object)]


In [6]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def generate_context_tags(sentence, n_topics=2, top_words=5, random_state=30):
    # Create a list with a single sentence
    sentences = [sentence]

    # Vectorize the text data
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)

    # Specify the number of topics (n_components) as an integer greater than 0
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)

    # Fit the model
    lda.fit(X)

    # Transform the test data (your sentence)
    sentence_vectorized = vectorizer.transform([sentence])

    # Get the topic distribution for the sentence
    topic_distribution = lda.transform(sentence_vectorized)[0]

    # Get the feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get the indices of the top words for each topic
    top_words_indices = topic_distribution.argsort()[::-1][:min(top_words, len(feature_names))]

    # Get the actual words
    top_words = [feature_names[i] for i in top_words_indices]

    return top_words

# Example usage:
sentence = 'I enjoy programming in python'
context_tags = generate_context_tags(sentence, n_topics=3, top_words=4)

print(f"Original Sentence: {sentence}")
print(f"Context Tags: {context_tags}")


Original Sentence: I enjoy programming in python
Context Tags: ['python', 'enjoy', 'programming']


In [7]:
def extract_top_keywords(posts, n_top_keywords=10):
    vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
    X = vectorizer.fit_transform(posts)
    
    feature_names = vectorizer.get_feature_names_out()
    print(f'feature_names: {feature_names}')

    top_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1][:n_top_keywords]]
    return top_keywords

In [8]:
from pandas.core.common import flatten

df['top_keywords'] = df['post'].apply(lambda x: extract_top_keywords([x])[0])

feature_names: ['coding' 'hey' 'like' 'really']
feature_names: ['cooking' 'dont' 'like']
feature_names: ['choose' 'violence' 'woke']
feature_names: ['bad' 'im' 'just' 'snapchat' 'using']
feature_names: ['die' 'going' 'twitter']
feature_names: ['aint' 'bad' 'elon']
feature_names: ['arent' 'avatar' 'hope' 'long' 'movies' 'new']


In [9]:
df['top_keywords'] = df['top_keywords'].apply(lambda x: list(flatten(x)))

In [10]:
df['context_tags'] = df['post'].apply(generate_context_tags)

In [11]:
df

Unnamed: 0_level_0,post,Tags,top_keywords,context_tags
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,hey i like really like coding,"[hey, i, like, really, like, coding]","[like, really, hey, coding]","[hey, coding]"
2,i dont like cooking at all,"[i, dont, like, cooking, at, all]","[like, dont, cooking]","[cooking, dont]"
3,i woke up and choose violence,"[i, woke, up, and, choose, violence]","[woke, violence, choose]","[choose, violence]"
4,im just bad using snapchat,"[im, just, bad, using, snapchat]","[using, snapchat, just, im, bad]","[im, bad]"
5,twitter is going to die,"[twitter, is, going, to, die]","[twitter, going, die]","[die, going]"
6,elon aint even bad,"[elon, aint, even, bad]","[elon, bad, aint]","[aint, bad]"
7,i hope the new avatar movies arent made too long,"[i, hope, the, new, avatar, movies, arent, mad...","[new, movies, long, hope, avatar, arent]","[avatar, arent]"


In [12]:
def find_intersection(row):
    top_keywords_set = set(row['top_keywords'])
    context_tags_set = set(row['context_tags'])
    intersection = top_keywords_set.intersection(context_tags_set)
    return list(intersection)

In [13]:
df['actual_tags'] = df.apply(find_intersection, axis=1)

In [14]:
df

Unnamed: 0_level_0,post,Tags,top_keywords,context_tags,actual_tags
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,hey i like really like coding,"[hey, i, like, really, like, coding]","[like, really, hey, coding]","[hey, coding]","[coding, hey]"
2,i dont like cooking at all,"[i, dont, like, cooking, at, all]","[like, dont, cooking]","[cooking, dont]","[cooking, dont]"
3,i woke up and choose violence,"[i, woke, up, and, choose, violence]","[woke, violence, choose]","[choose, violence]","[choose, violence]"
4,im just bad using snapchat,"[im, just, bad, using, snapchat]","[using, snapchat, just, im, bad]","[im, bad]","[im, bad]"
5,twitter is going to die,"[twitter, is, going, to, die]","[twitter, going, die]","[die, going]","[die, going]"
6,elon aint even bad,"[elon, aint, even, bad]","[elon, bad, aint]","[aint, bad]","[aint, bad]"
7,i hope the new avatar movies arent made too long,"[i, hope, the, new, avatar, movies, arent, mad...","[new, movies, long, hope, avatar, arent]","[avatar, arent]","[arent, avatar]"


In [29]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

def extract_top_keywords(posts, n_top_keywords=10):
    vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
    X = vectorizer.fit_transform(posts)
    
    feature_names = vectorizer.get_feature_names_out()
    print(f'feature_names: {feature_names}')

    top_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1][:n_top_keywords]]
    return top_keywords

def generate_context_tagsss(sentence, n_topics=2, top_words=5, random_state=30):
    # Create a list with a single sentence
    sentences = [sentence]

    # Vectorize the text data
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)

    # Specify the number of topics (n_components) as an integer greater than 0
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)

    # Fit the model
    lda.fit(X)

    # Transform the test data (your sentence)
    sentence_vectorized = vectorizer.transform([sentence])

    # Get the topic distribution for the sentence
    topic_distribution = lda.transform(sentence_vectorized)[0]

    # Get the feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get the indices of the top words for each topic
    top_words_indices = topic_distribution.argsort()[::-1][:min(top_words, len(feature_names))]

    # Get the actual words
    top_words = [feature_names[i] for i in top_words_indices]

    return top_words


# Example usage:
post = 'I enjoy coding in python'
context_keywords = generate_context_tags(post)
top_keywords = extract_top_keywords([post])

print(f"Post: {post}")
# print(f"Intersection Tags: {intersection_tags}")
t_keyword = (top_keywords[0]).tolist()
t_keyword = t_keyword[0]
print(t_keyword ,'\t', context_keywords)

# ---------------------- intersection tags generation ------------------------------------------------------
def find_intersection_tags(top_keywords, context_tags):
    # Convert lists to sets
    top_keywords_set = set(top_keywords)
    context_tags_set = set(context_tags)

    # Find the intersection of sets
    intersection_set = top_keywords_set.intersection(context_tags_set)

    # Convert the result back to a list
    intersection_list = list(intersection_set)    
    return intersection_list


a_tag = find_intersection_tags(t_keyword,context_keywords)

a_tag

feature_names: ['coding' 'enjoy' 'python']
Post: I enjoy coding in python
['python', 'enjoy', 'coding'] 	 ['coding', 'enjoy']


['coding', 'enjoy']

In [30]:
def get_posts_with_tags(df, tags_to_check):
    # Convert tags_to_check to a set for efficient membership checking
    tags_set = set(tags_to_check)
    
    # Create a boolean mask indicating rows where any tag is present
    mask = df['actual_tags'].apply(lambda tags: bool(tags_set.intersection(tags)))

    # Filter the DataFrame based on the mask
    filtered_df = df[mask]

    return filtered_df

In [31]:
filtered_posts_df = get_posts_with_tags(df, a_tag)

print(f"Posts with tags {a_tag}:")
print(filtered_posts_df[['actual_tags', 'post']])

Posts with tags ['coding', 'enjoy']:
        actual_tags                           post
S.No                                              
1     [coding, hey]  hey i like really like coding
