In [120]:
import re
import numpy as np
import pandas as pd
import nltk
import pickle
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [121]:
def clean_text(text):
    """
    Cleans input text by removing URLs, non-alphabetic characters, 
    converting to lowercase, and removing stopwords.
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [122]:
# Load dataset
df = pd.read_csv('content_with_logic.csv')

# Combine Caption, Hashtags, and Comments into one field
df['content'] = df['Caption'].fillna(
    '') + ' ' + df['Hashtags'].fillna('') + ' ' + df['Comment_Text'].fillna('')

# Clean content
df['clean_content'] = df['content'].apply(clean_text)

# Tokenize cleaned content
df['tokenized_content'] = df['clean_content'].apply(lambda x: x.split())

In [123]:
# Load dataset
df = pd.read_csv('content_with_logic.csv')

# Combine Caption, Hashtags, and Comments into one field
df['content'] = df['Caption'].fillna(
    '') + ' ' + df['Hashtags'].fillna('') + ' ' + df['Comment_Text'].fillna('')

# Clean content
df['clean_content'] = df['content'].apply(clean_text)

# Tokenize cleaned content
df['tokenized_content'] = df['clean_content'].apply(lambda x: x.split())

In [124]:
# Train Word2Vec model
word2vec_model = Word2Vec(
    sentences=df['tokenized_content'], vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(df['tokenized_content'], total_examples=len(
    df['tokenized_content']), epochs=10)

# Get average Word2Vec embeddings for each post


def get_avg_word2vec(tokens):
    vectors = [word2vec_model.wv[word]
               for word in tokens if word in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(100)  # Return zero vector if no words are found
    return np.mean(vectors, axis=0)


df['content_embeddings'] = df['tokenized_content'].apply(get_avg_word2vec)

In [125]:
# Fit TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['clean_content'])

In [126]:
# Ensure Engagement_Score and Sentiment_Score columns exist
df['Engagement_Score'] = pd.to_numeric(
    df.get('Engagement_Score', 0), errors='coerce').fillna(0)
df['Sentiment_Score'] = pd.to_numeric(
    df.get('Sentiment_Score', 0), errors='coerce').fillna(0)

# Calculate weighted score (70% engagement, 30% sentiment)
df['weighted_score'] = df['Engagement_Score'] * \
    0.7 + df['Sentiment_Score'] * 0.3

In [127]:
def recommend_posts(user_id, user_profiles):
    """
    Recommend posts for a user based on content similarity and weighted scores.
    """
    if user_id in user_profiles:
        user_profile_vector = user_profiles[user_id].reshape(1, -1)

        # Calculate cosine similarity between user profile and posts
        cosine_similarities = cosine_similarity(
            user_profile_vector, np.array(df['content_embeddings'].tolist()))

        # Rank posts by cosine similarity
        recommended_post_indices = np.argsort(
            cosine_similarities[0])[-10:]  # Top 10 recommendations
        recommended_posts = df.iloc[recommended_post_indices]

        # Re-rank posts based on weighted score
        final_recommendations = recommended_posts.sort_values(
            by='weighted_score', ascending=False)
        return final_recommendations[['post_id', 'Caption', 'Hashtags', 'weighted_score']]
    else:
        print(f"No profile found for user_id {user_id}")
        return None

In [128]:
def precision_at_k(recommended_posts, true_interactions, k):
    relevant = set(true_interactions[:k])
    recommended = set(recommended_posts[:k])
    return len(relevant.intersection(recommended)) / k


def recall_at_k(recommended_posts, true_interactions, k):
    relevant = set(true_interactions[:k])
    recommended = set(recommended_posts[:k])
    return len(relevant.intersection(relevant)) / len(relevant)


def f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

In [131]:
# Save user profiles for future use
with open('content.pkl', 'wb') as f:
    pickle.dump(user_profiles, f)

# Load saved profiles if needed
with open('content.pkl', 'rb') as f:
    user_profiles = pickle.load(f)

In [138]:
recommend_posts(552, user_profiles)

Unnamed: 0,post_id,Caption,Hashtags,weighted_score
495,1406,Arm provide music letter local record else abi...,technology it team pull,355.813
571,7313,Evening radio professional again interview sam...,trip,240.18
741,8174,Number radio pay western sound section authori...,though education rich industry,226.81
694,8209,Mind east attorney very industry manager week ...,difficult bank item prepare seek,207.21
302,4227,Generation memory economy agreement finish cho...,positive peace machine,207.192
668,5154,Occur material wish wrong once hard wind despi...,attention manage,205.505
194,8208,Near cut week business service position contai...,hear agree,204.185
397,3873,Hospital mind live late along benefit expert s...,rich short including hand,176.13
401,3377,Project sing minute manager operation region s...,former line,143.562
784,8011,Through give class wear by oil control five te...,agent,137.063
