In [64]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import nltk

In [66]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
# Combine Caption, Hashtags, and Comment_Text into a single content field
df['content'] = df['Caption'].fillna(
    '') + ' ' + df['Hashtags'].fillna('') + ' ' + df['Comment_Text'].fillna('')

In [68]:
# Clean the text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs
    # remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text


df['clean_content'] = df['content'].apply(clean_text)

In [69]:
# Apply TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['clean_content'])

In [70]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
# Select a user for recommendations
user_id = 100

# Get the user's profile vector and reshape it to be 2D (1, n_features)
user_profile_vector = np.asarray(
    user_profiles[user_id]).reshape(1, -1)  # Reshape to 2D

# Calculate cosine similarities between the user's profile and all post vectors
cosine_similarities = cosine_similarity(
    user_profile_vector, tfidf_matrix)  # Both inputs must be 2D

# Rank the posts based on cosine similarity (the higher the score, the better the match)
recommended_post_indices = np.argsort(
    cosine_similarities[0])[-10:]  # Top 10 recommended posts

# Output the recommended posts
recommended_posts = df.iloc[recommended_post_indices]

In [72]:
# Re-rank posts based on engagement and sentiment
df['weighted_score'] = df['Engagement_Score'] * \
    0.7 + df['Sentiment_Score'] * 0.3
final_recommendations = df.iloc[recommended_post_indices].sort_values(
    by='weighted_score', ascending=False)

In [73]:
df['Hashtags'].head(10)

0    experience across girl expect fight
1                          walk property
2                              cover hit
3      itself throughout thought citizen
4                  speak clear necessary
5              talk offer model continue
6                     cover foreign girl
7                             TV against
8                                    yes
9                     realize raise wall
Name: Hashtags, dtype: object

In [74]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [75]:
def precision_at_k(recommended_posts, true_interactions, k):
    relevant = set(true_interactions[:k])
    recommended = set(recommended_posts[:k])
    return len(relevant.intersection(recommended)) / k


# Calculate Precision@K for a user
precision_at_k(recommended_posts['post_id'],
               test[test['user_id'] == user_id]['post_id'], k=10)

0.0

In [77]:
# Output recommended posts
print("Top recommended posts for user_id = 1:")
recommended_posts[['post_id', 'Caption', 'Hashtags']]

Top recommended posts for user_id = 1:


Unnamed: 0,post_id,Caption,Hashtags
459,835,Successful stuff discover wear big girl his de...,between
814,15,Gun receive knowledge there.,best town today travel style
766,23,Memory knowledge western company son list ever...,performance boy but
85,541,Leave adult much drive dog wrong seat someone ...,chair great business
212,214,Phone that me federal part off economic ago role.,process car
191,0,Fire as assume idea moment.,learn too
261,498,Dog purpose since defense see employee player ...,behavior
381,768,Pretty big manage offer question new Mrs rich ...,future rate
168,457,Very dog finally and goal social worry threat.,give
128,380,Real Mrs doctor side phone goal no practice kn...,behavior dog as today


In [78]:
# Get the top 3 posts with the highest cosine similarity
top_post_indices = cosine_similarities.argsort(
)[0][-3:][::-1]  # Sort in descending order
recommended_posts = df.iloc[top_post_indices]

In [79]:
# Add a weighted score for engagement and sentiment
df['weighted_score'] = 0.7 * df['Engagement_Score'] + 0.3 * df['Sentiment_Score']

# Re-rank the recommended posts based on weighted score
recommended_posts = df.iloc[top_post_indices].sort_values(
    by='weighted_score', ascending=False)

# Output re-ranked posts
print("Re-ranked recommended posts (based on engagement and sentiment):")
recommended_posts[['post_id', 'Caption', 'Hashtags', 'weighted_score']]

Re-ranked recommended posts (based on engagement and sentiment):


Unnamed: 0,post_id,Caption,Hashtags,weighted_score
168,457,Very dog finally and goal social worry threat.,give,749.066
381,768,Pretty big manage offer question new Mrs rich ...,future rate,695.103
128,380,Real Mrs doctor side phone goal no practice kn...,behavior dog as today,643.964


In [80]:
# Function to calculate Precision@K
def precision_at_k(recommended_posts, true_posts, k):
    recommended = recommended_posts[:k]
    relevant = set(true_posts)
    return len(set(recommended) & relevant) / k


# Example: User 1's true interactions (posts they liked or commented on)
true_posts = df[df['user_id'] == 1]['post_id'].values

# Calculate Precision@K for K=3
precision_k = precision_at_k(
    recommended_posts['post_id'].values, true_posts, 3)
print(f"Precision@3 for user_id = 1: {precision_k}")

Precision@3 for user_id = 1: 1.0


In [81]:
import pickle
# Save the user profiles
with open('content.pkl', 'wb') as f:
    pickle.dump(user_profiles, f)
# Load the user profiles
with open('content.pkl', 'rb') as f:
    user_profiles = pickle.load(f)

In [82]:
df.head()

Unnamed: 0,user_id,post_id,Username,Caption,Hashtags,Likes,Comments,Comment_Text,Sentiment_Score,Sentiment_Label,Engagement_Score,content,weighted_score,clean_content
0,1,599,michaelsilva,Turn wish sure garden manage cut movement Mrs ...,experience across girl expect fight,252,176,Effort building national cost claim baby assum...,0.3,Neutral,428,Turn wish sure garden manage cut movement Mrs ...,299.69,turn wish sure garden manage cut movement mrs ...
1,1,565,jchen,Top job who every business hit let with purpose.,walk property,951,103,Moment around hospital require or whole around...,0.86,Positive,1054,Top job who every business hit let with purpos...,738.058,top job every business hit let purpose walk pr...
2,1,592,thurley,Easy cause than whom training cut manager prod...,cover hit,711,168,Run beautiful true thought beat decision neces...,-0.6,Negative,879,Easy cause than whom training cut manager prod...,615.12,easy cause training cut manager production sys...
3,1,96,ujoyce,Several foreign none public treatment college ...,itself throughout thought citizen,426,200,Language cell charge second hour west month ag...,-0.12,Neutral,626,Several foreign none public treatment college ...,438.164,several foreign none public treatment college ...
4,1,858,klinemichelle,Stop particularly reason center leader because...,speak clear necessary,981,71,Letter speak spend suffer Mr player leg.,0.48,Positive,1052,Stop particularly reason center leader because...,736.544,stop particularly reason center leader dream f...
