In [1]:
from faker import Faker
import random
import pandas as pd

<h3> Content Based Filtiring </h3>

In [16]:
# Initialize Faker
fake = Faker()

# Function to create synthetic dataset with logical relationships


def create_synthetic_data_with_logic(num_samples=1000):
    data = {
        'user_id': [],            # Unique identifier for each user
        'post_id': [],            # Unique identifier for each post
        'Username': [],           # Username of the user
        'Caption': [],            # Caption or content of the post
        'Hashtags': [],           # Hashtags in the post
        'Likes': [],              # Number of likes on the post
        'Comments': [],           # Number of comments on the post
        'Comment_Text': [],       # Text of comments
        'Sentiment_Score': [],    # Sentiment score of comments or post
        'Sentiment_Label': [],    # Sentiment label: positive, neutral, or negative
        'Engagement_Score': []    # Engagement score based on likes and comments
    }

    # Sentiment mapping for random generation
    sentiment_mapping = {
        0: "Negative",
        1: "Neutral",
        2: "Positive"
    }

    for _ in range(num_samples):
        # Generate user data
        user_id = random.randint(0, 1000)
        post_id = random.randint(0, 10000)
        username = fake.user_name()
        post_length = random.randint(5, 150)
        post_content = fake.sentence(nb_words=post_length)
        hashtags_count = random.randint(1, 5)
        hashtags = ' '.join([fake.word() for _ in range(hashtags_count)])

        # Simulate post sentiment based on length and hashtags
        sentiment_score = round(random.uniform(-1, 1), 2)

        # More hashtags and longer posts tend to have more engagement
        if post_length > 100 or hashtags_count > 3:
            # Increase sentiment for lengthy posts
            sentiment_score = max(sentiment_score, 0.5)

        sentiment_label = sentiment_mapping[min(
            max(int((sentiment_score + 1) * 1.5), 0), 2)]

        # Likes and comments depend on the sentiment and number of hashtags
        likes = random.randint(0, 500) + \
            int(200 * sentiment_score) + hashtags_count * 50
        comments = random.randint(
            0, 100) + int(50 * sentiment_score) + hashtags_count * 10

        comment_text = fake.sentence(nb_words=random.randint(5, 20))

        # Engagement score: weighted sum of likes and comments (likes more important)
        engagement_score = likes * 0.7 + comments * 0.3

        # Append data
        data['user_id'].append(user_id)
        data['post_id'].append(post_id)
        data['Username'].append(username)
        data['Caption'].append(post_content)
        data['Hashtags'].append(hashtags)
        data['Likes'].append(likes)
        data['Comments'].append(comments)
        data['Comment_Text'].append(comment_text)
        data['Sentiment_Score'].append(sentiment_score)
        data['Sentiment_Label'].append(sentiment_label)
        data['Engagement_Score'].append(engagement_score)

    return pd.DataFrame(data)


# Create synthetic dataset with logic
synthetic_data = create_synthetic_data_with_logic(num_samples=1000)

# Save to CSV
synthetic_data.to_csv('content_with_logic.csv', index=False)

# Display the first few rows of the dataset
synthetic_data.head()

Unnamed: 0,user_id,post_id,Username,Caption,Hashtags,Likes,Comments,Comment_Text,Sentiment_Score,Sentiment_Label,Engagement_Score
0,273,9776,anthony09,Challenge action structure economy training st...,both stand hope,602,90,Decade phone opportunity several deep.,0.93,Positive,448.4
1,319,9159,vsimmons,Red itself turn sign agent base office drive o...,account soon notice strategy,618,146,Training bit girl center body style cut story ...,0.5,Positive,476.4
2,702,1359,fbauer,Crime born whatever place course knowledge rec...,significant than born sound,327,108,Raise pattern development full protect ground.,0.5,Positive,261.3
3,682,9302,kimberly38,Million modern rather five population culture ...,level question,265,-13,Left make leg face hot fly market even seat ho...,-0.98,Negative,181.6
4,238,7434,lozanozachary,Election reason as specific politics really wi...,minute keep little,372,22,World former song no beyond few three exist re...,-0.52,Negative,267.0


In [17]:
synthetic_data['Caption'][2]

'Crime born whatever place course knowledge recently different but rich up about PM.'

<h3> Collaborative Filtring </h3>

In [4]:
import random
import pandas as pd
from faker import Faker

# Initialize Faker
fake = Faker()

# Parameters
num_users = 100
num_posts = 500

# Sample data lists for captions and hashtags (with logical associations)
caption_hashtag_pairs = {
    "Enjoying the sunset": ["#sunset", "#evening", "#nature"],
    "Morning vibes": ["#morning", "#freshstart", "#sunrise"],
    "Love this beach": ["#beach", "#ocean", "#sun"],
    "Hiking adventures": ["#hiking", "#outdoors", "#adventure"],
    "Coffee time": ["#coffee", "#morning", "#caffeine"],
    "Feeling blessed": ["#blessed", "#grateful", "#thankful"],
    "Weekend getaway": ["#weekend", "#travel", "#getaway"],
    "Road trip": ["#roadtrip", "#adventure", "#travel"],
    "Exploring the mountains": ["#mountains", "#hiking", "#nature"],
    "City lights": ["#city", "#nightlife", "#urban"],
    "Healthy living": ["#healthyliving", "#fitness", "#wellness"],
    "Family time": ["#family", "#love", "#qualitytime"],
    "Throwback to summer": ["#summer", "#memories", "#throwback"],
    "Dinner with friends": ["#dinner", "#friends", "#goodtimes"],
    "Workout motivation": ["#workout", "#fitness", "#motivation"],
    "Chasing waterfalls": ["#waterfalls", "#nature", "#hiking"],
    "Sunset views": ["#sunset", "#views", "#sky"],
    "Beach day": ["#beach", "#sun", "#sand"],
    "Healthy breakfast": ["#breakfast", "#healthy", "#food"],
    "Travel goals": ["#travel", "#adventure", "#explore"],
    "I am feeling very sad": ["#sad", "#low", "#feel"],
    "It makes me sad": ["#depression", "#stress", "#trauma"],
    "The sad fact is that he's lost his touch": ["#lost", "#touch", "#sad"],
    "They looked at her with sad, anxious faces": ["#anxious", "#sad", "#faces"],
    "I am feeling very happy": ["#happy", "#joy", "#feel"],
    "A sad day for us all": ["#sad", "#day", "#all"],
    "Sad to say, she never lived to see it": ["#lived", "#sad"]
}

# Generate users with logical engagement patterns
users = []
for _ in range(num_users):
    users.append({
        'user_id': fake.unique.random_int(min=1, max=10000),
        'user_name': fake.user_name(),
        'follower_count': random.randint(100, 10000),
        'following_count': random.randint(50, 5000),
        'location': fake.city(),
        'age_group': random.choice(['18-24', '25-34', '35-44', '45-54']),
        'gender': random.choice(['male', 'female']),
        # Interaction frequency
        'activity_level': random.choice(['low', 'medium', 'high'])
    })

# Generate posts
posts = []
for _ in range(num_posts):
    user = random.choice(users)
    caption = random.choice(list(caption_hashtag_pairs.keys()))
    # Get corresponding hashtags
    hashtags = ' '.join(caption_hashtag_pairs[caption])

    posts.append({
        'post_id': fake.unique.random_int(min=1, max=10000),
        'user_id': user['user_id'],
        'user_name': user['user_name'],
        'caption': caption,
        'image_url': fake.image_url(),
        'timestamp': fake.date_time_this_year(),
        'interaction': random.choice([0, 1]),  # Randomly simulate like/unlike
        'hashtags': hashtags,  # Use corresponding hashtags
        'location': user['location'],  # Use the same location as the user
        # Randomly select post type
        'post_type': random.choice(['image', 'video'])
    })

# Convert to DataFrames
df_users = pd.DataFrame(users)
df_posts = pd.DataFrame(posts)

# Merge user and post data to form a single dataset
df_combined = pd.merge(df_posts, df_users, on='user_id', how='left')

# Save to CSV
df_combined.to_csv('instagram_recommendation_data.csv', index=False)

# Display a preview of the generated data
print(df_combined.head())

   post_id  user_id     user_name_x              caption  \
0     7986     7007  peggyrodriguez      It makes me sad   
1     4689     9537      younghenry  Throwback to summer   
2     2618     1783  crawfordrobert       Healthy living   
3     1726     8661     adamsjavier       Healthy living   
4     6017     7395     catherine83    Healthy breakfast   

                        image_url           timestamp  interaction  \
0  https://dummyimage.com/869x566 2024-04-16 20:05:14            1   
1  https://dummyimage.com/668x857 2024-08-13 07:08:31            1   
2   https://picsum.photos/906/852 2024-01-15 12:59:22            0   
3   https://dummyimage.com/51x700 2024-02-27 01:09:01            0   
4    https://picsum.photos/418/35 2024-05-05 10:14:24            1   

                            hashtags     location_x post_type     user_name_y  \
0        #depression #stress #trauma      West Alan     video  peggyrodriguez   
1       #summer #memories #throwback   South Angela     

In [3]:
df_combined.head()

Unnamed: 0,post_id,user_id,user_name_x,caption,image_url,timestamp,interaction,hashtags,location_x,post_type,user_name_y,follower_count,following_count,location_y,age_group,gender,activity_level
0,7262,5283,jessica17,Son culture necessary stage down.,https://placekitten.com/773/310,2024-02-25 03:28:57,0,foreign every oil perform,Lake Ryan,image,jessica17,7197,4414,Lake Ryan,25-34,male,medium
1,8200,6518,kevin49,While value spring or.,https://dummyimage.com/604x314,2024-08-31 01:44:31,1,fine million,Sarahtown,image,kevin49,525,2452,Sarahtown,45-54,female,high
2,6549,6518,kevin49,Large class present describe activity.,https://picsum.photos/917/83,2024-01-27 04:50:42,1,everybody evening voice,Sarahtown,video,kevin49,525,2452,Sarahtown,45-54,female,high
3,9340,3720,fernando74,Fill might worry hit scene will.,https://placekitten.com/539/453,2024-05-03 18:39:11,0,include detail argue,North Beverlychester,video,fernando74,3880,3900,North Beverlychester,35-44,male,low
4,2293,7146,jessica86,Focus fall theory perhaps relationship around ...,https://picsum.photos/514/716,2024-06-29 18:17:58,0,happen rate,East Sonya,image,jessica86,2564,3899,East Sonya,35-44,female,low
