In [4]:
# from data import info_url,posts_url,reels_url,hashtags_url
import pandas as pd
import requests

In [None]:
# collected data from different API calls
info_data = requests.get(info, headers=headers, params=querystring).json()
posts_data = requests.get(post, headers=headers, params=querystring).json()
reels_data = requests.get(reels, headers=headers, params=querystring).json()
hashtags_data = requests.get(
    hastag, headers=headers, params=querystring).json()

# Converting the collected data into DataFrames
info_df = pd.DataFrame(info_data['data'])
posts_df = pd.DataFrame(posts_data['data'])
reels_df = pd.DataFrame(reels_data['data'])
hashtags_df = pd.DataFrame(hashtags_data['data'])

# Save the datasets if needed
info_df.to_csv("info_data.csv", index=False)
posts_df.to_csv("posts_data.csv", index=False)
reels_df.to_csv("reels_data.csv", index=False)
hashtags_df.to_csv("hashtags_data.csv", index=False)

In [None]:
# Remove duplicates
posts_df.drop_duplicates(inplace=True)
reels_df.drop_duplicates(inplace=True)
hashtags_df.drop_duplicates(inplace=True)

In [None]:
# Fill missing values or remove rows with missing data
posts_df.fillna('', inplace=True)
reels_df.fillna('', inplace=True)
hashtags_df.fillna('', inplace=True)

In [None]:
import re


def clean_text(text):
    # Removing URLs and special characters
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = text.lower()
    return text


posts_df['caption'] = posts_df['caption'].apply(clean_text)
reels_df['caption'] = reels_df['caption'].apply(clean_text)
hashtags_df['hashtags'] = hashtags_df['hashtags'].apply(clean_text)

In [None]:
# Merging all datasets into a single DataFrame
final_data = pd.merge(posts_df[['caption', 'url']], reels_df[[
                      'caption', 'url']], on='url', how='outer')
final_data = pd.merge(
    final_data, hashtags_df[['hashtags', 'url']], on='url', how='outer')

# Combine captions and hashtags for a complete content-based recommendation
final_data['content'] = final_data['caption'] + " " + final_data['hashtags']

# Drop any remaining duplicates or nulls
final_data.drop_duplicates(subset='url', inplace=True)
final_data.dropna(subset=['content'], inplace=True)

In [None]:
# optional because hamne pahle hi nikala hai
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']


final_data['sentiment'] = final_data['content'].apply(get_sentiment)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizing the text content
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(final_data['content'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# Create a function to recommend based on content
def get_recommendations(index, cosine_sim=cosine_sim):
    # Get the pairwise similarity scores of all content
    sim_scores = list(enumerate(cosine_sim[index]))

    # Sort content based on similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 10 most similar content
    sim_scores = sim_scores[1:11]

    # Return the URLs of the most similar content
    content_indices = [i[0] for i in sim_scores]
    return final_data['url'].iloc[content_indices]

In [None]:
# Test the recommendation system for the first post
print(get_recommendations(0))

In [None]:
import pickle

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save the cosine similarity matrix
with open('cosine_similarity.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

In [1]:
import pandas as pd
from faker import Faker
import random
import numpy as np

In [7]:

# Initialize Faker
fake = Faker()

# Function to create synthetic dataset


def create_synthetic_data(num_samples=1000):
    data = {
        'user_id': [],
        'post_id': [],
        'Username': [],
        'Caption': [],
        'Hashtags': [],
        'Likes': [],
        'Comments': []
    }

    for _ in range(num_samples):
        # Generate user data
        user_id = random.randint(0, 1000)
        post_id = random.randint(0, 1000)
        username = fake.user_name()
        caption = fake.sentence(nb_words=random.randint(5, 15))
        hashtags = ' '.join([fake.word() for _ in range(random.randint(1, 5))])
        likes = random.randint(0, 1000)
        comments = random.randint(0, 200)

        # Append data
        data['user_id'].append(user_id)
        data['post_id'].append(user_id)
        data['Username'].append(username)
        data['Caption'].append(caption)
        data['Hashtags'].append(hashtags)
        data['Likes'].append(likes)
        data['Comments'].append(comments)

    return pd.DataFrame(data)


# Create synthetic dataset
synthetic_data = create_synthetic_data(num_samples=1000)

# Save to CSV
synthetic_data.to_csv('sentiments.csv', index=False)
print(synthetic_data.head())

   user_id  post_id            Username  \
0      617      617          angeladiaz   
1      835      835        wallacediane   
2      554      554  johnsonchristopher   
3      569      569            coxbryan   
4      802      802              msmith   

                                             Caption  \
0  Fall this water will subject stage issue usual...   
1             Language family control teach exactly.   
2                             Become here recognize.   
3  We find according month green able program by ...   
4          Car issue current movie during authority.   

                       Hashtags  Likes  Comments  
0    record his need good whole    517       144  
1       stage animal too of lot    441       182  
2                         story     41       197  
3  might decide pattern provide     41       190  
4                        impact    367       164  


In [8]:
df = pd.read_csv('sentiments.csv')
df.head(1000)

Unnamed: 0,user_id,post_id,Username,Caption,Hashtags,Likes,Comments
0,617,617,angeladiaz,Fall this water will subject stage issue usual...,record his need good whole,517,144
1,835,835,wallacediane,Language family control teach exactly.,stage animal too of lot,441,182
2,554,554,johnsonchristopher,Become here recognize.,story,41,197
3,569,569,coxbryan,We find according month green able program by ...,might decide pattern provide,41,190
4,802,802,msmith,Car issue current movie during authority.,impact,367,164
...,...,...,...,...,...,...,...
995,352,352,daviseric,Dinner whether girl boy course recognize billi...,production main lead example,355,182
996,953,953,shannonserrano,Son computer myself protect attention city cup...,east reduce,611,90
997,485,485,joel35,Summer whole open seat.,anything main news least leader,431,27
998,28,28,nathangalloway,Rock Mr finish sit data in guy up apply recogn...,woman,118,162


In [13]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob

# Step 1: Load the CSV file
df = pd.read_csv('sentiments.csv')

# Step 2: Data Preprocessing
# Keep only relevant columns (Caption, Hashtags)
df = df[['Caption', 'Hashtags']]

# Remove any missing or NaN values
df.dropna(subset=['Caption', 'Hashtags'], inplace=True)

# Step 3: Sentiment Analysis Function


def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Returns a score between -1 and 1


# Step 4: Calculate Sentiment Scores for Caption and Hashtags
df['caption_score'] = df['Caption'].apply(analyze_sentiment)
df['hashtag_score'] = df['Hashtags'].apply(analyze_sentiment)

# Step 5: Combine Scores to Determine Overall Sentiment
df['overall_score'] = (df['caption_score'] + df['hashtag_score']) / 2

# Step 6: Categorize Sentiment


def categorize_sentiment(score):
    if score < 0:
        return 'Negative'
    elif score > 0:
        return 'Positive'
    else:
        return 'Neutral'


df['sentiment'] = df['overall_score'].apply(categorize_sentiment)

# Step 7: Create a new DataFrame with required columns
final_df = df[['Caption', 'Hashtags', 'sentiment', 'overall_score']]

# Step 8: Save to a new CSV file
final_df.to_csv('instagram_reach_with_sentiments.csv', index=False)

print("Sentiment analysis complete! New file saved as 'instagram_sentiments.csv'.")

Sentiment analysis complete! New file saved as 'instagram_sentiments.csv'.


In [15]:
df = pd.read_csv('instagram_reach_with_sentiments.csv')
df.nunique()

Caption          1000
Hashtags          982
sentiment           3
overall_score     332
dtype: int64

In [19]:
df['sentiment'].value_counts()

sentiment
Positive    520
Neutral     248
Negative    232
Name: count, dtype: int64