In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Load dataset
df = pd.read_csv('Instagram_data.csv')
# Fill missing values in 'Caption' and 'Hashtags' columns
df['Caption'] = df['Caption'].fillna('')
df['Hashtags'] = df['Hashtags'].fillna('')
# Combine Caption and Hashtags for vectorization
df['content'] = df['Caption'] + ' ' + df['Hashtags']

In [6]:
df.head(5)

Unnamed: 0,Date,Impressions,From Home,From Hashtags,From Explore,From Other,Saves,Comments,Shares,Likes,Profile Visits,Follows,Conversion Rate,Caption,Hashtags,content
0,2021-12-10,3920,2586,1028,619,56,98,9,5,162,35,2,5.714286,Here are some of the most important data visua...,#finance #money #business #investing #investme...,Here are some of the most important data visua...
1,2021-12-11,5394,2727,1838,1174,78,194,7,14,224,48,10,20.833333,Here are some of the best data science project...,#healthcare #health #covid #data #datascience ...,Here are some of the best data science project...
2,2021-12-12,4021,2085,1188,0,533,41,11,1,131,62,12,19.354839,Learn how to train a machine learning model an...,#data #datascience #dataanalysis #dataanalytic...,Learn how to train a machine learning model an...
3,2021-12-13,4528,2700,621,932,73,172,10,7,213,23,8,34.782609,Here’s how you can write a Python program to d...,#python #pythonprogramming #pythonprojects #py...,Here’s how you can write a Python program to d...
4,2021-12-14,2518,1704,255,279,37,96,5,4,123,8,0,0.0,Plotting annotations while visualizing your da...,#datavisualization #datascience #data #dataana...,Plotting annotations while visualizing your da...


In [4]:
# Vectorize the content (Caption + Hashtags) using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['content'])

# Normalize engagement metrics (Likes, Comments, Shares, etc.)
scaler = MinMaxScaler()
engagement_metrics = ['Likes', 'Comments', 'Shares', 'Saves', 'Profile Visits', 'Follows']
df[engagement_metrics] = scaler.fit_transform(df[engagement_metrics])

# Combine TF-IDF matrix with engagement features
combined_features = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), df[engagement_metrics].reset_index(drop=True)], axis=1)

# Compute cosine similarity matrix for the dataset
similarity_matrix = cosine_similarity(combined_features)

# Function to recommend posts based on input caption
def recommend_by_caption(input_caption, num_recommendations=5):
    # Preprocess input caption
    input_caption_processed = input_caption

    # Vectorize the input caption using the same TF-IDF vectorizer
    input_caption_vector = tfidf.transform([input_caption_processed])

    # Create a combined feature vector for the input (only using content for now)
    input_combined_features = pd.concat([pd.DataFrame(input_caption_vector.toarray()),
                                         pd.DataFrame([[0]*len(engagement_metrics)])], axis=1)

    # Compute cosine similarity between the input caption and all posts in the dataset
    similarity_scores = cosine_similarity(input_combined_features, combined_features).flatten()

    # Sort the posts based on similarity scores
    similar_posts_indices = similarity_scores.argsort()[::-1][:num_recommendations]

    # Return the top N similar posts
    return df.iloc[similar_posts_indices]

# Example: Input a caption to get recommendations
input_caption = "stressful life can lead to many problems in future"
recommended_posts = recommend_by_caption(input_caption=input_caption, num_recommendations=5)
print(recommended_posts[['Date', 'Caption', 'Hashtags', 'Likes', 'Comments', 'Shares']])


          Date                                            Caption  \
26  2022-01-05  You must have heard or invested in any cryptoc...   
95  2022-03-15  You must have heard or invested in any cryptoc...   
65  2022-02-13  If you want to know how to predict the future ...   
18  2021-12-28  Stress, anxiety, and depression are threatenin...   
85  2022-03-05  Stress, anxiety, and depression are threatenin...   

                                             Hashtags     Likes  Comments  \
26  #data #datascience #dataanalysis #dataanalytic...  0.037147  0.678571   
95  #data #datascience #dataanalysis #dataanalytic...  0.037147  0.678571   
65  #data #datascience #dataanalysis #dataanalytic...  0.011887  0.107143   
18  #data #datascience #dataanalysis #dataanalytic...  0.039128  0.214286   
85  #data #datascience #dataanalysis #dataanalytic...  0.039128  0.214286   

      Shares  
26  0.012712  
95  0.012712  
65  0.010593  
18  0.012712  
85  0.012712  


In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [8]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Text Preprocessing function


def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Fill missing values in 'Caption' and 'Hashtags' columns
df['Caption'] = df['Caption'].fillna('')
df['Hashtags'] = df['Hashtags'].fillna('')

# Apply text preprocessing
df['Caption'] = df['Caption'].apply(preprocess_text)
df['Hashtags'] = df['Hashtags'].apply(preprocess_text)

# Combine Caption and Hashtags for vectorization
df['content'] = df['Caption'] + ' ' + df['Hashtags']

In [11]:
# Use TF-IDF for text vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['content'])

# Normalize engagement metrics
scaler = MinMaxScaler()
engagement_metrics = ['Likes', 'Comments',
                      'Shares', 'Saves', 'Profile Visits', 'Follows']
df[engagement_metrics] = scaler.fit_transform(df[engagement_metrics])

# Combine TF-IDF matrix with engagement features
engagement_features = df[engagement_metrics].values

# Ensure both arrays are 2D and can be concatenated
combined_features = np.hstack([tfidf_matrix.toarray(), engagement_features])

# Compute cosine similarity matrix for combined features
similarity_matrix = cosine_similarity(combined_features)

In [12]:
# Function to recommend posts based on input caption with more weight on engagement
def recommend_by_caption(input_caption, num_recommendations=5, engagement_weight=0.3):
    # Preprocess input caption
    input_caption_processed = preprocess_text(input_caption)

    # Vectorize the input caption using the same TF-IDF vectorizer
    input_caption_vector = tfidf.transform([input_caption_processed])

    # Initialize engagement metrics for input to zero and reshape to 2D
    zeros_for_engagement = np.zeros(
        engagement_features.shape[1]).reshape(1, -1)

    # Create a combined feature vector for the input
    input_combined_features = np.hstack(
        [input_caption_vector.toarray(), zeros_for_engagement])

    # Compute cosine similarity between input caption and all posts
    similarity_scores = cosine_similarity(
        input_combined_features, combined_features).flatten()

    # Sort the posts based on similarity scores
    similar_posts_indices = similarity_scores.argsort()[
        ::-1][:num_recommendations]

    # Return the top N similar posts
    return df.iloc[similar_posts_indices]

In [13]:
# Example: Input a caption to get recommendations with improved accuracy
input_caption = "stressful life leads to health problems"
recommended_posts = recommend_by_caption(input_caption=input_caption, num_recommendations=5)
print(recommended_posts[['Date', 'Caption', 'Hashtags', 'Likes', 'Comments', 'Shares']])


          Date                                            Caption  \
85  2022-03-05  stress anxieti depress threaten mental health ...   
18  2021-12-28  stress anxieti depress threaten mental health ...   
64  2022-02-12  use python script use solv realtim problem pro...   
1   2021-12-11  best data scienc project idea healthcar want b...   
61  2022-02-09  best data scienc project idea healthcar want b...   

                                             Hashtags     Likes  Comments  \
85  data datasci dataanalysi dataanalyt datascient...  0.039128  0.214286   
18  data datasci dataanalysi dataanalyt datascient...  0.039128  0.214286   
64  python pythonprogram pythonproject pythoncod p...  0.097573  0.250000   
1   healthcar health covid data datasci dataanalys...  0.075285  0.250000   
61  healthcar health covid data datasci dataanalys...  0.087172  0.285714   

      Shares  
85  0.012712  
18  0.012712  
64  0.025424  
1   0.029661  
61  0.010593  
