In [1]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
import re
import gensim
import spacy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
pip install spacy

Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-win_amd64.whl (12.2 MB)
     ---------------------------------------- 12.2/12.2 MB 2.2 MB/s eta 0:00:00
Collecting pathy>=0.10.0
  Downloading pathy-0.10.1-py3-none-any.whl (48 kB)
     ---------------------------------------- 48.9/48.9 kB 2.6 MB/s eta 0:00:00
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.7-cp310-cp310-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 3.5 MB/s eta 0:00:00
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     -------------------------------------- 181.6/181.6 kB 3.6 MB/s eta 0:00:00
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp310-cp310-win_amd64.whl (94 kB)
     ---------------------------------------- 94.7/94.7 kB 5.3 MB/s eta 0:00:00
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmu

In [None]:
# Data ingestion

In [2]:
file_path = "instagram_reach.csv"
df = pd.read_csv(file_path)
df.drop("Unnamed: 0", axis=1, inplace=True)
df.head()

Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


Basic Analysis

In [3]:
df.shape

(100, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   S.No               100 non-null    int64 
 1   USERNAME           100 non-null    object
 2   Caption            94 non-null     object
 3   Followers          100 non-null    int64 
 4   Hashtags           100 non-null    object
 5   Time since posted  100 non-null    object
 6   Likes              100 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 5.6+ KB



Observations:

S.No has no use can be dropped.
Username, caption, hastags are three columns containing text data.
Time since posted column can be converted to numerical values easily.
Time since posted and Likes columns are the target features.
Each users has specific number of followers and the number of likes directly depend on the number of followers. So we can drop the username column also.

In [6]:
#Username

len(df.USERNAME.unique())

95

Out of 100 rows in the dataset, there are 95 different usenames.


So, One-hot encoding is not a considerable option here because it will lead to 95 new features.

On the other hand, we have a numeric feature called "followers" which is directly associated with the usenames in the dataset.


conclusion: drop the "USERNAME" and "S, No" columns.

In [7]:
# Drop USERNAME and S.No column

df.drop(['S.No', 'USERNAME'], axis=1, inplace=True)

df.head(3)

Unnamed: 0,Caption,Followers,Hashtags,Time since posted,Likes
0,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25


In [8]:
# Time since posted
df['Time since posted'].unique()

array(['11 hours', '2 hours', '3 hours', '4 hours', '7 hours', '8 hours',
       '9 hours', '5 hours', '20 hours', '14 hours', '24 hours'],
      dtype=object)

In [9]:
df['Time since posted'].isna().sum()

0

In [10]:
df['Time since posted'].dtype

dtype('O')

'Time since posted' is one of our target features.
There are no missing values.
Data type is 'Object'
We need to convert the data type to integer.

In [11]:
# Converting 'Time since posted' column to integer type

df['Time since posted'] = df['Time since posted'].apply(lambda x: int(x.split()[0]))

df['Time since posted'].dtype

dtype('int64')

In [12]:
df['Time since posted'].unique()

array([11,  2,  3,  4,  7,  8,  9,  5, 20, 14, 24], dtype=int64)

In [13]:
# Caption
 # list of length of each caption text
print(sorted([len(str(cap)) for cap in df['Caption'].values]))

[2, 3, 3, 3, 3, 3, 3, 4, 5, 7, 12, 19, 21, 21, 21, 26, 26, 28, 28, 29, 31, 31, 32, 38, 38, 41, 42, 44, 45, 51, 55, 56, 56, 57, 63, 65, 70, 70, 72, 73, 74, 75, 79, 90, 93, 94, 95, 104, 110, 112, 113, 114, 116, 118, 119, 124, 125, 128, 131, 137, 144, 145, 149, 153, 155, 159, 166, 172, 175, 177, 180, 195, 196, 204, 204, 214, 217, 218, 222, 226, 233, 248, 255, 259, 260, 261, 270, 273, 322, 325, 348, 353, 369, 373, 390, 422, 454, 457, 459, 704]


In [14]:
df = df[df['Caption'].str.len() >= 20]
df.shape

(88, 5)

In [15]:

df['Caption'][0]

'Who are #DataScientist and what do they do? >> @MikeQuindazzi >> #MachineLearning #AI #DataAnalytics #DataScienc #DataLake >> https://buff.ly/2kYmF0s'

In [16]:
# Helper function to clean caption text
def clean_caption(text):
    if isinstance(text, str): # to avoid TypeError
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        
        # Remove emojis and symbols
        text = re.sub(r'[^\w\s]', '', text)
        
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# tokenizer function
def tokenizer(text):
    text = str(text)

    # Tokenize the text
    tokens = word_tokenize(text)

    return tokens

In [19]:
# cleaning texts in caption column
df['Caption'] = df['Caption'].apply(clean_caption)

df['Caption']

0     Who are DataScientist and what do they do Mike...
1     We all know where its going We just have to wo...
2     Alexander Barinov 4 years as CFO in multinatio...
4     Ever missed a call while your phone was chargi...
5     Cyber attacks are more frequent every day and ...
                            ...                        
95    328 S Wetherly Drive Beverly Hills CA 90212 Th...
96    Credit tristankappel To find more dvlp follow ...
97    We are coming up with the Best 21 Books that w...
98    Were only paid to move dirt once Its not just ...
99                       Obtén tu tienda en línea ahora
Name: Caption, Length: 88, dtype: object

In [25]:
import gensim



In [34]:
# Download word2vec pre-trained vectors published by Google
# Download link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

# Load pre-trained Word2Vec model
from gensim.models import KeyedVectors
from gensim import models


word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Function to perform word embedding
def embed_caption(tokens):
    embeddings = [word2vec_model[word.lower()] for word in tokens if word.lower() in word2vec_model]
    return embeddings

In [35]:

# Apply word embedding to the 'caption' column
df['caption_embedding'] = df['Caption'].apply(embed_caption)


In [36]:
df[['Caption','caption_embedding']]

Unnamed: 0,Caption,caption_embedding
0,Who are #DataScientist and what do they do? >>...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246..."
1,We all know where it’s going. We just have to ...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246..."
2,Alexander Barinov: 4 years as CFO in multinati...,"[[-0.02722168, 0.23046875, -0.061767578, -0.06..."
4,Ever missed a call while your phone was chargi...,"[[-0.06933594, 0.15332031, -0.024902344, 0.172..."
5,Cyber attacks are more frequent every day and ...,"[[-0.20800781, 0.034179688, 0.025756836, 0.179..."
...,...,...
95,"328 S. Wetherly Drive, Beverly Hills, CA 90212...","[[0.114746094, -0.083496094, 0.033691406, 0.10..."
96,Credit @tristankappel To find more dvlp follow...,"[[-0.20800781, 0.034179688, 0.025756836, 0.179..."
97,We are coming up with the Best 21 Books that w...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246..."
98,We’re only paid to move dirt once. It’s not ju...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246..."


In [37]:
# Flatten the caption_embedding column
df['caption_embedding_flat'] = df['caption_embedding'].apply(lambda x: np.array(x).flatten())

df[['Caption','caption_embedding', 'caption_embedding_flat']]

Unnamed: 0,Caption,caption_embedding,caption_embedding_flat
0,Who are #DataScientist and what do they do? >>...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246...","[-0.23339844, 0.061523438, -0.3046875, 0.22460..."
1,We all know where it’s going. We just have to ...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246...","[-0.23339844, 0.061523438, -0.3046875, 0.22460..."
2,Alexander Barinov: 4 years as CFO in multinati...,"[[-0.02722168, 0.23046875, -0.061767578, -0.06...","[-0.02722168, 0.23046875, -0.061767578, -0.065..."
4,Ever missed a call while your phone was chargi...,"[[-0.06933594, 0.15332031, -0.024902344, 0.172...","[-0.06933594, 0.15332031, -0.024902344, 0.1728..."
5,Cyber attacks are more frequent every day and ...,"[[-0.20800781, 0.034179688, 0.025756836, 0.179...","[-0.20800781, 0.034179688, 0.025756836, 0.1796..."
...,...,...,...
95,"328 S. Wetherly Drive, Beverly Hills, CA 90212...","[[0.114746094, -0.083496094, 0.033691406, 0.10...","[0.114746094, -0.083496094, 0.033691406, 0.105..."
96,Credit @tristankappel To find more dvlp follow...,"[[-0.20800781, 0.034179688, 0.025756836, 0.179...","[-0.20800781, 0.034179688, 0.025756836, 0.1796..."
97,We are coming up with the Best 21 Books that w...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246...","[-0.23339844, 0.061523438, -0.3046875, 0.22460..."
98,We’re only paid to move dirt once. It’s not ju...,"[[-0.23339844, 0.061523438, -0.3046875, 0.2246...","[-0.23339844, 0.061523438, -0.3046875, 0.22460..."


# Analysis of the Hashtags column

In [38]:
# list of length of each hashtags text
print(sorted([len(str(cap)) for cap in df['Hashtags'].values]))

[21, 23, 26, 27, 44, 48, 57, 59, 61, 65, 69, 78, 80, 93, 94, 97, 98, 99, 104, 109, 112, 114, 116, 118, 120, 123, 123, 128, 135, 148, 149, 151, 182, 187, 188, 191, 191, 193, 194, 196, 198, 201, 202, 204, 204, 214, 215, 222, 228, 229, 233, 249, 251, 254, 255, 260, 261, 266, 266, 271, 275, 278, 279, 281, 286, 294, 297, 301, 301, 301, 306, 306, 307, 310, 311, 318, 328, 329, 331, 332, 334, 340, 348, 351, 365, 377, 386, 388]


In [42]:
import spacy
# Load spaCy's English language model
nlp = spacy.load('en_core_web_sm')

def embed_hashtags(hashtags):
    doc = nlp(hashtags)  # Tokenize and process the hashtag text
    embeddings = [word2vec_model[word.text.lower()] for word in doc if word.text.lower() in word2vec_model]
    return embeddings

In [43]:
df['hashtags_embedding'] = df['Hashtags'].apply(embed_hashtags)

In [44]:
df[['Hashtags', 'hashtags_embedding']]

Unnamed: 0,Hashtags,hashtags_embedding
0,#MachineLearning #AI #DataAnalytics #DataScien...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
1,#deck .#mac #macintosh#sayhello #apple #steve...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
2,#whoiswho #aitrading #ai #aitradingteam#instat...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
4,#instamachinelearning #instabigdata#instamarke...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
5,#edtech #learning#educationtechnology #tech #l...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
...,...,...
95,#beverlyhills #realestate#losangelesrealestate...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
96,#workspace #work #developer#development #devel...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
97,#books #book #motivation #inspiration #life#bo...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."
98,#heavyequipment #underconstruction#dozer #real...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008..."


In [45]:
# Flatten the hasgtags_embedding column
df['hashtags_embedding_flat'] = df['hashtags_embedding'].apply(lambda x: np.array(x).flatten())

df[['Hashtags', 'hashtags_embedding', 'hashtags_embedding_flat']]

Unnamed: 0,Hashtags,hashtags_embedding,hashtags_embedding_flat
0,#MachineLearning #AI #DataAnalytics #DataScien...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
1,#deck .#mac #macintosh#sayhello #apple #steve...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
2,#whoiswho #aitrading #ai #aitradingteam#instat...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
4,#instamachinelearning #instabigdata#instamarke...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
5,#edtech #learning#educationtechnology #tech #l...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
...,...,...,...
95,#beverlyhills #realestate#losangelesrealestate...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
96,#workspace #work #developer#development #devel...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
97,#books #book #motivation #inspiration #life#bo...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."
98,#heavyequipment #underconstruction#dozer #real...,"[[-0.033447266, -0.21972656, 0.01940918, 0.008...","[-0.033447266, -0.21972656, 0.01940918, 0.0081..."


### Training Data preperation

In [46]:
df.columns

Index(['Caption', 'Followers', 'Hashtags', 'Time since posted', 'Likes',
       'caption_embedding', 'caption_embedding_flat', 'hashtags_embedding',
       'hashtags_embedding_flat'],
      dtype='object')

In [47]:
# Get the 'caption_embedding_flat' arrays from the DataFrame
caption_embeddings = df['caption_embedding_flat'].values

# Determine the minimum size along the second axis (dimension 1)
min_size = min([len(arr) for arr in caption_embeddings])

# Trim the 'caption_embedding_flat' arrays to have the same size
caption_embeddings = [arr[:min_size] for arr in caption_embeddings]

# Stack the trimmed arrays using np.vstack
caption_embedding_flat = np.vstack(caption_embeddings)

In [49]:
# Get the 'hashtags_embedding_flat' arrays from the DataFrame
hashtags_embeddings = df['hashtags_embedding_flat'].values

# Determine the minimum size along the second axis (dimension 1)
min_size = min([len(arr) for arr in hashtags_embeddings])

# Trim the 'caption_embedding_flat' arrays to have the same size
hashtags_embeddings = [arr[:min_size] for arr in hashtags_embeddings]

# Stack the trimmed arrays using np.vstack
hashtags_embedding_flat = np.vstack(hashtags_embeddings)

In [50]:
# Reshape the 'Followers' array to have 2 dimensions
followers = df['Followers'].values.reshape(-1, 1)

In [52]:

# Convert embedded captions, followers, embedded hashtags, and target variables to numpy arrays
# Combine input features into a single array
X = np.hstack((caption_embedding_flat, hashtags_embedding_flat, followers))

# Extract target variables
y = df[['Time since posted', 'Likes']].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# model training

In [53]:
# Create and train a linear regression model
model = MultiOutputRegressor(LinearRegression())
model.fit(X_train, y_train)

In [54]:
# Evaluate the model
y_pred = model.predict(X_test)
test_score = model.score(X_test, y_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print("R-squared (R2) Score:", r2)

Root Mean Squared Error (RMSE): 56.000158077658774
R-squared (R2) Score: -0.26314629063477646
