In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("instagram_reach.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [3]:
print(df.columns)
print(df.dtypes)
print(df.shape)

Index(['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Followers', 'Hashtags',
       'Time since posted', 'Likes'],
      dtype='object')
Unnamed: 0            int64
S.No                  int64
USERNAME             object
Caption              object
Followers             int64
Hashtags             object
Time since posted    object
Likes                 int64
dtype: object
(100, 8)


In [4]:
def detail_info(data):
    temp_df = pd.DataFrame(index= data.columns)
    
    temp_df['data_type'] = data.dtypes
    temp_df['unique_val'] = data.nunique()
    temp_df['duplicate_val'] = data.duplicated().sum()
    temp_df['missing_val'] = data.isnull().sum()
    temp_df['missing_val_%'] = round(data.isnull().mean()*100,2)
    
    return temp_df

detail_info(df)

Unnamed: 0,data_type,unique_val,duplicate_val,missing_val,missing_val_%
Unnamed: 0,int64,27,0,0,0.0
S.No,int64,30,0,0,0.0
USERNAME,object,95,0,0,0.0
Caption,object,94,0,6,6.0
Followers,int64,94,0,0,0.0
Hashtags,object,98,0,0,0.0
Time since posted,object,11,0,0,0.0
Likes,int64,55,0,0,0.0


In [5]:
df.drop(["Unnamed: 0", "S.No"], axis=1, inplace=True)

In [6]:
df['Hashtags_Num'] = df['Hashtags'].str.strip().str.count('#')
df.head()

Unnamed: 0,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes,Hashtags_Num
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139,5
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23,28
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25,30
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49,30
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30,26


Note: Maximum 31 hashtags are there for the caption

In [7]:
df['Hashtags_Num'].value_counts().sort_index()

1      1
2      2
3      1
4      3
5      5
6      2
7      1
8      1
9      7
10     5
11     4
12     1
13     1
14     2
15     3
17     4
18     2
19     1
20     5
21     1
22     2
23     2
25     5
26     5
27     4
28     7
29     5
30    17
31     1
Name: Hashtags_Num, dtype: int64

In [8]:
unique_hashtags = df['Hashtags'].str.split().explode().nunique()
print(f"{unique_hashtags} unique hashtags are in the dataset")

1124 unique hashtags are in the dataset


In [9]:
unique_hashtags = df['Hashtags'].str.split().explode().unique()
unique_hashtags

array(['#MachineLearning', '#AI', '#DataAnalytics', ..., '#website',
       '#paginaweb#followme', '#empresa'], dtype=object)

In [10]:
import spacy
import string
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [11]:
def spacy_tokenizer(sentence):
    
    mytokens = nlp(sentence)
# Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in mytokens]
# Removing stop words
    mytokens = [word for word in mytokens if word not in stop_words]
# Removing punctuations
    mytokens = [word for word in mytokens if word not in string.punctuation]
    
#    mytokens = ' '.join(mytokens)
   
    return mytokens

In [12]:
df['cleaned_caption'] = df['Caption'].fillna('').astype(str).apply(spacy_tokenizer)

df["Caption_Num"] = df['cleaned_caption'].apply(len)
df.head()

Unnamed: 0,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes,Hashtags_Num,cleaned_caption,Caption_Num
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139,5,"[datascientist, @mikequindazzi, machinelearnin...",8
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23,28,"[know, workresponsibly, postpone, date, actual...",26
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25,30,"[alexander, barinov, 4, year, cfo, multination...",55
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49,30,[sfad],1
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30,26,"[miss, phone, charge, system, notify, incoming...",24


In [13]:
df["Time_since_posted"] = df["Time since posted"].str.replace(" hours", "").astype('int')

In [14]:
data = df[['Followers', 'Hashtags_Num', 'Caption_Num', 'Time_since_posted', 'Likes']]
data.columns

Index(['Followers', 'Hashtags_Num', 'Caption_Num', 'Time_since_posted',
       'Likes'],
      dtype='object')

In [15]:
data.dtypes

Followers            int64
Hashtags_Num         int64
Caption_Num          int64
Time_since_posted    int32
Likes                int64
dtype: object

In [16]:
data.head()

Unnamed: 0,Followers,Hashtags_Num,Caption_Num,Time_since_posted,Likes
0,1600,5,8,11,139
1,880,28,26,2,23
2,255,30,55,2,25
3,340,30,1,3,49
4,304,26,24,3,30


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [18]:
X = data.drop(['Likes', 'Time_since_posted'], axis=1)

# Convert Likes and Time Since Posted columns to numeric values
y_likes = data['Likes']
y_time = data['Time_since_posted']

In [19]:
X_train, X_test, y_likes_train, y_likes_test, y_time_train, y_time_test = train_test_split(
                                                                            X, y_likes, y_time, test_size=0.2, random_state=42)

In [20]:
# Likes prediction
model_likes = RandomForestRegressor()
model_likes.fit(X_train, y_likes_train)

# Time Since Posted prediction
model_time = RandomForestRegressor()
model_time.fit(X_train, y_time_train)


RandomForestRegressor()

In [21]:
# Likes prediction evaluation
y_likes_pred = model_likes.predict(X_test)
likes_mae = mean_absolute_error(y_likes_test, y_likes_pred)

# Time Since Posted prediction evaluation
y_time_pred = model_time.predict(X_test)
time_mae = mean_absolute_error(y_time_test, y_time_pred)

# Print evaluation metrics
print("Likes Prediction:")
print("MAE:", likes_mae)

print("-----------")
print("Time Since Posted Prediction:")
print("MAE:", time_mae)


Likes Prediction:
MAE: 23.899
-----------
Time Since Posted Prediction:
MAE: 1.355
