In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
data = pd.read_csv('/content/instagram_reach.csv')
#download this dataset from https://www.kaggle.com/datasets/rxsraghavagrawal/instagram-reach
data

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30
...,...,...,...,...,...,...,...,...
95,8,19,michaelgarza__,"328 S. Wetherly Drive, Beverly Hills, CA 90212...",614,#beverlyhills #realestate#losangelesrealestate...,3 hours,31
96,9,21,dvlp_search,Credit @tristankappel To find more dvlp follow...,450,#workspace #work #developer#development #devel...,3 hours,42
97,10,22,ecom.space,We are coming up with the Best 21 Books that w...,182,#books #book #motivation #inspiration #life#bo...,3 hours,10
98,11,24,lb3enterprises,We’re only paid to move dirt once. It’s not ju...,2039,#heavyequipment #underconstruction#dozer #real...,3 hours,222


In [5]:
data.columns

Index(['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Followers', 'Hashtags',
       'Time since posted', 'Likes'],
      dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB


we have to convert 'Time since posted' column to int type from object type and also convert hours to minutes 

In [7]:
# Remove "hours" suffix and convert to int
data['Time since posted'] = data['Time since posted'].str.replace(' hours', '').astype(int)

# Convert hours to minutes
data['Time since posted'] = data['Time since posted'] * 60

#rename the column
data.rename(columns={'Time since posted': 'minutes_since_posted'}, inplace=True)


data.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,minutes_since_posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,660,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,120,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,120,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,180,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,180,30


In [8]:
data

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,minutes_since_posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,660,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,120,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,120,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,180,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,180,30
...,...,...,...,...,...,...,...,...
95,8,19,michaelgarza__,"328 S. Wetherly Drive, Beverly Hills, CA 90212...",614,#beverlyhills #realestate#losangelesrealestate...,180,31
96,9,21,dvlp_search,Credit @tristankappel To find more dvlp follow...,450,#workspace #work #developer#development #devel...,180,42
97,10,22,ecom.space,We are coming up with the Best 21 Books that w...,182,#books #book #motivation #inspiration #life#bo...,180,10
98,11,24,lb3enterprises,We’re only paid to move dirt once. It’s not ju...,2039,#heavyequipment #underconstruction#dozer #real...,180,222


In [9]:
data[data.duplicated()]


Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,minutes_since_posted,Likes


In [10]:
data.isna().sum()

Unnamed: 0              0
S.No                    0
USERNAME                0
Caption                 6
Followers               0
Hashtags                0
minutes_since_posted    0
Likes                   0
dtype: int64

we dont have any duplicate rows and NA values in important columns



In [11]:
data = data.drop(data.columns[[0, 1]], axis=1)

In [12]:
data.head()

Unnamed: 0,USERNAME,Caption,Followers,Hashtags,minutes_since_posted,Likes
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,660,139
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,120,23
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,120,25
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,180,49
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,180,30


In [13]:
# Specify the input columns and target columns
input_cols = ['Followers', 'Hashtags']
target_cols = ['Likes', 'minutes_since_posted']

# Preprocess the input data
X = data[input_cols]

# Preprocess the 'Followers' column (convert to numeric)
X['Followers'] = pd.to_numeric(X['Followers'], errors='coerce')

# Preprocess the 'Hashtags' column (extract the number of hashtags)
X['Hashtags'] = X['Hashtags'].apply(lambda x: len(x.split()))

# Perform one-hot encoding on the 'Hashtags' column 
encoder = OneHotEncoder(sparse=False, drop='first')
X_encoded = encoder.fit_transform(X[['Hashtags']])

# Concatenate the preprocessed features
X_processed = pd.concat([X['Followers'], pd.DataFrame(X_encoded)], axis=1)

# Convert column names to strings
X_processed.columns = X_processed.columns.astype(str)

# Preprocess the target variables
y = data[target_cols]

# Check for missing values in the preprocessed data
missing_values = X_processed.isnull().sum()
print("Missing Values:\n", missing_values)

# Check the dimensions of the preprocessed data
print("Input Data Dimensions:", X_processed.shape)
print("Target Data Dimensions:", y.shape)



Missing Values:
 Followers    0
0            0
1            0
2            0
3            0
4            0
5            0
6            0
7            0
8            0
9            0
10           0
11           0
12           0
13           0
14           0
15           0
16           0
17           0
18           0
19           0
20           0
21           0
22           0
23           0
24           0
dtype: int64
Input Data Dimensions: (100, 26)
Target Data Dimensions: (100, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Followers'] = pd.to_numeric(X['Followers'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Hashtags'] = X['Hashtags'].apply(lambda x: len(x.split()))


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=0)


In [37]:
X_train


Unnamed: 0,Followers,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
37,124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
51,2277,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,1168,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
95,614,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,971,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90,863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [38]:
y_train

Unnamed: 0,Likes,minutes_since_posted
37,24,120
51,157,240
35,13,120
65,29,120
31,19,120
...,...,...
28,43,240
95,31,180
15,53,180
90,32,120


In [39]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [42]:
# Get input from the user
followers = input("Enter the number of followers: ")
hashtags = input("Enter the hashtags separated by space: ")

# Preprocess the user input
new_data = pd.DataFrame({
    'Followers': [followers],
    'Hashtags': [hashtags]})

# Convert column names to strings
new_data.columns = new_data.columns.astype(str)

# Preprocess the 'Followers' column (convert to numeric)
new_data['Followers'] = pd.to_numeric(new_data['Followers'], errors='coerce')

# Preprocess the 'Hashtags' column (extract the number of hashtags)
new_data['Hashtags'] = new_data['Hashtags'].apply(lambda x: len(x.split()))

# Perform one-hot encoding on the 'Hashtags' column
new_data_encoded = encoder.transform(new_data[['Hashtags']])

# Concatenate the preprocessed features
new_data_processed = pd.concat([new_data['Followers'], pd.DataFrame(new_data_encoded)], axis=1)

new_data_processed.columns = new_data_processed.columns.astype(str)

# Make predictions on the new data
predictions = model.predict(new_data_processed)

# Extract the predicted likes and time since posted
predicted_likes = predictions[:, 0]
predicted_time = predictions[:, 1]

print("Predicted Likes:", predicted_likes)
print("Predicted Time Since Posted:", predicted_time)

Enter the number of followers: 1000
Enter the hashtags separated by space: #machinelearning #ai
Predicted Likes: [30.21]
Predicted Time Since Posted: [121.8]


In [43]:

# Evaluate the model's accuracy on the test set
y_pred = model.predict(X_test)
#mse_likes = mean_squared_error(y_test['Likes'], y_pred[:, 0])
mae_time = mean_absolute_error(y_test['minutes_since_posted'], y_pred[:, 1])
#mse_time = mean_squared_error(y_test['minutes_since_posted'], y_pred[:, 0])
mae_likes = mean_absolute_error(y_test['Likes'], y_pred[:, 1])

#print("Mean Squared Error (Likes):", mse_likes)
print("Mean Absolute Error (Time):", mae_time)
#print("Mean Square Error (Time):", mse_time)
print("Mean Absolute Error (Likes):", mae_likes)

Mean Absolute Error (Time): 88.27000000000001
Mean Absolute Error (Likes): 160.42999999999998
