# Question 1

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso , ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv('instagram_reach.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [15]:


df.shape

(100, 8)

In [16]:

df=df.drop(columns=['Unnamed: 0','S.No'])

In [17]:
df.head()

Unnamed: 0,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [18]:
df.columns


Index(['USERNAME', 'Caption', 'Followers', 'Hashtags', 'Time since posted',
       'Likes'],
      dtype='object')

In [19]:
df.describe()

Unnamed: 0,Followers,Likes
count,100.0,100.0
mean,961.96,46.48
std,1014.62567,55.08698
min,11.0,8.0
25%,252.75,19.0
50%,612.0,29.0
75%,1197.0,46.0
max,4496.0,349.0


In [20]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   USERNAME           100 non-null    object
 1   Caption            94 non-null     object
 2   Followers          100 non-null    int64 
 3   Hashtags           100 non-null    object
 4   Time since posted  100 non-null    object
 5   Likes              100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [21]:
df.isnull().sum()

USERNAME             0
Caption              6
Followers            0
Hashtags             0
Time since posted    0
Likes                0
dtype: int64

In [22]:

df.dtypes

USERNAME             object
Caption              object
Followers             int64
Hashtags             object
Time since posted    object
Likes                 int64
dtype: object

In [23]:

df.duplicated().sum()


0

In [31]:
num_col=[fea for fea in df.columns if df[fea].dtypes!="o"]
cat_col=[fea for fea in df.columns if df[fea].dtypes=="o"]

In [26]:
# Convert 'Time since posted' to numerical format (hours)
df['Time since posted'] = df['Time since posted'].str.extract('(\d+)').astype(float)

In [27]:

df['Time since posted']

0     11.0
1      2.0
2      2.0
3      3.0
4      3.0
      ... 
95     3.0
96     3.0
97     3.0
98     3.0
99     3.0
Name: Time since posted, Length: 100, dtype: float64

In [28]:
# Feature engineering on hashtags (count of hashtags)
df['Num_Hashtags'] = df['Hashtags'].apply(lambda x: len(x.split()))

In [29]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [30]:
username_encoded = pd.get_dummies(df['USERNAME'], prefix='user')
hashtags_encoded = pd.get_dummies(df['Hashtags'].apply(lambda x: ' '.join(x.split())), prefix='tag')

df = pd.concat([df, username_encoded, hashtags_encoded], axis=1)

# Drop unnecessary columns
df.drop(['USERNAME', 'Caption', 'Hashtags'], axis=1, inplace=True)

# Encode the target variable 'Likes' using LabelEncoder (optional)
label_encoder = LabelEncoder()
df['Likes'] = label_encoder.fit_transform(df['Likes'])

In [32]:
X = df.drop('Likes', axis=1)
y_likes = df['Likes']
y_time = df['Time since posted']

In [33]:
# Train-test split
X_train, X_test, y_likes_train, y_likes_test, y_time_train, y_time_test = train_test_split(
    X, y_likes, y_time, test_size=0.2, random_state=42)

In [34]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [44]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic Net':ElasticNet(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor()
}

In [45]:
def train_and_evaluate_models(models, X_train, y_likes_train, y_time_train, X_test, y_likes_test, y_time_test):
  """
  Trains and evaluates the provided models for predicting likes and time since posted.

  
      models : Dictionary containing the models to train.
      X_train : Training data for features.
      y_likes_train : Training data for likes.
      y_time_train : Training data for time since posted.
      X_test : Testing data for features.
      y_likes_test : Testing data for likes.
      y_time_test : Testing data for time since posted.
  """
  for model_name, model in models.items():
    # Train the model for likes prediction
    model.fit(X_train, y_likes_train)
    y_likes_pred = model.predict(X_test)

    # Evaluate likes prediction
    likes_r2 = r2_score(y_likes_test, y_likes_pred)
    likes_mse = mean_squared_error(y_likes_test, y_likes_pred)
    likes_mae = mean_absolute_error(y_likes_test, y_likes_pred)

    # Train the model for time since posted prediction
    model.fit(X_train, y_time_train)
    y_time_pred = model.predict(X_test)

    # Evaluate time since posted prediction
    time_r2 = r2_score(y_time_test, y_time_pred)
    time_mse = mean_squared_error(y_likes_test, y_time_pred)
    time_mae = mean_absolute_error(y_likes_test, y_time_pred)

    # Print evaluation metrics
    print(f"Model: {model_name}")
    print(f"\tLikes Prediction - R2: {likes_r2:.4f}, MSE: {likes_mse:.2f}, MAE: {likes_mae:.2f}")
    print(f"\tTime Since Posted Prediction - R2: {time_r2:.4f}, MSE: {time_mse:.2f}, MAE: {time_mae:.2f}")
    print("-"*50)

# Train and evaluate the models
train_and_evaluate_models(models, X_train, y_likes_train, y_time_train, X_test, y_likes_test, y_time_test)

Model: Linear Regression
	Likes Prediction - R2: 0.3132, MSE: 133.35, MAE: 9.16
	Time Since Posted Prediction - R2: 0.9972, MSE: 435.42, MAE: 17.17
--------------------------------------------------
Model: Ridge
	Likes Prediction - R2: -0.0203, MSE: 198.10, MAE: 10.80
	Time Since Posted Prediction - R2: 0.5662, MSE: 482.25, MAE: 17.70
--------------------------------------------------
Model: Lasso
	Likes Prediction - R2: 0.1113, MSE: 172.55, MAE: 10.04
	Time Since Posted Prediction - R2: 0.9084, MSE: 453.00, MAE: 17.32
--------------------------------------------------
Model: Elastic Net
	Likes Prediction - R2: 0.0553, MSE: 183.41, MAE: 10.31
	Time Since Posted Prediction - R2: 0.6431, MSE: 478.26, MAE: 17.65
--------------------------------------------------
Model: SVR
	Likes Prediction - R2: -0.0423, MSE: 202.36, MAE: 10.81
	Time Since Posted Prediction - R2: -0.1501, MSE: 538.22, MAE: 18.63
--------------------------------------------------
Model: Decision Tree
	Likes Prediction - R