# 📈 Engagement Rate Prediction - Instagram Influencers Dataset
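This notebook trains a RandomForestRegressor to predict the 60-day engagement rate (`60_day_eng_rate`) from follower count, influence score, and label-encoded country, evaluates it on a held-out 20% test split, and saves the fitted model to `engagement_model.pkl`.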

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
# 📂 Load Dataset
# Read the raw influencer table from the CSV next to the notebook and preview
# the first rows.
dataset_path = "top_instagram_influencers.csv"
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# 🧹 Data Cleaning
# Convert the text-formatted numeric columns to floats, drop incomplete rows,
# and integer-encode the country column.
#
# Each magnitude suffix is rewritten as a scientific-notation exponent
# ('k' -> 'e3', 'm' -> 'e6', 'b' -> 'e9') and '%' is removed so that
# astype(float) can parse the result.
# NOTE(review): assumes values look like "3.3k" / "475.8m" / "1.39%" with
# lowercase suffixes — confirm against the CSV.
replace = {'b': 'e9', 'm': 'e6', 'k': 'e3', '%': ''}
cols = ['followers', 'avg_likes', '60_day_eng_rate', 'new_post_avg_like', 'total_likes', 'posts']
df[cols] = df[cols].replace(replace, regex=True).astype(float)

# Removes every row that has a missing value in any column (not only the
# columns converted above).
df.dropna(inplace=True)

# Keep the fitted encoder instead of discarding it: its `classes_` attribute
# holds the country -> integer mapping, which is needed to encode new rows
# consistently (country_encoder.transform) or to decode predictions' inputs
# (country_encoder.inverse_transform) once the model is used outside this
# notebook.
country_encoder = LabelEncoder()
df['country_encoded'] = country_encoder.fit_transform(df['country'].astype(str))
df.head()

In [None]:
# 🎯 Feature Selection
# Predictors and target are named up front so the split below reads clearly.
feature_columns = ['followers', 'influence_score', 'country_encoded']
target_column = '60_day_eng_rate'

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 🚀 Train Model
# Fit a 100-tree random forest (seeded for reproducibility) on the training
# split; fit() returns the estimator itself, so it can be chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.4f}")
print(f"R2 Score: {r2:.4f}")

In [None]:
# 📉 Plot Actual vs Predicted
# Scatter of held-out actual values against model predictions, drawn on an
# explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)

# Labels are set after plotting so they override any labels seaborn derives
# from the input data.
ax.set_xlabel("Actual Engagement Rate")
ax.set_ylabel("Predicted Engagement Rate")
ax.set_title("Actual vs Predicted Engagement Rate")

# Dashed red y = x reference line spanning the range of the actual values;
# points on this line are perfect predictions.
actual_min, actual_max = y_test.min(), y_test.max()
ax.plot([actual_min, actual_max], [actual_min, actual_max], '--', color='red')
plt.show()

In [None]:
# 💾 Save the model
# Serialize the fitted estimator to disk in the working directory.
# NOTE: only `model` is written; the country label encoding applied during
# data cleaning is not part of this artifact.
joblib.dump(value=model, filename="engagement_model.pkl")
print("Model saved as engagement_model.pkl")