In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [5]:
# Load the preprocessed dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="preprocessed_social_media_data.csv")

In [17]:
### **1️⃣ Remove Non-Numeric Columns**
# Rebind `df` instead of mutating it in place so this cell can be re-run
# safely: with `inplace=True` a second execution raised KeyError because the
# columns had already been removed. `errors="ignore"` skips any listed column
# that is no longer (or never was) present.
df = df.drop(
    columns=['Post ID', 'Audience Location', 'Audience Interests'],
    errors="ignore",
)


In [19]:
### **1️⃣ Define Features & Target Variable**
# Name of the column the model is trained to predict.
target = "Engagement Rate"
# Every other column of `df` is used as a model input. Dropping the label from
# the column index raises KeyError if the target column is missing.
features = df.columns.drop([target])

In [21]:
# Separate the column being predicted from the model inputs.
y = df.loc[:, target]    # values of the target column
X = df.loc[:, features]  # all feature columns

In [23]:
### **2️⃣ Split Data into Training & Testing Sets**
# Hold out part of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,
)

In [27]:
### **4️⃣ Train XGBoost Model**
# Hyperparameters collected in one mapping so they are easy to read and tune.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    # Lets XGBoost consume pandas `category`-dtype columns directly.
    # NOTE(review): object/string columns are not converted automatically —
    # confirm none remain in X_train.
    enable_categorical=True,
    random_state=42,  # fixed seed for reproducible training
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split only; the test split is kept for evaluation.
xgb_model.fit(X_train, y_train)

In [29]:
### **5️⃣ Evaluate Model Performance**
# Predict on the held-out rows.
y_pred = xgb_model.predict(X_test)

# Regression metrics on the test split. RMSE is derived from MSE so both
# numbers always describe the same predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mse, 0.5)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [31]:
# Report the hold-out metrics (`mae`, `mse`, `rmse`, `r2`) computed above.
# NOTE(review): the last recorded run shows R² = 0.9991. A score this close to
# 1 is worth checking for target leakage (features derived from the target) —
# TODO confirm against the dataset's column definitions.
print("🔹 Model Performance:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
# MSE uses scientific notation: with four fixed decimals it rendered as
# "0.0000" in the recorded output, which hid the actual value.
print(f"Mean Squared Error (MSE): {mse:.4e}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

🔹 Model Performance:
Mean Absolute Error (MAE): 0.0023
Mean Squared Error (MSE): 0.0000
Root Mean Squared Error (RMSE): 0.0037
R² Score: 0.9991


In [39]:
### **5️⃣ Save the Trained Model**
# Relative path, so the artifact is written to the current working directory.
model_path = "xgboost_engagement_model.pkl"

# Serialize the fitted estimator with joblib.
# NOTE(review): pickle-based artifacts should only be loaded from trusted
# sources and with library versions compatible with the ones that wrote them.
joblib.dump(value=xgb_model, filename=model_path)

print(f"\n✅ Model saved successfully at: {model_path}")


✅ Model saved successfully at: xgboost_engagement_model.pkl
