In [1]:
# =========================================
# Instagram Influencers Project (with saving)
# =========================================

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

# Step 2: Create a folder for outputs
output_dir = "project_outputs"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Step 3: Load Dataset
df = pd.read_csv("top_insta_influencers_data.csv")

# Step 4: Data Cleaning Function
def convert_to_numeric(x):
    if isinstance(x, str):
        x = x.lower().replace(",", "").strip()
        if x.endswith("k"):
            return float(x[:-1]) * 1e3
        elif x.endswith("m"):
            return float(x[:-1]) * 1e6
        elif x.endswith("b"):
            return float(x[:-1]) * 1e9
        elif x.endswith("%"):
            return float(x[:-1])
        else:
            try:
                return float(x)
            except:
                return None
    return x

cols_to_convert = ["posts", "followers", "avg_likes", "60_day_eng_rate",
                   "new_post_avg_like", "total_likes"]

for col in cols_to_convert:
    df[col] = df[col].apply(convert_to_numeric)

df.dropna(inplace=True)
# ADD THIS LINE: Save the cleaned data to a new file
df.to_csv("top_insta_influencers_data_cleaned.csv", index=False)

print("Cleaned data has been saved to top_insta_influencers_data_cleaned.csv")

# Step 5: EDA Plots (Saved)

# Distribution of Influence Score
plt.figure(figsize=(8,5))
sns.histplot(df['influence_score'], bins=20, kde=True, color="skyblue")
plt.title("Distribution of Influence Scores")
plt.savefig(f"{output_dir}/distribution_influence_scores.png")
plt.close()

# Followers vs Engagement Rate
plt.figure(figsize=(10,6))
sns.scatterplot(data=df, x="followers", y="60_day_eng_rate", hue="country", alpha=0.7)
plt.xscale("log")
plt.title("Followers vs 60-Day Engagement Rate")
plt.savefig(f"{output_dir}/followers_vs_engagement.png")
plt.close()

# Top 10 Countries
top_countries = df['country'].value_counts().head(10)
plt.figure(figsize=(8,5))
sns.barplot(x=top_countries.values, y=top_countries.index, palette="viridis")
plt.title("Top 10 Countries by Influencer Count")
plt.savefig(f"{output_dir}/top_countries.png")
plt.close()

# Step 6: Feature Engineering
df['like_follower_ratio'] = df['total_likes'] / df['followers']
df['post_follower_ratio'] = df['posts'] / df['followers']
df['avg_likes_ratio'] = df['avg_likes'] / df['followers']

# Step 7: Regression - Predict Influence Score
X_reg = df[['followers', 'avg_likes', '60_day_eng_rate', 'new_post_avg_like',
            'like_follower_ratio', 'post_follower_ratio', 'avg_likes_ratio']]
y_reg = df['influence_score']

X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X_train_scaled, y_train)
y_pred_reg = reg_model.predict(X_test_scaled)

reg_mse = mean_squared_error(y_test, y_pred_reg)
reg_r2 = r2_score(y_test, y_pred_reg)

# Save Regression Results
with open(f"{output_dir}/regression_results.txt", "w") as f:
    f.write(f"Mean Squared Error: {reg_mse}\n")
    f.write(f"R² Score: {reg_r2}\n")

# Step 8: Classification - Engagement Rate Classes
bins = [0, 1, 3, df['60_day_eng_rate'].max()]
labels = ['Low', 'Medium', 'High']
df['engagement_class'] = pd.cut(df['60_day_eng_rate'], bins=bins, labels=labels)

X_cls = df[['followers', 'influence_score', 'avg_likes']]
y_cls = df['engagement_class']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_c, y_train_c)
y_pred_cls = clf.predict(X_test_c)

cls_acc = accuracy_score(y_test_c, y_pred_cls)
cls_report = classification_report(y_test_c, y_pred_cls)

# Save Classification Results
with open(f"{output_dir}/classification_results.txt", "w") as f:
    f.write(f"Accuracy: {cls_acc}\n")
    f.write(f"{cls_report}\n")

# Confusion Matrix
cm = confusion_matrix(y_test_c, y_pred_cls, labels=labels)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix - Engagement Classification")
plt.savefig(f"{output_dir}/confusion_matrix.png")
plt.close()

# Feature Importance for Regression
feat_importances = pd.Series(reg_model.feature_importances_, index=X_reg.columns)
plt.figure(figsize=(8,5))
feat_importances.sort_values().plot(kind='barh')
plt.title("Feature Importance - Influence Score Prediction")
plt.savefig(f"{output_dir}/feature_importance.png")
plt.close()



Cleaned data has been saved to top_insta_influencers_data_cleaned.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_countries.values, y=top_countries.index, palette="viridis")
