Influencers Model Building

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df = pd.read_csv("/content/clean_influencer_data.csv")
df.head()

Unnamed: 0,Username,Platform,Category / Niche,Followers Count,Engagement Rate %,Average Likes,Average Comments,Location,Influencer Tier,Engagement Score,Follower Tier Score,Influencer Score,Potential Category,Contact Available
0,24h_social,instagram,digital creator,12409,0.0,0.08,0.0,usa,micro,1,5,68,medium potential,1
1,360marketing__,instagram,marketing agency,9969,0.0,0.5,0.07,global,micro,1,5,68,medium potential,1
2,aalchemy.growth,instagram,education,128,0.2,48.25,6.13,global,below threshold,1,0,28,low potential,1
3,prasegi_art,instagram,artist,6521,0.32,20.85,0.31,global,micro,1,5,68,medium potential,1
4,aceacademyamsc,instagram,educational consultant,195,0.0,0.0,0.0,global,below threshold,1,0,28,low potential,1


Select Feature and Target

In [None]:
features = [
    'Followers Count',
    'Engagement Rate %',
    'Average Likes',
    'Average Comments',
    'Influencer Score'
]

X = df[features]
y = df['Potential Category']

In [None]:
#Encode Target Labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

label_encoder.classes_

array(['high potential', 'low potential', 'medium potential'],
      dtype=object)

In [None]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

Train Logistic Regression

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Evaluate model
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.5024390243902439

Classification Report:

                  precision    recall  f1-score   support

  high potential       0.42      0.11      0.18        71
   low potential       0.00      0.00      0.00        32
medium potential       0.51      0.93      0.66       102

        accuracy                           0.50       205
       macro avg       0.31      0.35      0.28       205
    weighted avg       0.40      0.50      0.39       205



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
df['Predicted Category'] = label_encoder.inverse_transform(
    model.predict(X)
)

df[['Username', 'Potential Category', 'Predicted Category']]

Unnamed: 0,Username,Potential Category,Predicted Category
0,24h_social,medium potential,medium potential
1,360marketing__,medium potential,medium potential
2,aalchemy.growth,low potential,high potential
3,prasegi_art,medium potential,high potential
4,aceacademyamsc,low potential,high potential
...,...,...,...
1017,pingmedia_learning,low potential,medium potential
1018,red.ads_,medium potential,medium potential
1019,alluringmonks,low potential,medium potential
1020,thedigiminds_official,low potential,medium potential


Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

Train Random Forest model

In [16]:
rf_model = RandomForestClassifier(
    n_estimators=100,      # number of trees
    random_state=42
)

rf_model.fit(X_train, y_train)

Evaluate Model

In [17]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))

Random Forest Accuracy: 1.0

Classification Report:

                  precision    recall  f1-score   support

  high potential       1.00      1.00      1.00        71
   low potential       1.00      1.00      1.00        32
medium potential       1.00      1.00      1.00       102

        accuracy                           1.00       205
       macro avg       1.00      1.00      1.00       205
    weighted avg       1.00      1.00      1.00       205



In [18]:
import pandas as pd

importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(importance)

             Feature  Importance
4   Influencer Score    0.638363
1  Engagement Rate %    0.113104
2      Average Likes    0.104965
0    Followers Count    0.092473
3   Average Comments    0.051095


Compare Logistic VS Random Forest

In [19]:
print("Logistic Accuracy:", accuracy_score(y_test, y_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Logistic Accuracy: 0.5024390243902439
Random Forest Accuracy: 1.0
