In [1]:
import pandas as pd
from src.data_processor import preprocess_and_combine_data, clean_text # Import functions from our new module

# --- Define file paths ---
# Make sure 'dataset - Sheet1.csv' is in your 'data/' folder.
existing_data_input_path = '../data/dataset - Sheet1.csv'
# The combined, preprocessed data will be saved to 'data/combined_preprocessed_influencer_data.csv'
output_combined_data_path = '../data/combined_preprocessed_influencer_data.csv'

# --- Run the data preprocessing ---
# This will load existing data, generate dummy data, combine, clean, and save it.
print("Starting Data Cleaning and Preprocessing...")
preprocessed_df = preprocess_and_combine_data(
    existing_data_path=existing_data_input_path,
    output_file_path=output_combined_data_path
)

if preprocessed_df is not None and not preprocessed_df.empty:
    print("\nData preprocessing complete. DataFrame head:")
    print(preprocessed_df.head())
    print(f"Total rows after preprocessing: {len(preprocessed_df)}")
else:
    print("No preprocessed data available.")


Starting Data Cleaning and Preprocessing...
Loaded existing data from ../data/dataset - Sheet1.csv. Initial shape: (599, 8)
Combined data. Total shape: (599, 8)

Starting data preprocessing...
Data preprocessing complete. Final DataFrame head:
    platform   username   post_date  \
0  Instagram  shanudrie  07/07/2025   
1  Instagram  shanudrie  07/11/2025   
2  Instagram  shanudrie  07/10/2025   
3  Instagram  shanudrie  07/11/2025   
4  Instagram  shanudrie               

                                        caption_text post_type  likes  \
0  Back at one of my most favorite go-to spots in...     Video  19300   
1  My first time trying crazy fusion food at a fo...     Image  10900   
2  Here are the answers from my doctor himself, i...     Video   7749   
3  Had a long day? Refresh your self with the sce...     Video   3190   
4  Sunday vlogging just got better with the @anch...     Video  17700   

   comments                                           hashtags  \
0        58     

In [2]:
# notebooks/influencer_recommender_model.ipynb - New Cell (This is the code you should run now)

import pandas as pd
# Import the recommend_influencers function from our new src/recommender_utils.py module
from src.recommender_utils import recommend_influencers

# --- Define User's Business Description ---
user_business_description_input = "book"

# --- Define the path to your preprocessed data file ---
# This file was generated in Step 3.
preprocessed_data_path = '../data/combined_preprocessed_influencer_data.csv'

# --- Run the influencer recommendation process ---
print("Starting Influencer Recommendation (Feature Extraction & Similarity Calculation)...")
recommended_influencers_df = recommend_influencers(
    user_business_description=user_business_description_input,
    data_file_path=preprocessed_data_path,
    top_n=5 # Number of top influencers to recommend
)

if recommended_influencers_df is not None:
    print("\nRecommendation Process Complete!")
    print("Top Recommended Influencers:")
    print(recommended_influencers_df)
else:
    print("Influencer recommendation failed. Please check previous output for errors.")


Starting Influencer Recommendation (Feature Extraction & Similarity Calculation)...
Loaded preprocessed data from ../data/combined_preprocessed_influencer_data.csv. Shape: (599, 10)
Aggregated content for 20 unique influencers.
Influencer profiles (aggregated text & avg engagement) created.

Starting TF-IDF Vectorization...
TF-IDF vectorization complete. Text data converted to numerical vectors.

Starting Cosine Similarity Calculation...
Cosine similarity calculation complete. Scores added to DataFrame.

Top Recommended Influencers (based on current processing):
          username  similarity_score     avg_likes  avg_comments
8       luna_peech          0.318635    528.766667     35.366667
0   _wildcookbook_          0.000000   5384.133333    136.833333
11  praveenaonline          0.000000   7386.600000     52.733333
18     vinu_speaks          0.000000   6549.966667    113.466667
17  umariaofficial          0.000000  44660.900000   3623.033333

Recommendation Process Complete!
Top Rec

In [3]:

 # --- Define file paths ---
# Make sure 'dataset - Sheet1.csv' is in your 'data/' folder.
existing_data_input_path = '../data/dataset - Sheet1.csv'
# The combined, preprocessed data will be saved to 'data/combined_preprocessed_influencer_data.csv'
output_combined_data_path = '../data/combined_preprocessed_influencer_data.csv'

# all influencer username get
import pandas as pd

def get_score_by_influencer_score(influencer_name):
    """
    Get score and average likes for a specific influencer
    """
    try:
        # Load the influencer scores CSV
        df = pd.read_csv('../data/influencer_scores.csv')

        # Filter for the specific influencer
        influencer_data = df[df['username'] == influencer_name]

        if influencer_data.empty:
            print(f"❌ Influencer '{influencer_name}' not found in the dataset!")
            return None

        # Extract the data for the influencer
        row = influencer_data.iloc[0]
        avg_likes = row['avg_likes']
        score = row['score']
        total_likes = row['total_likes']
        post_count = row['post_count']

        # Display the results
        print(f"📊 Influencer: @{influencer_name}")
        print(f"   💖 Average Likes per Post: {avg_likes:,.0f}")
        print(f"   🎯 Score: {score:,.2f}")
        print(f"   ❤️  Total Likes: {total_likes:,.0f}")
        print(f"   📝 Total Posts: {post_count}")

        return {
            'username': influencer_name,
            'avg_likes': avg_likes,
            'score': score,
            'total_likes': total_likes,
            'post_count': post_count
        }

    except FileNotFoundError:
        print("❌ Error: influencer_scores.csv file not found!")
        return None
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

def get_all_influencer_usernames():
    """
    Extract all unique influencer usernames from the dataset
    """
    try:
        # Load the data
        df = pd.read_csv(output_combined_data_path)

        # Get unique usernames
        unique_usernames = df['username'].unique()

        print(f"Total unique influencers found: {len(unique_usernames)}")
        print("\nAll Influencer Usernames:")
        print("-" * 40)



        return unique_usernames

    except FileNotFoundError:
        print(f"Error: File '{output_combined_data_path}' not found!")
        return None
    except KeyError:
        print("Error: 'username' column not found in the dataset!")
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

get_all_influencer_usernames()

# Test the function with some examples
print("🔍 Testing the function with sample influencers:")
print("=" * 50)

# Test with a known influencer from the CSV
test_influencer = get_score_by_influencer_score("sarangadisasekara")

print("\n" + "-" * 50)

# Test with another influencer
test_influencer2 = get_score_by_influencer_score("yohanimusic")

print("\n" + "-" * 50)

# Test with a non-existent influencer
test_influencer3 = get_score_by_influencer_score("nonexistent_user")


Total unique influencers found: 20

All Influencer Usernames:
----------------------------------------
🔍 Testing the function with sample influencers:
📊 Influencer: @sarangadisasekara
   💖 Average Likes per Post: 230,492
   🎯 Score: 345,737.65
   ❤️  Total Likes: 6,914,753
   📝 Total Posts: 30

--------------------------------------------------
📊 Influencer: @yohanimusic
   💖 Average Likes per Post: 44,720
   🎯 Score: 67,080.75
   ❤️  Total Likes: 1,341,615
   📝 Total Posts: 30

--------------------------------------------------
❌ Influencer 'nonexistent_user' not found in the dataset!
