In [None]:
import requests
from bs4 import BeautifulSoup
import re
import random
import time
import pandas as pd
import os

# Search Google for each actor's name together with the word "Instagram" (this gives better results)
# and read the actor's number of followers from the search results.
# User-Agent strings sent with the search requests; get_random_user_agent() picks one of them at random.
# Put your own user agent here.
# NOTE(review): the entry below embeds a personal name and e-mail address — replace it before reusing
# or sharing this code.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36; Sofia Villamil / sofia.v1999@gmail.com'    
]

def get_random_user_agent():
    """Return one entry of the module-level ``user_agents`` list, chosen at random."""
    chosen_agent = random.choice(user_agents)
    return chosen_agent

def log_request_info(search_query, attempt, error=None):
    """Append a record of one search attempt to ``scraping_log.txt``.

    Args:
        search_query: The query text the attempt was made for.
        attempt: The attempt number to record.
        error: Optional error description. When it is ``None`` no error line
            is written.
    """
    lines = [f"Attempt {attempt} for {search_query}\n"]
    # Compare against None (not truthiness) so an exception whose message is
    # the empty string still leaves an "Error:" line in the log.
    if error is not None:
        lines.append(f"Error: {error}\n")
    # Explicit UTF-8: queries contain people's names, which the platform's
    # default encoding is not guaranteed to be able to represent.
    with open("scraping_log.txt", "a", encoding="utf-8") as log_file:
        log_file.writelines(lines)

def search_followers(search_query, retries=3):
    """Search Google for ``search_query`` and extract a follower count from the result snippets.

    The result page is scanned for ``div`` elements whose class matches the
    snippet pattern; inside them, the first ``span`` mentioning "Followers" or
    "seguidores" that contains a number supplies the count.

    Args:
        search_query: Text to search for.
        retries: Maximum number of request attempts.

    Returns:
        The count as a string with thousands separators removed, keeping any
        decimal part and ``K``/``M`` suffix (e.g. ``"1234"``, ``"250K"``,
        ``"1.2M"``), or ``None`` when no count was found or every attempt
        failed.

    Side effects:
        Prints progress messages, records every attempt through
        ``log_request_info`` and sleeps between failed attempts.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    from urllib.parse import quote_plus

    snippet_class = re.compile(r'VwiC3b.*yXK7lf.*r025kc.*hJNv6b')
    # Digits with optional thousands separators, an optional decimal part and an
    # optional K/M suffix. Without the decimal part "1.2M" would be cut down to "1".
    count_pattern = re.compile(r'(\d+(?:,\d+)*(?:\.\d+)?[MK]?)')

    # Percent-encode the query: characters such as '&', '#' or '+' inside a name
    # would otherwise be interpreted as URL syntax and corrupt the search.
    search_url = f"https://www.google.com/search?q={quote_plus(str(search_query))}"

    for attempt in range(retries):
        has_retry_left = attempt < retries - 1
        try:
            headers = {'User-Agent': get_random_user_agent()}
            print(f"Searching for {search_query}: {search_url}")
            response = requests.get(search_url, headers=headers, timeout=10)

            log_request_info(search_query, attempt + 1)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                for element in soup.find_all('div', class_=snippet_class):
                    for span in element.find_all('span'):
                        span_text = span.get_text()
                        if "Followers" not in span_text and "seguidores" not in span_text:
                            continue
                        match = count_pattern.search(span_text)
                        if match:
                            # Strip thousands separators from the matched count.
                            return match.group(1).replace(',', '')

                # A successful response without a count is final; retrying
                # the same query would not change the page.
                print(f"Followers count not found in search results for {search_query}.")
                return None

            if response.status_code == 429:
                print(f"Rate limited for {search_query}, status code: {response.status_code}")
                # Long pause before retrying, but no point waiting after the last attempt.
                if has_retry_left:
                    time.sleep(random.uniform(30, 60))
            else:
                print(f"Failed to retrieve search results for {search_query}, status code: {response.status_code}")
                # Back off before retrying instead of re-sending immediately.
                if has_retry_left:
                    time.sleep(random.uniform(2, 5) * (2 ** attempt))

        except Exception as e:
            # Best-effort boundary: any failure of one attempt is logged and retried.
            print(f"An error occurred for {search_query} on attempt {attempt + 1}: {e}")
            log_request_info(search_query, attempt + 1, error=str(e))
            if has_retry_left:
                # Exponential back-off with jitter.
                time.sleep(random.uniform(2, 5) * (2 ** attempt))
            else:
                return None

    # Every attempt ended with a non-200 status (or retries was 0).
    return None

def get_instagram_followers_from_google(name):
    """Return the follower count found for ``name``, trying two query phrasings.

    The first search is restricted with ``site:instagram.com``; if that gives
    no (truthy) result, a second search for ``"<name> Instagram"`` is made and
    its result is returned as-is.
    """
    primary_result = search_followers(f"{name} site:instagram.com")
    if primary_result:
        return primary_result

    print(f"Retrying with a different query for {name}")
    return search_followers(f"{name} Instagram")

def process_names(names, start_index=0):
    """Look up the Instagram follower count for each name, pausing between lookups.

    Args:
        names: Indexable sequence of names to look up.
        start_index: Position in ``names`` at which to start; entries before
            it are skipped.

    Returns:
        A list of ``(name, followers)`` tuples, one per processed name, in
        processing order.

    Side effects:
        Writes the rows gathered so far by this call to ``temp_results.csv``
        (overwriting the file) after every second name and after the last
        one, and sleeps 10-30 seconds between consecutive lookups.
    """
    columns = ['name', 'Instagram Followers']
    results = []
    total = len(names)

    for i in range(start_index, total):
        name = names[i]
        followers = get_instagram_followers_from_google(name)
        results.append((name, followers))

        is_last = i == total - 1

        # Save progress every 2 names, and always after the final name so the
        # last result is not missing from the checkpoint. Saving happens before
        # the pause so an interruption during the wait cannot lose this row.
        if (i + 1) % 2 == 0 or is_last:
            temp_df = pd.DataFrame(results, columns=columns)
            temp_df.to_csv('temp_results.csv', index=False)
            print(f"Progress saved at index {i + 1}")

        # Pause between lookups; waiting after the final one serves no purpose.
        if not is_last:
            time.sleep(random.uniform(10, 30))

    return results

# Load the Excel file
file_path = r'Youtube and Instagram Data/data_final_actores_full.xlsx'
df = pd.read_excel(file_path)

names = df['name'].tolist()

# Select the names to scrape.
# The request limit is low (approximately 80 names), so the scraping was done in chunks of 50 names,
# each chunk saved to a different Excel file, and the files were then joined manually.
names = names[:10]

# Resume from the checkpoint that process_names writes. It is a CSV file named
# 'temp_results.csv', so it has to be looked up under that name and read with read_csv.
checkpoint_path = 'temp_results.csv'
result_columns = ['name', 'Instagram Followers']
previous_df = None
start_index = 0

if os.path.exists(checkpoint_path):
    # Read the counts as text so resumed rows look like freshly scraped ones ("1234", "250K", ...).
    checkpoint_df = pd.read_csv(checkpoint_path, dtype={'Instagram Followers': str})
    processed_names = checkpoint_df['name'].tolist()
    # Only trust the checkpoint when it covers exactly the first names of the current selection;
    # a leftover file from another chunk would otherwise make us skip names that were never scraped.
    if processed_names and processed_names == names[:len(processed_names)]:
        previous_df = checkpoint_df
        start_index = len(processed_names)
    else:
        print(f"Ignoring {checkpoint_path}: it does not match the current selection of names")

# Pass the full list: process_names itself skips the first start_index entries,
# so slicing here as well would skip names twice.
followers_list = process_names(names, start_index)
new_df = pd.DataFrame(followers_list, columns=result_columns)

# Combine with previously processed data if any. The checkpoint rows loaded above are reused
# because process_names has overwritten the file by now.
if previous_df is not None:
    followers_df = pd.concat([previous_df, new_df], ignore_index=True)
else:
    followers_df = new_df

# Keep one row per name: repeated names would multiply rows in the merge below.
followers_df = followers_df.drop_duplicates(subset='name', keep='first')

# Merge with original dataframe
df = df.merge(followers_df, on='name', how='left')

# Save back to an Excel file
output_file_path = r'Youtube and Instagram Data/data_final_actores_full_2.xlsx'
df.to_excel(output_file_path, index=False)

print("Finished processing and saved to", output_file_path)
