In [3]:
import time, re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

# ===== CONFIGURATION =====
MAX_REELS = 5  # Number of reels to scrape; adjust as needed (10, 20, 50, 100, etc.)
username_target = 'batikula'  # Instagram username of the profile to scrape

def parse_number(text):
    """Convert a human-readable count (``1.2M``, ``500K``, ``1,5 rb``) to an int.

    Supported magnitude suffixes (case-insensitive): ``JT`` (million) and
    ``RB`` (thousand) for the Indonesian UI, ``M`` and ``K`` for the
    international one. A suffix only counts when it directly follows a
    number (optionally after whitespace) and is not the start of a longer
    word, so the ``K`` in ``1,234 likes`` is not mistaken for "thousand".

    With a suffix, the last ``.`` or ``,`` in the number is the decimal
    separator (``1.2M`` and ``1,2 jt`` are both 1,200,000). Without a suffix,
    separators are thousands grouping and every digit found in the text is
    concatenated (``245,478`` -> 245478).

    Args:
        text: Any value; it is converted with ``str()``. Falsy values give 0.

    Returns:
        The parsed count, or 0 when nothing numeric is found or parsing fails.
    """
    try:
        if not text:
            return 0

        text = str(text).strip()
        multipliers = {'JT': 1000000, 'RB': 1000, 'M': 1000000, 'K': 1000}

        # Number immediately followed by a magnitude suffix that is not part
        # of a longer word (negative lookahead on letters).
        match = re.search(r'(\d[\d.,]*)(?:\s*(JT|RB|M|K)(?![A-Z]))', text.upper())
        if match:
            token = match.group(1).rstrip('.,')
            last_sep = max(token.rfind('.'), token.rfind(','))
            if last_sep != -1:
                # Last separator is the decimal point; earlier ones are grouping.
                whole = token[:last_sep].replace('.', '').replace(',', '')
                token = f"{whole}.{token[last_sep + 1:]}"
            # round() guards against float artefacts such as 4.35 * 100 == 434.99...
            return int(round(float(token) * multipliers[match.group(2)]))

        # Plain number: keep the digits only.
        numbers = re.findall(r'\d+', text)
        if numbers:
            return int(''.join(numbers))
        return 0
    except Exception as e:
        print(f"‚ùå Error parsing '{text}': {e}")
        return 0

def get_element_text_safe(driver, selectors, default=''):
    """Return the text of the first selector that yields non-empty text.

    Each CSS selector is tried in order with ``driver.find_element``; lookup
    failures are ignored so that later selectors still get a chance.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        selectors: Iterable of CSS selector strings, most preferred first.
        default: Value returned when no selector produces non-empty text.

    Returns:
        The stripped element text, or ``default``.
    """
    for selector in selectors:
        try:
            text = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Best effort: a missing or stale element just means "try the next
            # selector". `Exception` (not a bare except) keeps Ctrl+C working.
            continue
        if text:
            return text
    return default

def extract_likes_from_page(driver):
    """Extract the like count from the currently loaded reel page.

    Strategies are tried in order and the first one that yields a positive
    count wins. Within a strategy the largest parsed number is taken. The
    catch-all ``span`` strategy is therefore only a fallback; if it always
    ran, it would override the targeted selectors with any larger number
    found anywhere on the page.

    Args:
        driver: Selenium WebDriver positioned on a reel page.

    Returns:
        The like count as an int, or 0 when nothing could be found.
    """
    view_words = ('view', 'tayangan', 'penayangan')
    strategies = [
        # Strategy 1: spans inside the like button / action bar
        [
            "section._ae5m button span",
            "article section button span",
            "button[aria-label*='like'] span",
            "button[aria-label*='suka'] span",
        ],
        # Strategy 2 (fallback): any span on the page
        ["span"],
    ]

    for selectors in strategies:
        likes = 0
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, ", ".join(selectors))
            for element in elements:
                text = element.text.strip()

                # Skip empty texts and view counters
                if not text or any(word in text.lower() for word in view_words):
                    continue

                # Without a digit there is nothing to parse
                if not any(char.isdigit() for char in text):
                    continue

                likes = max(likes, parse_number(text))
        except Exception:
            # Best effort (e.g. stale elements): keep whatever was parsed so far
            pass

        if likes > 0:
            return likes

    return 0

def extract_views_from_page(driver):
    """Extract the view count from the currently loaded reel page.

    Scans every ``<span>`` for a view indicator word (English or Indonesian)
    and parses the first number found in that text, together with an optional
    magnitude suffix (``k``, ``m``, ``rb``, ``jt``). The number must start
    with a digit, so letters of the indicator word itself (e.g. the ``t`` in
    ``tayangan``) are never mistaken for a count.

    Args:
        driver: Selenium WebDriver positioned on a reel page.

    Returns:
        The view count as an int, or 0 when none was found or an error occurred.
    """
    views = 0
    indicators = ('view', 'tayangan', 'penayangan')
    # Digits with separators, optionally followed by a stand-alone suffix
    count_pattern = re.compile(r'\d[\d.,]*(?:\s*(?:jt|rb|m|k)\b)?')

    try:
        for span in driver.find_elements(By.TAG_NAME, 'span'):
            text = span.text.strip().lower()
            if not any(indicator in text for indicator in indicators):
                continue

            match = count_pattern.search(text)
            if match:
                views = parse_number(match.group(0))
                if views > 0:
                    break
    except Exception as e:
        print(f"‚ö†Ô∏è Error getting views: {e}")

    return views

def extract_comments_count(driver):
    """Extract the total number of comments from the current reel page.

    First looks at spans inside the comment button (several selector
    variants). If none gives a positive number, falls back to scanning the
    text of every button that mentions "comment"/"komentar".

    Args:
        driver: Selenium WebDriver positioned on a reel page.

    Returns:
        The comment count as an int, or 0 when it could not be determined.
    """
    # Digits with separators, optionally followed by a stand-alone suffix
    count_pattern = re.compile(r'\d[\d.,]*(?:\s*(?:jt|rb|m|k)\b)?')

    try:
        # Comment button or nearby indicators
        selectors = [
            "button[aria-label*='comment'] span",
            "button[aria-label*='komentar'] span",
            "section._ae5m button:nth-child(2) span"
        ]

        for selector in selectors:
            try:
                for element in driver.find_elements(By.CSS_SELECTOR, selector):
                    text = element.text.strip()
                    if text and any(char.isdigit() for char in text):
                        count = parse_number(text)
                        if count > 0:
                            return count
            except Exception:
                # Best effort: try the next selector
                continue

        # Alternative: comment count embedded in a button's own text
        for button in driver.find_elements(By.TAG_NAME, 'button'):
            button_text = button.text.strip().lower()
            if not any(word in button_text for word in ['comment', 'komentar']):
                continue
            match = count_pattern.search(button_text)
            if match:
                count = parse_number(match.group(0))
                if count > 0:
                    return count

    except Exception as e:
        print(f"‚ö†Ô∏è Error getting comment count: {e}")

    return 0

# === Driver setup ===
# Chrome is started maximized, with notification prompts and popup blocking
# disabled, and with a fixed desktop user-agent string.
chrome_options = Options()
for chrome_flag in (
    "--start-maximized",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

# webdriver_manager resolves the chromedriver binary path for the Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

print(f"üéØ Target: @{username_target}")
print(f"üìä Jumlah Reels yang akan di-scrape: {MAX_REELS}")

# === Manual login ===
# The user logs in by hand in the opened browser window; the script simply
# waits a fixed 30 seconds before continuing.
driver.get('https://www.instagram.com/')
print("üîê Silakan login secara manual. Tunggu 30 detik...")
time.sleep(30)

# === Target profile ===
profile_url = f'https://www.instagram.com/{username_target}/'
driver.get(profile_url)
print(f"\nüîç Mengakses profil: {profile_url}")

try:
    # The <main> element appearing is used as the "profile page loaded" signal
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
    time.sleep(5)
    print("‚úÖ Profil berhasil dimuat")
except Exception:
    # `Exception` rather than a bare except so Ctrl+C is not reported as a
    # load failure. Close the browser and stop with a non-zero status instead
    # of the interactive-only `exit()` helper (which exits with status 0).
    print("‚ùå Gagal memuat profil")
    driver.quit()
    raise SystemExit(1)

# === Collect profile data ===
print("\nüìå Mengambil data profil...")
username = username_target
display_name, kategori_profil, bio, tautan = 'N/A', 'N/A', 'N/A', 'N/A'
total_posts, total_followers, total_following = 0, 0, 0

# Display name: try several selectors, fall back to the username
display_name_selectors = [
    "header section div h2",
    "header section h2", 
    "section.xc3tme8 > div:nth-child(1) span",
    "h2.x1lliihq"
]
display_name = get_element_text_safe(driver, display_name_selectors, username_target)

# Profile category
try:
    kategori_profil = driver.find_element(By.CSS_SELECTOR, "div._ap3a._aaco._aacu._aacy._aad6._aade").text
except Exception:
    print("‚ö†Ô∏è Kategori profil tidak ditemukan.")

# External link
try:
    tautan = driver.find_element(By.CSS_SELECTOR, "a[href*='l.instagram.com']").text
except Exception:
    print("‚ö†Ô∏è Tautan tidak ditemukan.")

# Bio: heuristic - the first longer span that mentions a handle, WhatsApp,
# Shopee or Instagram is taken as the bio text
try:
    bio_elements = driver.find_elements(By.CSS_SELECTOR, "span._ap3a")
    for el in bio_elements:
        txt = el.text.strip()
        if len(txt) > 10 and ('@' in txt or 'WA' in txt or 'shopee' in txt.lower() or 'instagram' in txt.lower()):
            bio = txt.replace('\n', ' ')
            break
except Exception:
    print("‚ö†Ô∏è Bio tidak ditemukan.")

# Profile statistics (posts / followers / following)
try:
    stats = driver.find_elements(By.CSS_SELECTOR, "ul.x78zum5 > li.xl565be, header section ul li")
    for stat in stats:
        text = stat.text.lower()
        # split() (any whitespace) so a count separated from its label by a
        # newline is still isolated, e.g. "245k\nfollowers" -> "245k"
        if 'post' in text:
            total_posts = parse_number(text.split()[0])
        elif 'follower' in text:
            try:
                # The title attribute is preferred over the visible text here
                total_followers = parse_number(stat.find_element(By.CSS_SELECTOR, "span[title]").get_attribute('title'))
            except Exception:
                total_followers = parse_number(text.split()[0])
        elif 'following' in text:
            total_following = parse_number(text.split()[0])
except Exception:
    print("‚ö†Ô∏è Gagal mengambil statistik.")

print(f"‚úÖ Username        : {username}")
print(f"‚úÖ Display Name    : {display_name}")
print(f"‚úÖ Kategori        : {kategori_profil}")
print(f"‚úÖ Bio             : {bio}")
print(f"‚úÖ Tautan          : {tautan}")
print(f"üìä Statistik       : {total_posts} posts, {total_followers} followers, {total_following} following")

# === Open the Reels tab ===
print("\n‚û°Ô∏è Menuju tab Reels...")
try:
    # Most specific selector first. The generic one matches every link the
    # tablist one matches, so trying it first would make the specific
    # selector unreachable.
    reels_tab_selectors = [
        "div[role='tablist'] a[href*='/reels/']",
        "a[href*='/reels/']"
    ]
    
    reels_tab = None
    for selector in reels_tab_selectors:
        try:
            reels_tab = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            break
        except TimeoutException:
            # Selector did not become clickable in time; try the next one
            continue
    
    if reels_tab:
        reels_tab.click()
        time.sleep(5)
        print("‚úÖ Tab Reels berhasil dibuka")
    else:
        # Fallback: direct URL
        driver.get(f"https://www.instagram.com/{username_target}/reels/")
        time.sleep(5)
        
except Exception as e:
    # Any other failure (e.g. the click being intercepted): go straight to the URL
    print(f"‚ö†Ô∏è Menggunakan direct URL untuk reels: {e}")
    driver.get(f"https://www.instagram.com/{username_target}/reels/")
    time.sleep(5)

# === Scroll and collect reel links ===
print(f"\nüîÅ Mengumpulkan {MAX_REELS} reels...")
reels_summary_data, scraped_links = [], set()
scroll_attempts = 0
max_scroll_attempts = 20  # upper bound so a short profile cannot loop forever

while len(reels_summary_data) < MAX_REELS and scroll_attempts < max_scroll_attempts:
    scroll_attempts += 1
    
    # Find every reel link currently present in the DOM
    reel_links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/reel/"]')
    
    for link_element in reel_links:
        if len(reels_summary_data) >= MAX_REELS:
            break
            
        link = link_element.get_attribute("href")
        if not link or link in scraped_links:
            continue

        # Try to read the view count from the thumbnail (if present)
        views_from_thumbnail = ''
        try:
            view_element = link_element.find_element(By.CSS_SELECTOR, "div._aajy span > span, span.x1lliihq, div._ac2a span")
            views_from_thumbnail = view_element.text.strip()
        except Exception:
            # The thumbnail counter is optional; `Exception` keeps Ctrl+C working
            pass

        print(f"üéûÔ∏è {len(reels_summary_data)+1}. {link} | Thumbnail Views: {views_from_thumbnail}")
        reels_summary_data.append({
            'url_reel': link, 
            'views_thumbnail': views_from_thumbnail
        })
        scraped_links.add(link)

    if len(reels_summary_data) < MAX_REELS:
        # Scroll to the bottom so that more thumbnails get loaded
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        print(f"   Scroll {scroll_attempts}: {len(reels_summary_data)} reels ditemukan")

reels_to_process = reels_summary_data[:MAX_REELS]
print(f"\n‚úÖ Total reels yang akan diproses: {len(reels_to_process)}")

# === Visit each reel and collect its details ===
# Produces `all_data`: one row per scraped comment (or a single row with an
# empty comment when a reel has none), each row prefixed with the profile data.
print(f"\nüîç Mengambil detail dari {len(reels_to_process)} Reels...")
all_data = []

for idx, reel in enumerate(reels_to_process):
    link = reel['url_reel']
    views_thumbnail = reel['views_thumbnail']
    # Take the path segment after "/reel/" so the ID is found regardless of a
    # trailing slash or query string
    id_match = re.search(r'/reel/([^/?#]+)', link)
    id_reel = id_match.group(1) if id_match else ''

    print(f"\n‚û°Ô∏è {idx+1}/{len(reels_to_process)} - Processing: {link}")
    driver.get(link)
    
    try:
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "article")))
        time.sleep(5)
    except Exception:
        print("‚ö†Ô∏è Halaman tidak termuat, skip.")
        continue

    # === EXTRACT DATA ===
    caption, hashtags, upload_time = '', '', ''
    likes, views, comments_count_total = 0, 0, 0
    comments_list = []

    # Caption + hashtags
    caption_selectors = [
        "article div h1",
        "div._a9zs h1", 
        "h1"
    ]
    
    try:
        caption_full = get_element_text_safe(driver, caption_selectors)
        if caption_full:
            hashtags_found = re.findall(r"#\w+", caption_full)
            hashtags = ' '.join(hashtags_found)
            # The caption column holds the text with the hashtags removed
            caption = re.sub(r'#\w+\s*', '', caption_full).strip()
    except Exception as e:
        print(f"‚ö†Ô∏è Caption error: {e}")

    # Upload time: prefer the datetime attribute, fall back to title
    try:
        time_element = driver.find_element(By.TAG_NAME, 'time')
        upload_time = time_element.get_attribute('datetime')
        if not upload_time:
            upload_time = time_element.get_attribute('title')
    except Exception:
        # Upload time is optional; leave it empty when <time> is missing
        pass

    # === EXTRACT ENGAGEMENT METRICS ===
    print("üìä Mengambil likes, views, comments...")
    
    # Likes
    likes = extract_likes_from_page(driver)
    
    # Views
    views = extract_views_from_page(driver)
    # Fall back to the thumbnail view count when the page shows none
    if views == 0 and views_thumbnail:
        views = parse_number(views_thumbnail)
    
    # Comments count
    comments_count_total = extract_comments_count(driver)

    print(f"   üìà Likes: {likes:,} | Views: {views:,} | Comments: {comments_count_total:,}")

    # === EXTRACT COMMENTS ===
    print("üí¨ Mengambil komentar...")
    
    # Try to load more comments (at most `load_attempts` clicks)
    load_attempts = 3
    for attempt in range(load_attempts):
        try:
            load_more_selectors = [
                "//button[.//*[contains(@aria-label, 'Load more comments')]]",
                "//button[.//*[contains(@aria-label, 'Muat komentar lainnya')]]",
                "//button[contains(text(), 'Load more')]",
                "//button[contains(text(), 'Muat')]"
            ]
            
            load_more = None
            for selector in load_more_selectors:
                try:
                    load_more = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, selector)))
                    break
                except TimeoutException:
                    continue
            
            if load_more:
                # Click through JavaScript after scrolling the button into view
                driver.execute_script("arguments[0].scrollIntoView(); arguments[0].click();", load_more)
                time.sleep(2)
                print(f"   ‚úÖ Load more comments #{attempt+1}")
            else:
                break
                
        except Exception:
            break

    # Extract comments: the first selector that yields any text wins
    try:
        comment_selectors = [
            "ul._a9ym span._aade",
            "div._a9zr span",
            "article div span._aacl._aaco._aacu._aacx._aad7._aade",
            "span[dir='auto']"
        ]
        
        for selector in comment_selectors:
            try:
                comment_elements = driver.find_elements(By.CSS_SELECTOR, selector)
                temp_comments = []
                seen_comments = set()  # O(1) duplicate check, order kept by the list
                for el in comment_elements:
                    comment_text = el.text.strip()
                    if comment_text and len(comment_text) > 2 and comment_text not in seen_comments:
                        seen_comments.add(comment_text)
                        temp_comments.append(comment_text)
                
                if temp_comments:
                    comments_list = temp_comments
                    break
            except Exception:
                continue
                
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting comments: {e}")
    
    final_comments = comments_list[:50]  # Limit to 50 comments per reel
    comments_scraped = len(final_comments)
    print(f"‚úÖ Komentar berhasil diambil: {comments_scraped} dari total {comments_count_total}")

    # === Append rows to the result list ===
    data_profil = [
        username, display_name, kategori_profil, bio, tautan, 
        total_posts, total_followers, total_following
    ]
    
    if not final_comments:
        all_data.append(data_profil + [
            id_reel, link, caption, hashtags, likes, views, 
            comments_count_total, comments_scraped, upload_time, ''
        ])
    else:
        for comment in final_comments:
            all_data.append(data_profil + [
                id_reel, link, caption, hashtags, likes, views,
                comments_count_total, comments_scraped, upload_time, comment
            ])

    time.sleep(2)  # Pause between reels

# === Save to Excel ===
df = pd.DataFrame(all_data, columns=[
    'username', 'display_name', 'kategori_profil', 'bio', 'tautan',
    'total_posts', 'total_followers', 'total_following',
    'id_reel', 'url_reel', 'caption', 'hashtags', 
    'likes', 'views', 'comments_count_total', 'comments_scraped',
    'upload_time', 'comment'
])

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f'hasil_scrape_reels_{username}_{MAX_REELS}reels_{timestamp}.xlsx'
df.to_excel(output_file, index=False)

print(f"\nüéâ SCRAPING SELESAI!")
print(f"üìÅ File disimpan: {output_file}")
print(f"üìä Total baris data: {len(df):,}")

# Summary statistics
if not df.empty:
    # `df` has one row per comment, so a reel's likes/views/comments are
    # repeated on every one of its rows. Collapse to one row per reel first;
    # otherwise each reel would be counted once per scraped comment.
    per_reel = df.drop_duplicates(subset='url_reel')

    print(f"\nüìà RINGKASAN STATISTIK:")
    print(f"   Total Reels: {len(reels_to_process)}")
    print(f"   Rata-rata Likes: {per_reel['likes'].mean():,.0f}")
    print(f"   Rata-rata Views: {per_reel['views'].mean():,.0f}")
    print(f"   Rata-rata Comments: {per_reel['comments_count_total'].mean():,.0f}")
    print(f"   Total Likes: {per_reel['likes'].sum():,}")
    print(f"   Total Views: {per_reel['views'].sum():,}")
    print(f"   Total Comments: {per_reel['comments_count_total'].sum():,}")

driver.quit()
print("\n‚úÖ Browser ditutup. Terima kasih!")

🎯 Target: @batikula
📊 Jumlah Reels yang akan di-scrape: 5
🔐 Silakan login secara manual. Tunggu 30 detik...

🔍 Mengakses profil: https://www.instagram.com/batikula/
✅ Profil berhasil dimuat

📌 Mengambil data profil...
✅ Username        : batikula
✅ Display Name    : batikula
✅ Kategori        : Clothing (Brand)
✅ Bio             : Katalog detail produk: @batikula_katalog WA: 0812 9496 2018 Shopee Indonesia ⤵️
✅ Tautan          : shopee.co.id/batikula
📊 Statistik       : 1666 posts, 245478 followers, 281 following

➡️ Menuju tab Reels...
✅ Tab Reels berhasil dibuka

🔁 Mengumpulkan 5 reels...
🎞️ 1. https://www.instagram.com/batikula/reel/DMxQwhuzH5f/ | Thumbnail Views: 
🎞️ 2. https://www.instagram.com/batikula/reel/DMr0fegzV4-/ | Thumbnail Views: 
🎞️ 3. https://www.instagram.com/batikula/reel/DMl5_OLTqCz/ | Thumbnail Views: 
🎞️ 4. https://www.instagram.com/batikula/reel/DMjj_syzHqa/ | Thumbnail Views: 
   Scroll 1: 4 reels di