In [None]:
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import re
import math
from datetime import datetime

def parse_number(text):
    """Convert a human-readable count ('1.2M', '500K', '1,234') to an int.

    Recognised magnitude suffixes (case-insensitive) are ``K``/``RB``
    (thousand) and ``M``/``JT`` (million). A suffix only counts when it
    directly follows the number (optionally after whitespace) and is not the
    start of a longer word, so '1,234 likes' is not mistaken for a 'K' value.

    With a suffix, the last '.' or ',' in the number is the decimal separator
    ('1.2M' and '1,2 jt' are both 1,200,000). Without a suffix, separators are
    treated as thousands grouping and all digits in the text are joined.

    Args:
        text: Value to parse; converted with ``str()``. Falsy values yield 0.

    Returns:
        The parsed count, or 0 when no number can be extracted.
    """
    try:
        if not text:
            return 0

        cleaned = str(text).strip().upper()
        match = re.search(r'(\d[\d.,]*)\s*(JT|RB|M|K)?(?![A-Z])', cleaned)
        if match is None:
            return 0

        mantissa, suffix = match.groups()
        if suffix is None:
            # Plain number: '.' and ',' are grouping characters only.
            return int(''.join(re.findall(r'\d+', cleaned)))

        multiplier = 1_000_000 if suffix in ('M', 'JT') else 1_000
        mantissa = mantissa.rstrip('.,').replace(',', '.')
        if mantissa.count('.') > 1:
            # Keep only the last separator as the decimal point.
            whole, _, fraction = mantissa.rpartition('.')
            mantissa = whole.replace('.', '') + '.' + fraction
        # round() rather than int(): 2.3 * 1000 is 2299.9999999999995.
        return round(float(mantissa) * multiplier)
    except (TypeError, ValueError) as e:
        print(f"Error parsing number '{text}': {e}")
        return 0

def extract_hashtags_and_clean_caption(caption_text):
    """Split a caption into its hashtag-free text and its hashtags.

    Args:
        caption_text: Raw caption. Falsy input yields two empty strings.

    Returns:
        A ``(clean_caption, hashtags_string)`` tuple: the caption with every
        ``#word`` removed and whitespace collapsed, and the hashtags joined
        with ', '. If processing fails, the input is returned unchanged
        together with an empty hashtag string.
    """
    tag_pattern = r'#\w+'
    try:
        if not caption_text:
            return '', ''

        found_tags = re.findall(tag_pattern, caption_text)
        without_tags = re.sub(tag_pattern, '', caption_text)

        # split()/join collapses whitespace runs and trims both ends.
        collapsed = ' '.join(without_tags.split())
        return collapsed, ', '.join(found_tags)
    except Exception as e:
        print(f"‚ùå Error dalam extract_hashtags_and_clean_caption: {e}")
        return caption_text, ''

def scroll_and_wait(driver, pause_time=2):
    """Scroll the page to the bottom, then sleep for ``pause_time`` seconds."""
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(jump_to_bottom)
    time.sleep(pause_time)

def get_element_text_safe(driver, selectors, default=''):
    """Return the text of the first selector that yields non-empty text.

    Args:
        driver: Object exposing ``find_element`` (the Selenium driver).
        selectors: CSS selectors to try, in order.
        default: Value returned when no selector produces text.

    Returns:
        The stripped element text, or ``default``.
    """
    for selector in selectors:
        try:
            text = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Best effort: any lookup failure just moves on to the next
            # selector. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        if text:
            return text
    return default

def get_attribute_safe(driver, selectors, attribute, default=''):
    """Return an attribute from the first selector that yields a truthy value.

    Args:
        driver: Object exposing ``find_element`` (the Selenium driver).
        selectors: CSS selectors to try, in order.
        attribute: Name of the attribute to read from the matched element.
        default: Value returned when no selector produces a value.

    Returns:
        The attribute value, or ``default``.
    """
    for selector in selectors:
        try:
            attr_value = driver.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
        except Exception:
            # Best effort: any lookup failure just moves on to the next
            # selector. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        if attr_value:
            return attr_value
    return default

# === Chrome setup ===
# Build the Chrome options from a flag list, then start the browser with a
# driver binary resolved by webdriver_manager.
chrome_options = Options()
for chrome_flag in (
    "--start-maximized",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--lang=en",  # Use English for consistent selectors
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# === Manual login ===
# Open Instagram and give the operator a fixed 45-second window to log in by
# hand in the browser window.
driver.get('https://www.instagram.com/')
print("üîë Silakan login ke akun Instagram Anda di browser yang muncul")
print("Anda memiliki 45 detik untuk login...")
time.sleep(45)

# === Target profile ===
username_target = 'batikkerisonline'
profile_url = f'https://www.instagram.com/{username_target}/'
driver.get(profile_url)

try:
    # The profile counts as loaded once a <main> element is present.
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
except TimeoutException:
    print("‚ùå Gagal memuat profil target")
    driver.quit()
    # exit() is the interactive-shell helper from the site module; raise
    # SystemExit directly and report the failure with a non-zero status.
    raise SystemExit(1) from None
else:
    print("‚úÖ Berhasil membuka profil target")

# Extra settle time before reading the profile.
time.sleep(5)

# === FULL PROFILE DATA COLLECTION ===
print("\nüîç Mengambil data detail profil...")

# -- Username & display name --
# Candidate CSS selectors for the display name, tried in order.
display_name_selectors = [
    "header section div h2",
    "header section h2",
    "h2.x1lliihq.x1plvlek.xryxfnj.x1n2onr6.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s688f.xi81zsa",
    "span.x1lliihq.x1plvlek.xryxfnj.x1n2onr6.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s688f.xi81zsa"
]

# The username is the target handle itself; the display name falls back to
# the handle when none of the selectors yields text.
username = username_target
display_name = get_element_text_safe(
    driver,
    display_name_selectors,
    default=username_target,
)

print(f"‚úÖ Username: {username}")
print(f"‚úÖ Nama Pengguna: {display_name}")

# -- Store category, bio, link --
# All three fields use 'N/A' as the "not found" placeholder. Passing it as the
# helper's default keeps the placeholder instead of overwriting it with ''.

# Category selectors
category_selectors = [
    "div._ap3a._aaco._aacu._aacy._aad6._aade",
    "div.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1heor9g.x1sur9pj.xkrqix3.x1lku1pv"
]

kategori_profil = get_element_text_safe(driver, category_selectors, 'N/A')
print(f"‚úÖ Kategori Profil: {kategori_profil}")

# Bio selectors - look for text containing common bio indicators
bio = 'N/A'
bio_selectors = [
    "div.-vDIg span",
    "div._ap3a._aaco._aacu._aacx._aad7._aade span",
    "header section div span"
]
bio_indicators = ('wa:', '@', 'shopee', 'tokopedia', 'instagram', 'follow')

for selector in bio_selectors:
    try:
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        for element in elements:
            text = element.text.strip()
            # Accept the first reasonably long text that mentions an indicator.
            if len(text) > 10 and any(indicator in text.lower() for indicator in bio_indicators):
                bio = text.replace('\n', ' ')
                break
    except Exception:
        # Best effort: try the next selector.
        continue
    if bio != 'N/A':
        break

print(f"‚úÖ Bio: {bio}")

# Link selectors
link_selectors = [
    "a[href*='l.instagram.com']",
    "header section div a[target='_blank']"
]

tautan = get_element_text_safe(driver, link_selectors, 'N/A')
print(f"‚úÖ Tautan: {tautan}")

# -- Statistics (posts, followers, following) --
total_posts, total_followers, total_following = 0, 0, 0

# Selectors for the <li> items of the profile statistics list.
stats_selectors = [
    "header section ul li",
    "header section div ul li",
    "ul._ac2a li"
]

# A count starts with a digit and may carry a k/m suffix (the text is
# lower-cased first). The same pattern is used for all three counters so an
# abbreviated value such as '12.5k' is not cut down to '12'.
stat_count_pattern = r'\d[\d.,]*[km]?'

for selector in stats_selectors:
    try:
        stats_elements = driver.find_elements(By.CSS_SELECTOR, selector)
        if len(stats_elements) < 3:
            continue

        # The first three items are read positionally: posts, followers,
        # following.
        for i, stat in enumerate(stats_elements[:3]):
            text = stat.text.lower().strip()
            print(f"Debug stat {i}: {text}")
            numbers = re.findall(stat_count_pattern, text)

            if i == 0:  # Posts
                if numbers:
                    total_posts = parse_number(numbers[0])
            elif i == 1:  # Followers
                # Prefer the title attribute, which carries the full number;
                # fall back to the visible (possibly abbreviated) text.
                try:
                    title_element = stat.find_element(By.CSS_SELECTOR, "span[title]")
                    title_value = title_element.get_attribute('title')
                except Exception:
                    title_value = None
                followers_from_title = parse_number(title_value) if title_value else 0
                if followers_from_title:
                    total_followers = followers_from_title
                elif numbers:
                    total_followers = parse_number(numbers[0])
            else:  # Following
                if numbers:
                    total_following = parse_number(numbers[0])
        break
    except Exception as e:
        # Report the failure, then try the next selector.
        print(f"‚ö†Ô∏è Gagal mengambil statistik profil: {e}")
        continue

print(f"üìä Statistik Profil - Posts: {total_posts}, Followers: {total_followers}, Following: {total_following}")

# === NAVIGATE TO THE REELS TAB ===
print("\nüé¨ Membuka tab Reels...")
try:
    # Candidate selectors for the Reels tab, tried in order.
    reels_selectors = [
        "a[href*='/reels/']",
        "div[role='tablist'] a[href*='/reels/']",
        "a[aria-label*='Reels']"
    ]

    reels_tab = None
    for selector in reels_selectors:
        try:
            reels_tab = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            break
        except Exception:
            # This selector did not become clickable in time; try the next.
            continue

    if reels_tab is None:
        raise RuntimeError("Tidak dapat menemukan tab Reels")

    reels_tab.click()
    time.sleep(5)
    print("‚úÖ Berhasil membuka tab Reels")

except Exception as e:
    print(f"‚ùå Gagal membuka tab Reels: {e}")
    driver.quit()
    # exit() is the interactive-shell helper from the site module; raise
    # SystemExit directly and report the failure with a non-zero status.
    raise SystemExit(1) from None

# === Scroll and collect reel links incrementally ===
print("\n‚è≥ Mulai scroll dan ambil reels...")
max_reels = 50  # Single source of truth for the collection target
scroll_times = 20  # Enough scroll passes to reach the target
reels_links = set()  # Use set to avoid duplicates
last_height = driver.execute_script("return document.body.scrollHeight")

# Candidate selectors for reel anchors; every href is also filtered on
# '/reel/' before it is kept.
reel_selectors = [
    'a[href*="/reel/"]',
    'a[href*="/p/"][href*="reel"]',
    'div._ac7v a[href*="/reel/"]'
]

for i in range(scroll_times):
    scroll_and_wait(driver, 3)

    for selector in reel_selectors:
        try:
            reels = driver.find_elements(By.CSS_SELECTOR, selector)
            for reel in reels:
                href = reel.get_attribute('href')
                if href and '/reel/' in href:
                    reels_links.add(href)
                if len(reels_links) >= max_reels:
                    break
        except Exception:
            # Best effort (e.g. an element went stale mid-scan): next selector.
            continue

        if len(reels_links) >= max_reels:
            break

    print(f"   Scroll ke-{i+1}: total reels sekarang {len(reels_links)}")

    # Stop once the target is reached or the page stops growing.
    if len(reels_links) >= max_reels:
        break

    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        print("   Telah mencapai bagian bawah halaman.")
        break
    last_height = new_height

# Convert to a list and cap at the same target used while collecting. The
# previous slice of 5 contradicted the 50-reel target.
reels_links = list(reels_links)[:max_reels]
print(f"\n‚úÖ Total reels ditemukan: {len(reels_links)}")
print(f"\n‚ÑπÔ∏è  Akan memproses {len(reels_links)} reels.")

# === OUTPUT DATA ===
# One row per (reel, comment); a reel without comments gets a single row with
# an empty comment.
all_data = []

# A count token must start with a digit and may carry a magnitude suffix.
# Anchoring on a digit stops a bare letter (the 't' of 'tayangan') from being
# picked up as the "number". Applied to lower-cased text.
count_token_re = re.compile(r'\d[\d.,]*(?:rb|jt|k|m)?\b')

# Maximum number of comments kept per reel.
max_comments_per_reel = 20

# === Process each reel ===
for idx, link in enumerate(reels_links):
    print(f"\n--- Memproses Reel {idx+1}/{len(reels_links)}: {link} ---")
    driver.get(link)

    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "article")))
    except TimeoutException:
        print("‚ùå Halaman reel tidak termuat, melewati reel ini.")
        continue

    time.sleep(5)

    # The post id is the path segment after '/reel/'. Matching it explicitly
    # also works without a trailing slash or with a query string.
    shortcode_match = re.search(r'/reel/([^/?#]+)', link)
    id_post = shortcode_match.group(1) if shortcode_match else ''
    media_type = 'reel'

    # === REEL DATA COLLECTION ===
    likes, views, media_url, upload_time = 0, 0, '', ''
    comments_count_total = 0

    # Caption - multiple selectors
    caption_selectors = [
        "article div h1",
        "div._a9zs h1",
        "article span._aacl._aaco._aacu._aacx._aad7._aade",
        "div.C4VMK span",
        "span[dir='auto']"
    ]

    caption = get_element_text_safe(driver, caption_selectors)
    print(f"üìù Caption: {caption[:100]}...")

    # Likes - first element whose text holds a positive number and does not
    # look like a view count.
    like_selectors = [
        "section._ae5m._ae5n._ae5o button span",
        "button[aria-label*='like'] span",
        "section button:first-child span",
        "article section button span",
        "span._ac2a",
        "button span[dir='auto']"
    ]

    for selector in like_selectors:
        try:
            for element in driver.find_elements(By.CSS_SELECTOR, selector):
                text = element.text.strip()
                # Text without any digit can never parse to a positive count.
                if not any(char.isdigit() for char in text):
                    continue
                if any(word in text.lower() for word in ('view', 'tayangan')):
                    continue
                likes = parse_number(text)
                if likes > 0:
                    break
        except Exception:
            # Best effort: try the next selector.
            continue
        if likes > 0:
            break

    # Views - scan every span for a view indicator and take its count token.
    try:
        for span in driver.find_elements(By.TAG_NAME, 'span'):
            text = span.text.strip().lower()
            if not any(indicator in text for indicator in ('view', 'tayangan', 'penayangan')):
                continue
            count_match = count_token_re.search(text)
            if count_match:
                views = parse_number(count_match.group())
                break
    except Exception as e:
        print(f"‚ö†Ô∏è Gagal mengambil views: {e}")

    # Comment count - look for a button mentioning comments with a number.
    try:
        for button in driver.find_elements(By.TAG_NAME, 'button'):
            button_text = button.text.strip().lower()
            if not any(indicator in button_text for indicator in ('comment', 'komentar')):
                continue
            count_match = count_token_re.search(button_text)
            if count_match:
                comments_count_total = parse_number(count_match.group())
                break
    except Exception as e:
        print(f"‚ö†Ô∏è Gagal mengambil comment count: {e}")

    # Media URL (video). These are CSS selectors: 'video source' is a
    # descendant selector, so By.TAG_NAME could never match it.
    for selector in ('video', 'video source'):
        try:
            media_url = driver.find_element(By.CSS_SELECTOR, selector).get_attribute('src') or ''
        except Exception:
            continue
        if media_url:
            break

    # Upload time: prefer the datetime attribute, then the title attribute.
    for selector in ('time', 'time[datetime]'):
        try:
            time_element = driver.find_element(By.CSS_SELECTOR, selector)
            upload_time = (
                time_element.get_attribute('datetime')
                or time_element.get_attribute('title')
                or ''
            )
        except Exception:
            continue
        if upload_time:
            break

    # Derived from the target handle instead of repeating it as a literal.
    content_category = f'{username_target} reel'

    print(f"üìä Reel Stats - Likes: {likes}, Views: {views}, Comments Count: {comments_count_total}")

    # === SEPARATE HASHTAGS FROM THE CAPTION ===
    clean_caption, hashtags = extract_hashtags_and_clean_caption(caption)
    print(f"üìù Caption bersih: {clean_caption[:50]}...")
    print(f"üè∑Ô∏è Hashtags: {hashtags}")

    # === Load comment details ===
    print("‚è≥ Memuat komentar...")
    comments_list = []

    try:
        # Scroll a few times so more comments get loaded.
        for scroll_attempt in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        # Multiple selectors for comments
        comment_selectors = [
            "div._a9zr span",
            "article div span._aacl._aaco._aacu._aacx._aad7._aade",
            "div.C4VMK span",
            "ul._a9z6 span._aacl._aaco._aacu._aacx._aad7._aade"
        ]

        for selector in comment_selectors:
            try:
                for element in driver.find_elements(By.CSS_SELECTOR, selector):
                    comment_text = element.text.strip()
                    # Keep distinct texts longer than three characters.
                    if len(comment_text) > 3 and comment_text not in comments_list:
                        comments_list.append(comment_text)
                        if len(comments_list) >= max_comments_per_reel:
                            break
            except Exception:
                # Best effort: try the next selector.
                continue
            if len(comments_list) >= max_comments_per_reel:
                break

    except Exception as e:
        print(f"‚ùå Gagal mengambil komentar: {e}")

    final_comments = comments_list[:max_comments_per_reel]
    comments_count_scraped = len(final_comments)
    print(f"‚úÖ Berhasil mengambil {comments_count_scraped} komentar dari total {comments_count_total}.")

    # --- Append rows to the main list ---
    data_profil = [username, display_name, kategori_profil, bio, tautan, total_posts, total_followers, total_following]
    reel_fields = [
        id_post, link, clean_caption, hashtags, likes, views, comments_count_total, comments_count_scraped,
        media_url, media_type, content_category, upload_time
    ]

    # A reel without comments still produces one row with an empty comment.
    for comment_text in final_comments or ['']:
        all_data.append(data_profil + reel_fields + [comment_text])

    time.sleep(3)  # Longer pause between reels

# === Save to Excel ===
# The browser is closed in ``finally`` so it does not leak when building or
# writing the workbook fails.
try:
    df = pd.DataFrame(all_data, columns=[
        'username', 'nama_pengguna', 'kategori_profil', 'bio', 'tautan',
        'total_posts', 'total_followers', 'total_following',
        'id_post', 'url_post', 'caption', 'hashtags', 'likes', 'views',
        'comments_count_total', 'comments_count_scraped',
        'media_url', 'media_type', 'content_category', 'upload_time', 'comment'
    ])

    output_filename = f"hasil_scrape_reels_{username_target}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
    df.to_excel(output_filename, index=False)
    print(f"\nüé¨ Selesai! Data reels disimpan ke '{output_filename}'")
    print(f"üìä Total data yang berhasil diambil: {len(df)} baris")

    # The sheet holds one row per comment, so per-reel metrics repeat on every
    # comment row. Collapse to one row per reel before aggregating; otherwise
    # each reel is counted once per scraped comment.
    per_reel = df.drop_duplicates(subset='url_post')
    average_likes = per_reel['likes'].mean() if not per_reel.empty else 0
    average_views = per_reel['views'].mean() if not per_reel.empty else 0

    # Print summary statistics
    print(f"\nüìà Ringkasan Data:")
    print(f"Total Reels: {len(reels_links)}")
    print(f"Total Likes: {per_reel['likes'].sum()}")
    print(f"Total Views: {per_reel['views'].sum()}")
    print(f"Total Comments: {per_reel['comments_count_total'].sum()}")
    print(f"Average Likes per Reel: {average_likes:.0f}")
    print(f"Average Views per Reel: {average_views:.0f}")
finally:
    driver.quit()

🔑 Please login to Instagram in the browser window
❌ Login failed or took too long
