In [None]:
import time
import re
import json
import csv
import os
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


class InstagramMonitor:
    """Poll an Instagram profile with headless Chrome and record its posts.

    The monitor logs in with the supplied account, collects the post/reel
    links shown on the target profile page, scrapes each post's caption,
    timestamp and first image URL, and appends the results to
    ``scraped_data/<target>_posts.json`` and ``.csv``. The IDs seen on the
    previous check are kept in ``self.state_file`` so that later checks only
    scrape posts that were not seen before.
    """

    # Captures the kind ('p' or 'reel') and the short code of a post URL.
    _POST_URL_RE = re.compile(r'/(p|reel)/([^/]+)/')

    def __init__(self, username, password):
        """Store the credentials and prepare the Chrome options.

        Args:
            username: Instagram account name used to log in.
            password: Password of that account.
        """
        self.username = username
        self.password = password
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument('--disable-gpu')
        # Chrome expects "width,height"; the "1920x1080" form is not parsed.
        self.options.add_argument('--window-size=1920,1080')

        self.driver = None
        self.state_file = "monitor_state.json"

    def initialize_driver(self):
        """Initialize and return a new webdriver instance"""
        self.driver = webdriver.Chrome(options=self.options)
        return self.driver

    def _click_if_present(self, condition, not_found_message, timeout=5):
        """Click the element matched by `condition`, or print a note if it never shows up"""
        try:
            WebDriverWait(self.driver, timeout).until(condition).click()
            time.sleep(2)
        except Exception:
            # Best effort: these dialogs are optional. `Exception` (not a bare
            # except) so that Ctrl-C still interrupts the wait.
            print(not_found_message)

    def login(self):
        """Log in to Instagram.

        Returns:
            True when the login form was submitted without an error, False
            when any step of filling/submitting the form raised.
        """
        self.driver.get("https://www.instagram.com/")
        time.sleep(5)

        # Accept cookies if the dialog appears
        self._click_if_present(
            EC.presence_of_element_located(
                (By.XPATH, "//button[contains(text(), 'Accept') or contains(text(), 'Allow')]")
            ),
            "No cookie dialog found or already accepted",
        )

        not_now_xpath = "//button[contains(text(), 'Not Now') or contains(text(), 'Not now')]"

        try:
            # Enter username
            username_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.NAME, "username"))
            )
            username_input.send_keys(self.username)

            # Enter password
            password_input = self.driver.find_element(By.NAME, "password")
            password_input.send_keys(self.password)

            # Click login
            login_button = self.driver.find_element(By.XPATH, "//button[@type='submit']")
            login_button.click()
            time.sleep(5)

            # Handle "Save Login Info" dialog
            self._click_if_present(
                EC.element_to_be_clickable((By.XPATH, not_now_xpath)),
                "No 'Save Login Info' dialog found",
            )

            # Handle notifications dialog
            self._click_if_present(
                EC.element_to_be_clickable((By.XPATH, not_now_xpath)),
                "No notifications dialog found",
            )

            # NOTE(review): success is only inferred from the absence of an
            # exception; nothing here checks that the credentials were accepted.
            print("Successfully logged in")
            return True

        except Exception as e:
            print(f"Login failed: {str(e)}")
            return False

    def _read_state_file(self):
        """Return the whole state mapping; {} when the file is missing, unreadable or malformed"""
        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                state = json.load(f)
        except FileNotFoundError:
            return {}
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and decoding errors.
            print(f"Error loading state: {str(e)}")
            return {}
        return state if isinstance(state, dict) else {}

    def load_state(self, target_username):
        """Load the previous state to avoid duplicate scraping.

        Returns:
            A dict that always has the keys ``last_post_ids`` (list) and
            ``last_check`` (ISO timestamp string or None), filled with the
            values stored for `target_username` when there are any.
        """
        state = {"last_post_ids": [], "last_check": None}
        saved = self._read_state_file().get(target_username)
        if isinstance(saved, dict):
            state.update(saved)
        return state

    def save_state(self, target_username, state_data):
        """Save the current state to track what's been scraped.

        A corrupt or unreadable state file is replaced rather than blocking
        every future save. Failures to write are reported, not raised.
        """
        state = self._read_state_file()
        state[target_username] = state_data

        try:
            # Serialize first so a failure cannot leave a half-written file.
            payload = json.dumps(state, indent=4)
            with open(self.state_file, "w", encoding="utf-8") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving state: {str(e)}")

    def get_latest_post_urls(self, target_username, max_posts=20):
        """Get the URLs of the latest posts.

        Args:
            target_username: Profile whose page is opened.
            max_posts: Maximum number of URLs to return.

        Returns:
            De-duplicated post and reel URLs in the order they appear on the
            page (presumably newest first - verify against the live layout),
            truncated to `max_posts`.
        """
        self.driver.get(f"https://www.instagram.com/{target_username}/")
        time.sleep(5)

        # Scroll down a bit to load more posts
        for _ in range(3):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
            time.sleep(2)

        # One query for posts and reels so the result stays in document order.
        link_elements = self.driver.find_elements(
            By.XPATH, "//a[contains(@href, '/p/') or contains(@href, '/reel/')]"
        )
        hrefs = [element.get_attribute('href') for element in link_elements]

        # dict.fromkeys removes duplicates while keeping first-seen order; a
        # set would shuffle the links and make the max_posts cut arbitrary.
        unique_urls = dict.fromkeys(href for href in hrefs if href)
        return list(unique_urls)[:max_posts]

    def _extract_caption(self):
        """Return the caption of the currently loaded post page, or 'No caption found'"""
        # Try different selectors as Instagram's HTML structure changes frequently
        caption_selectors = [
            "//div[contains(@class, '_a9zs')]/span",  # Common caption container
            "//h1",  # Sometimes captions are in h1
            "//div[contains(@class, '_a9zs')]",  # Full caption container
            "//article//span[contains(text(), '')]"  # Any text in the article
        ]

        caption = ""
        for selector in caption_selectors:
            try:
                caption_elements = self.driver.find_elements(By.XPATH, selector)
                if caption_elements:
                    caption = caption_elements[0].text
                    if caption:
                        break
            except Exception:
                continue

        # Check for empty caption (might be just emojis)
        if not caption:
            # Try getting emojis or any content with a broader selector
            try:
                emoji_elements = self.driver.find_elements(By.XPATH, "//div[contains(@class, '_a9zs')]/*")
                for el in emoji_elements:
                    caption += el.get_attribute('innerText') or el.get_attribute('textContent') or ""
            except Exception:
                # Deliberate best effort: keep whatever text was collected.
                pass

        return caption or "No caption found"

    def _extract_timestamp(self):
        """Return the first <time> element's datetime attribute, falling back to the current time"""
        try:
            time_element = self.driver.find_element(By.XPATH, "//time")
            return time_element.get_attribute("datetime")
        except Exception:
            return datetime.now().isoformat()

    def _extract_image_url(self):
        """Return the src of the first image inside the article, or 'Image not found'"""
        try:
            img_element = self.driver.find_element(By.XPATH, "//article//img")
            return img_element.get_attribute("src")
        except Exception:
            return "Image not found"

    def get_post_details(self, post_url):
        """Get details for a specific post.

        Args:
            post_url: URL containing ``/p/<id>/`` or ``/reel/<id>/``.

        Returns:
            None when the URL has no recognizable post ID. Otherwise a dict
            with ``post_id``, ``type``, ``url`` and ``scraped_at`` plus either
            ``caption``/``timestamp``/``image_url`` or, when scraping raised,
            an ``error`` message.
        """
        # Extract post ID from URL
        match = self._POST_URL_RE.search(post_url)
        if not match:
            return None

        post_type = match.group(1)  # 'p' for post, 'reel' for reel
        post_id = match.group(2)

        # Visit the post page
        self.driver.get(post_url)
        time.sleep(3)

        try:
            return {
                "post_id": post_id,
                "type": post_type,
                "url": post_url,
                "caption": self._extract_caption(),
                "timestamp": self._extract_timestamp(),
                "image_url": self._extract_image_url(),
                "scraped_at": datetime.now().isoformat()
            }

        except Exception as e:
            print(f"Error getting post details for {post_url}: {str(e)}")
            return {
                "post_id": post_id,
                "type": post_type,
                "url": post_url,
                "error": str(e),
                "scraped_at": datetime.now().isoformat()
            }

    def posts_generator(self, target_username, max_posts=20, only_new=True):
        """Generator function to yield posts one by one.

        Args:
            target_username: Profile to check.
            max_posts: Maximum number of profile links to consider.
            only_new: When True, skip posts whose ID was recorded by the
                previous check; when False, scrape every collected post.

        Yields:
            The dict returned by `get_post_details` for each post to scrape.
        """
        # Get current state to know what posts we've already scraped
        state = self.load_state(target_username)
        last_post_ids = set(state["last_post_ids"])

        # Keep each URL paired with its own ID. Building two separate lists
        # and zipping them misaligns every pair after a URL that has no ID.
        current_posts = []
        collected_ids = set()
        for url in self.get_latest_post_urls(target_username, max_posts):
            match = self._POST_URL_RE.search(url)
            if not match:
                continue
            post_id = match.group(2)
            if post_id in collected_ids:
                # Same post linked through a second URL variant.
                continue
            collected_ids.add(post_id)
            current_posts.append((url, post_id))

        current_post_ids = [post_id for _, post_id in current_posts]

        # Determine which posts to scrape
        if only_new:
            # Only scrape posts we haven't seen before
            posts_to_scrape = [(url, post_id) for url, post_id in current_posts
                               if post_id not in last_post_ids]
        else:
            # Scrape all posts regardless of whether we've seen them
            posts_to_scrape = current_posts

        if posts_to_scrape:
            print(f"Found {len(posts_to_scrape)} {'new ' if only_new else ''}posts to scrape")

            # Update state with current posts for next run
            state["last_post_ids"] = current_post_ids
            state["last_check"] = datetime.now().isoformat()
            self.save_state(target_username, state)

            # Yield each post's details
            for post_url, _post_id in posts_to_scrape:
                post_details = self.get_post_details(post_url)
                if post_details:
                    yield post_details
                time.sleep(2)  # Be gentle with the server
        else:
            print(f"No {'new ' if only_new else ''}posts found for {target_username}")

            # Still update the last check time
            state["last_check"] = datetime.now().isoformat()
            self.save_state(target_username, state)

    def save_post_data(self, target_username, post):
        """Save the post data to files.

        Appends `post` to ``scraped_data/<target_username>_posts.json`` (a
        JSON list that is rewritten in full) and adds one row to the matching
        CSV file, writing the header when the CSV is first created.
        """
        # Ensure directory exists
        os.makedirs("scraped_data", exist_ok=True)

        # Append to JSON file
        json_file = f"scraped_data/{target_username}_posts.json"

        posts = []
        if os.path.exists(json_file):
            with open(json_file, "r", encoding="utf-8") as f:
                try:
                    loaded = json.load(f)
                except ValueError:
                    # Unreadable history: start a fresh list.
                    loaded = []
            if isinstance(loaded, list):
                posts = loaded

        posts.append(post)

        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(posts, f, ensure_ascii=False, indent=4)

        # Append to CSV file
        csv_file = f"scraped_data/{target_username}_posts.csv"
        file_exists = os.path.exists(csv_file)

        with open(csv_file, "a", newline="", encoding="utf-8") as csvfile:
            fieldnames = ["post_id", "type", "url", "caption", "timestamp", "image_url", "scraped_at"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            if not file_exists:
                writer.writeheader()

            # Filter to include only the fields defined in fieldnames
            filtered_post = {key: post.get(key, "") for key in fieldnames}
            writer.writerow(filtered_post)

    @staticmethod
    def _caption_preview(post, limit=50):
        """Return at most `limit` caption characters, with '...' appended when truncated"""
        # Error records from get_post_details carry no caption at all.
        caption = post.get("caption") or ""
        return f"{caption[:limit]}{'...' if len(caption) > limit else ''}"

    def run_initial_scrape(self, target_username, max_posts=20):
        """Run an initial scrape to establish a baseline"""
        print(f"Running initial scrape for {target_username}...")

        post_count = 0
        for post in self.posts_generator(target_username, max_posts, only_new=False):
            print(f"Scraped post: {post['post_id']}")
            print(f"  Caption: {self._caption_preview(post)}")
            self.save_post_data(target_username, post)
            post_count += 1

        print(f"Initial scrape complete. Scraped {post_count} posts.")

    def monitor_profile(self, target_username, interval_minutes=5, max_posts=20, run_initial=True):
        """Continuously monitor a profile for new posts.

        Args:
            target_username: Profile to watch.
            interval_minutes: Minutes between the start of two checks.
            max_posts: Maximum number of posts examined on each check.
            run_initial: When True, scrape every current post once before
                entering the polling loop.

        Runs until interrupted with Ctrl-C or until an error escapes a
        check; the browser is always closed on the way out.
        """
        try:
            self.initialize_driver()

            if not self.login():
                # The finally clause below closes the browser.
                print("Failed to login. Aborting monitoring.")
                return

            print(f"Starting continuous monitoring of {target_username}")
            print(f"Will check for new posts every {interval_minutes} minutes")

            # Run initial scrape if requested
            if run_initial:
                self.run_initial_scrape(target_username, max_posts)

            try:
                while True:
                    check_started = time.time()

                    print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new posts...")

                    post_count = 0
                    for post in self.posts_generator(target_username, max_posts, only_new=True):
                        print(f"New post found: {post['post_id']}")
                        print(f"  Caption: {self._caption_preview(post)}")
                        self.save_post_data(target_username, post)
                        post_count += 1

                    # Schedule relative to the start of this check so the time
                    # spent scraping does not stretch the interval, and never
                    # announce a time that has already passed.
                    next_check_time = max(check_started + interval_minutes * 60, time.time())
                    next_check_str = datetime.fromtimestamp(next_check_time).strftime('%Y-%m-%d %H:%M:%S')

                    print(f"Found {post_count} new posts")
                    print(f"Next check scheduled for: {next_check_str}")

                    time.sleep(max(0.0, next_check_time - time.time()))

            except KeyboardInterrupt:
                print("\nMonitoring stopped by user")

        except Exception as e:
            print(f"Error in monitoring: {str(e)}")

        finally:
            if self.driver:
                self.driver.quit()
                # Drop the handle so a closed driver is never reused or quit twice.
                self.driver = None


if __name__ == "__main__":
    # Instagram credentials and the profile to monitor are read from the
    # environment so that secrets never have to be written into this file.
    INSTAGRAM_USERNAME = os.environ.get("INSTAGRAM_USERNAME", "")
    INSTAGRAM_PASSWORD = os.environ.get("INSTAGRAM_PASSWORD", "")

    # Target profile to monitor
    TARGET_USERNAME = os.environ.get("INSTAGRAM_TARGET", "")

    # Fail early with a clear message instead of launching a browser and
    # submitting an empty login form.
    missing = [
        name
        for name, value in (
            ("INSTAGRAM_USERNAME", INSTAGRAM_USERNAME),
            ("INSTAGRAM_PASSWORD", INSTAGRAM_PASSWORD),
            ("INSTAGRAM_TARGET", TARGET_USERNAME),
        )
        if not value
    ]
    if missing:
        raise SystemExit(f"Missing required environment variables: {', '.join(missing)}")

    # Create the monitor
    monitor = InstagramMonitor(INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD)

    # Start monitoring
    # Adjust these parameters as needed:
    # - interval_minutes: How often to check for new posts (default: 5 minutes)
    # - max_posts: Maximum number of posts to check each time (default: 20)
    # - run_initial: Whether to do an initial scrape on startup (default: True)
    monitor.monitor_profile(TARGET_USERNAME, interval_minutes=3, max_posts=10, run_initial=True)

No cookie dialog found or already accepted
No 'Save Login Info' dialog found
No notifications dialog found
Successfully logged in
Starting continuous monitoring of insta_project_69
Will check for new posts every 3 minutes
Running initial scrape for insta_project_69...
Found 5 posts to scrape
Scraped post: DIR1lUoq4xh
  Caption: Finally dekhte hai work hota hai ki nhi 💀🎀💦📈😭😂😭🎉🤷😡...
Scraped post: DIRyc6tKDRi
  Caption: Usee kisi aur sath dekhaa tab pataa challa... Moha...
Scraped post: DIRySicqAXk
  Caption: Abeyy yrrr valorant ko thoda pehle karne nhi huaaa...
Scraped post: DIRyLygKnEv
  Caption: I found a girlllll , beautiful and sweetttt aage k...
Scraped post: DIRy0SYK4bo
  Caption: Abbb pataa chalegaaa project asli mein kaam karta ...
Initial scrape complete. Scraped 5 posts.

[2025-04-11 02:01:53] Checking for new posts...
No new posts found for insta_project_69
Found 0 new posts
Next check scheduled for: 2025-04-11 02:04:53
