In [None]:
"""
Job Automation Tool - Fetch and Match Jobs from Multiple Portals
Designed for Google Colab Environment
"""

import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import random
from urllib.parse import urlencode, quote
import re
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')

In [None]:
class JobAutomationTool:
    """Fetch job listings from several portals and rank them against a profile.

    The profile is a JSON document providing the keys ``Skills``,
    ``Desired Roles``, ``Experience Level`` (with ``Label`` and ``Years``) and
    ``Desired City``. Jobs that pass the match threshold are accumulated in
    ``self.jobs_data`` as dicts with the keys ``Job Title``, ``Company``,
    ``Location``, ``Match %``, ``Apply Link``, ``Date Posted`` and ``Source``.
    """

    # Companies that earn the flat 10-point bonus in calculate_match_score.
    REPUTED_COMPANIES = (
        'google', 'microsoft', 'amazon', 'apple', 'meta', 'netflix', 'tesla', 'nvidia',
    )

    def __init__(self, profile_json_path="profile_data.json"):
        """Load the profile at *profile_json_path* and start with no jobs."""
        self.profile_data = self.load_profile_data(profile_json_path)
        self.jobs_data = []
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]

    def load_profile_data(self, json_path):
        """Read the profile JSON and print a short summary of it.

        Returns the parsed dict, or ``None`` when the file cannot be read,
        is not valid JSON, or lacks one of the keys used in the summary.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Printing the summary doubles as validation: a missing key lands
            # in the except clause below and the profile is rejected.
            print("✅ Profile loaded successfully!")
            print(f"Skills: {', '.join(data['Skills'])}")
            print(f"Desired Roles: {', '.join(data['Desired Roles'])}")
            print(f"Experience: {data['Experience Level']['Label']} ({data['Experience Level']['Years']} years)")
            print(f"Location: {data['Desired City']}")
            return data
        except (OSError, ValueError, KeyError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"❌ Error loading profile data: {e}")
            return None

    def get_random_headers(self):
        """Return browser-like request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def calculate_match_score(self, job_title, job_description, company):
        """Return an integer match score in the range 0-100.

        The score is the sum of:
          * 40% of the best fuzzy partial ratio between the title and any
            desired role,
          * 50% of the percentage of profile skills found (case-insensitive
            substring match) in the title plus description,
          * a flat 10 points when the company name contains a reputed company.

        Returns 0 when no profile is loaded. ``None`` arguments are treated
        as empty strings.
        """
        if not self.profile_data:
            return 0

        job_title = job_title or ""
        job_description = job_description or ""
        company = company or ""

        # Job title matching (40% weight): best similarity over all roles.
        title_lower = job_title.lower()
        title_score = max(
            (fuzz.partial_ratio(role.lower(), title_lower)
             for role in self.profile_data['Desired Roles']),
            default=0,
        )
        score = title_score * 0.4

        # Skills matching (50% weight): share of profile skills mentioned.
        skills = self.profile_data['Skills']
        if skills:
            job_text = f"{job_title} {job_description}".lower()
            skills_found = sum(1 for skill in skills if skill.lower() in job_text)
            score += (skills_found / len(skills)) * 100 * 0.5

        # Company bonus (flat 10 points).
        company_lower = company.lower()
        if any(name in company_lower for name in self.REPUTED_COMPANIES):
            score += 10

        return min(int(score), 100)

    # ------------------------------------------------------------------
    # Private helpers shared by the portal scrapers
    # ------------------------------------------------------------------

    @staticmethod
    def _slug(text):
        """Lower-case *text* and join its whitespace-separated words with '-'."""
        return "-".join(text.lower().split())

    def _naukri_url(self):
        """Build the Naukri listing URL from the city and the first two roles."""
        city = self._slug(self.profile_data['Desired City'])
        roles = "-".join(self._slug(role) for role in self.profile_data['Desired Roles'][:2])
        return f"https://www.naukri.com/jobs-in-{city}-{roles}"

    def _indeed_url(self):
        """Build the Indeed search URL (first two roles, postings from the last day)."""
        # urlencode encodes spaces as '+'. Pre-joining with '+' and then
        # calling quote() would send a literal, escaped plus sign (%2B).
        params = urlencode({
            'q': " ".join(self.profile_data['Desired Roles'][:2]),
            'l': self.profile_data['Desired City'],
            'fromage': 1,
        })
        return f"https://in.indeed.com/jobs?{params}"

    def _linkedin_url(self):
        """Build the LinkedIn search URL from the first three skills."""
        params = urlencode({
            'keywords': " ".join(self.profile_data['Skills'][:3]),
            'location': self.profile_data['Desired City'],
        })
        return f"https://www.linkedin.com/jobs/search?{params}"

    def _fetch_soup(self, url, timeout=10, extra_headers=None):
        """GET *url* and return the parsed page, or ``None`` on a non-200 reply."""
        headers = self.get_random_headers()
        if extra_headers:
            headers.update(extra_headers)
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"⚠️  Request to {url} returned HTTP {response.status_code}; skipping")
            return None
        return BeautifulSoup(response.content, 'html.parser')

    def _collect_jobs(self, cards, parse_card, source, site_root, min_score):
        """Score every card and store those reaching *min_score*.

        *parse_card* maps a card to ``(title, company, location, link)`` or
        ``None`` when the card lacks the required elements. Cards that raise
        while being processed are counted and reported, not silently dropped.
        """
        skipped = 0
        for card in cards:
            try:
                parsed = parse_card(card)
                if parsed is None:
                    continue
                job_title, company, location, job_link = parsed

                # Site-relative links are made absolute.
                if job_link.startswith('/'):
                    job_link = site_root + job_link

                # No description is scraped from listing pages, so only the
                # title contributes to skill matching.
                match_score = self.calculate_match_score(job_title, "", company)
                if match_score >= min_score:
                    self.jobs_data.append({
                        'Job Title': job_title,
                        'Company': company,
                        'Location': location,
                        'Match %': f"{match_score}%",
                        'Apply Link': job_link,
                        # The scrape date is recorded; the portal's own
                        # posting date is not extracted.
                        'Date Posted': datetime.now().strftime('%d %B %Y'),
                        'Source': source,
                    })
            except Exception:
                # Best effort: one malformed card must not abort the portal.
                skipped += 1

        if skipped:
            print(f"⚠️  Skipped {skipped} unparseable listings from {source}")
        found = sum(1 for job in self.jobs_data if job['Source'] == source)
        print(f"✅ Found {found} jobs from {source}")

    def _parse_naukri_card(self, card):
        """Extract ``(title, company, location, link)`` from a Naukri card."""
        title_elem = card.find(['a', 'h2', 'h3'], class_=re.compile(r'title|job'))
        company_elem = card.find(['span', 'div'], class_=re.compile(r'company|org'))
        if title_elem is None or company_elem is None:
            return None
        location_elem = card.find(['span', 'div'], class_=re.compile(r'location|loc'))
        location = (location_elem.get_text(strip=True) if location_elem
                    else self.profile_data['Desired City'])
        return (
            title_elem.get_text(strip=True),
            company_elem.get_text(strip=True),
            location,
            title_elem.get('href', '#'),
        )

    def _parse_indeed_card(self, card):
        """Extract ``(title, company, location, link)`` from an Indeed card."""
        # Prefer the data-jk anchor and fall back to the anchor inside <h2>.
        # Written as statements because `a or b if c else d` parses as
        # `(a or b) if c else d`, which discarded the data-jk anchor on cards
        # without an <h2>.
        title_elem = card.find('a', {'data-jk': True})
        if title_elem is None:
            heading = card.find('h2')
            title_elem = heading.find('a') if heading is not None else None
        company_elem = card.find('span', class_=re.compile(r'companyName'))
        if title_elem is None or company_elem is None:
            return None
        location_elem = card.find('div', class_=re.compile(r'companyLocation'))
        location = (location_elem.get_text(strip=True) if location_elem
                    else self.profile_data['Desired City'])
        return (
            title_elem.get_text(strip=True),
            company_elem.get_text(strip=True),
            location,
            title_elem.get('href', '#'),
        )

    def _parse_linkedin_card(self, card):
        """Extract ``(title, company, location, link)`` from a LinkedIn card."""
        title_elem = card.find('a')
        if title_elem is None:
            title_elem = card.find('h3')
        if title_elem is None:
            return None
        return (
            title_elem.get_text(strip=True),
            "LinkedIn Job",  # Placeholder as LinkedIn blocks detailed scraping
            self.profile_data['Desired City'],
            title_elem.get('href', '#'),
        )

    # ------------------------------------------------------------------
    # Portal scrapers
    # ------------------------------------------------------------------

    def scrape_naukri_jobs(self, max_jobs=20):
        """Scrape up to *max_jobs* cards from Naukri.com, keeping matches of at least 30%."""
        print("🔍 Scraping Naukri.com...")

        try:
            soup = self._fetch_soup(self._naukri_url(), timeout=10)
            if soup is None:
                return
            job_cards = soup.find_all(['div', 'article'], class_=re.compile(r'job|result'))[:max_jobs]
            self._collect_jobs(
                job_cards, self._parse_naukri_card,
                source='Naukri.com', site_root='https://www.naukri.com', min_score=30,
            )
        except Exception as e:
            print(f"❌ Error scraping Naukri: {e}")

    def scrape_indeed_jobs(self, max_jobs=20):
        """Scrape up to *max_jobs* cards from Indeed, keeping matches of at least 30%."""
        print("🔍 Scraping Indeed...")

        try:
            soup = self._fetch_soup(self._indeed_url(), timeout=10)
            if soup is None:
                return
            job_cards = soup.find_all(
                'div', class_=re.compile(r'job_seen_beacon|result|slider_container')
            )[:max_jobs]
            self._collect_jobs(
                job_cards, self._parse_indeed_card,
                source='Indeed', site_root='https://in.indeed.com', min_score=30,
            )
        except Exception as e:
            print(f"❌ Error scraping Indeed: {e}")

    def scrape_linkedin_jobs(self, max_jobs=15):
        """Scrape up to *max_jobs* cards from LinkedIn, keeping matches of at least 25%.

        Only the title and link are read; the company is a fixed placeholder
        and the location is the profile's desired city.
        """
        print("🔍 Scraping LinkedIn Jobs...")

        try:
            soup = self._fetch_soup(
                self._linkedin_url(),
                timeout=15,
                extra_headers={'Accept': 'text/html,application/xhtml+xml'},
            )
            if soup is None:
                return
            job_cards = soup.find_all('div', class_=re.compile(r'job|result'))[:max_jobs]
            self._collect_jobs(
                job_cards, self._parse_linkedin_card,
                source='LinkedIn', site_root='https://www.linkedin.com', min_score=25,
            )
        except Exception as e:
            print(f"❌ LinkedIn scraping limited due to restrictions: {e}")

    def add_sample_jobs(self):
        """Append four hard-coded demonstration jobs (source ``Sample Data``)."""
        print("🔍 Adding sample relevant jobs...")

        posted_on = datetime.now().strftime('%d %B %Y')
        samples = (
            ('Junior Data Analyst', 'TechCorp India', '92%', 'https://example.com/job1'),
            ('Data Scientist - Entry Level', 'Analytics Pro', '88%', 'https://example.com/job2'),
            ('Risk Analyst - Fresh Graduate', 'FinTech Solutions', '85%', 'https://example.com/job3'),
            ('Business Intelligence Analyst', 'Data Insights Ltd', '78%', 'https://example.com/job4'),
        )
        sample_jobs = [
            {
                'Job Title': title,
                'Company': company,
                'Location': 'Pune',
                'Match %': match,
                'Apply Link': link,
                'Date Posted': posted_on,
                'Source': 'Sample Data',
            }
            for title, company, match, link in samples
        ]

        self.jobs_data.extend(sample_jobs)
        print(f"✅ Added {len(sample_jobs)} sample jobs")

    @staticmethod
    def _match_value(job):
        """Return a job's ``Match %`` field (e.g. ``'85%'``) as an int."""
        return int(job['Match %'].replace('%', ''))

    def run_job_search(self):
        """Run all scrapers, add sample jobs, de-duplicate, sort and save to CSV.

        Does nothing except print an error when no profile is loaded.
        """
        print("🚀 Starting Job Automation Tool...")
        print("=" * 50)

        if not self.profile_data:
            print("❌ No profile data loaded. Please check your JSON file.")
            return

        # Clear previous data
        self.jobs_data = []

        scrapers = (
            self.scrape_naukri_jobs,
            self.scrape_indeed_jobs,
            self.scrape_linkedin_jobs,
        )
        for position, scraper in enumerate(scrapers):
            if position:
                time.sleep(2)  # Rate limiting between portals
            # Guard each portal separately so a failure in one does not
            # prevent the remaining portals from being scraped.
            try:
                scraper()
            except Exception as e:
                print(f"⚠️  Some scraping errors occurred: {e}")

        # Add sample jobs for demonstration
        self.add_sample_jobs()

        # Remove duplicates and sort by match score
        self.remove_duplicates()
        self.sort_jobs_by_match()

        # Save to CSV
        self.save_to_csv()

        print("\n" + "=" * 50)
        print("✅ Job search completed successfully!")
        print(f"📊 Total jobs found: {len(self.jobs_data)}")

        if self.jobs_data:
            avg_match = sum(self._match_value(job) for job in self.jobs_data) / len(self.jobs_data)
            print(f"📈 Average match score: {avg_match:.1f}%")

    def remove_duplicates(self):
        """Keep the first job for each case-insensitive (title, company) pair."""
        seen = set()
        unique_jobs = []

        for job in self.jobs_data:
            identifier = (job['Job Title'].lower(), job['Company'].lower())
            if identifier in seen:
                continue
            seen.add(identifier)
            unique_jobs.append(job)

        self.jobs_data = unique_jobs
        print(f"🔄 Removed duplicates. Unique jobs: {len(self.jobs_data)}")

    def sort_jobs_by_match(self):
        """Sort ``self.jobs_data`` in place by match percentage, highest first."""
        self.jobs_data.sort(key=self._match_value, reverse=True)

    def save_to_csv(self, filename=None):
        """Write the jobs to a CSV file and print the first five rows.

        When *filename* is omitted, a timestamped ``matched_jobs_*.csv`` name
        is used. Returns the filename, or ``None`` if writing failed.
        """
        if not filename:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"matched_jobs_{timestamp}.csv"

        try:
            df = pd.DataFrame(self.jobs_data)
            df.to_csv(filename, index=False)
            print(f"💾 Jobs saved to: {filename}")

            # Display top 5 jobs (rows are in whatever order jobs_data is in).
            if len(df) > 0:
                print("\n🏆 Top 5 Matched Jobs:")
                print("-" * 80)
                for rank, (_, row) in enumerate(df.head().iterrows(), start=1):
                    print(f"{rank}. {row['Job Title']} at {row['Company']} - {row['Match %']} match")
                    print(f"   📍 {row['Location']} | 🌐 {row['Source']}")
                    print(f"   🔗 {row['Apply Link']}")
                    print("-" * 80)

            return filename

        except Exception as e:
            print(f"❌ Error saving to CSV: {e}")
            return None

    def display_stats(self):
        """Print the total, the average match score and per-source/company counts."""
        if not self.jobs_data:
            print("No jobs data available.")
            return

        df = pd.DataFrame(self.jobs_data)
        average = sum(self._match_value(job) for job in self.jobs_data) / len(self.jobs_data)

        print("\n📊 Job Search Statistics:")
        print("=" * 40)
        print(f"Total Jobs Found: {len(df)}")
        print(f"Average Match Score: {average:.1f}%")
        print("Jobs by Source:")
        print(df['Source'].value_counts().to_string())
        print("\nTop Companies:")
        print(df['Company'].value_counts().head().to_string())


In [None]:

def main():
    """Install the required packages, then run the job search and print stats.

    The install step is aimed at Google Colab. NOTE: the module-level imports
    at the top of this file execute before this function is called, so on a
    machine where those packages are missing the import fails first; the
    install step only helps later runs in the same environment.
    """
    import subprocess
    import sys

    # Install required packages (for Google Colab)
    print("🔧 Installing required packages...")
    packages = ['beautifulsoup4', 'requests', 'pandas', 'fuzzywuzzy', 'python-levenshtein']
    failed = []
    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
        except (subprocess.CalledProcessError, OSError):
            # Only installation failures are tolerated; a bare `except` would
            # also swallow KeyboardInterrupt and SystemExit.
            print(f"⚠️  Could not install {package}")
            failed.append(package)

    # Report success only when every install actually succeeded.
    if failed:
        print(f"⚠️  Packages not installed: {', '.join(failed)}")
    else:
        print("✅ Packages installed!")
    print("\n" + "=" * 60)
    print("🎯 JOB AUTOMATION TOOL")
    print("=" * 60)

    # Initialize and run the tool
    try:
        tool = JobAutomationTool("profile_data.json")
        if not tool.profile_data:
            # Without a profile nothing can be searched; stop here instead of
            # printing a misleading success banner.
            print("❌ No profile data loaded. Please check your JSON file.")
            return

        tool.run_job_search()
        tool.display_stats()

        print("\n🎉 Job automation completed successfully!")
        print("📁 Check your files for the generated CSV with job listings.")

    except Exception as e:
        print(f"❌ Error running job automation: {e}")
        print("Please ensure your profile_data.json file is properly formatted.")

# Run the tool only when this file is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

# =============================================================================
# CONFIGURATION SECTION - Edit these values as needed
# =============================================================================

def update_profile_config(path='profile_data.json'):
    """Write the default profile configuration to *path* as JSON.

    Any existing file at *path* is overwritten. Edit the ``config`` dict
    below to change the skills, roles, experience level or city.

    Args:
        path: Destination file; defaults to ``profile_data.json`` in the
            current working directory.
    """
    config = {
        "Skills": [
            "Data Analysis",
            "Python",
            "SQL",
            "Machine Learning",
            "Power BI",
            "Excel",
            "Statistics",
            "Tableau",
        ],
        "Desired Roles": [
            "Data Analyst",
            "Junior Data Scientist",
            "Risk Analyst",
            "Business Analyst",
            "Research Analyst",
        ],
        "Experience Level": {
            "Label": "Fresher",
            "Years": 0,
        },
        "Desired City": "Pune",
    }

    # Explicit encoding keeps the file identical across platforms.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=4)

    print("✅ Profile configuration updated!")

# Uncomment the line below to update your profile configuration
# update_profile_config()
