In [45]:
import os
import re
import time
from collections import Counter
from urllib.parse import urljoin, urlparse

import google.generativeai as genai
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [13]:
# Load the list of conferences to process. The path is relative, so the CSV must be in
# the kernel's current working directory.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, presumably a row index
# saved by an earlier to_csv call -- consider index_col=0 here; confirm before changing.
df = pd.read_csv("Links_for_all_conferences.csv")
df

Unnamed: 0,Title,Link,Type
0,2025 PCI SSC North America Community Meeting,https://events.pcisecuritystandards.org/#,CyberSecurity
1,2025 PCI SSC Europe Community Meeting,https://events.pcisecuritystandards.org/#,CyberSecurity
2,2025 PCI SSC Asia-Pacific Community Meeting,https://events.pcisecuritystandards.org/#,CyberSecurity
3,RSA Conference 2025,https://www.rsaconference.com/usa,CyberSecurity
4,Innovate Cybersecurity Summit,https://innovatecybersecuritysummit.com/,CyberSecurity
...,...,...,...
232,Virginia HIMSS Annual Fall Conference,https://www.vahimss.org,Health Informatics
233,GNUHealth Con,https://www.gnuhealthcon.org,Health Informatics
234,AIMed Global Summit,https://aimed.events,Health Informatics
235,International Conference on Nursing and Health...,https://www.nursingconference.com,Health Informatics


In [14]:
# `conferences` is a second name for the same DataFrame object as `df` (no copy is made),
# so a change made through either name is visible through both.
conferences = df

In [15]:
# Cast the Title column to str (bracket indexing makes the column assignment explicit).
conferences["Title"] = conferences["Title"].astype(str)

In [16]:
# Let the scraping begin: look up each conference with Google Custom Search and extract keywords with Gemini.

In [None]:
# API keys are read from environment variables so real credentials never have to be
# typed into (and saved with) the notebook. Set GEMINI_API_KEY, GOOGLE_SEARCH_API_KEY
# and SEARCH_ENGINE_ID before starting the kernel; each falls back to "" when unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GOOGLE_SEARCH_API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY", "")
SEARCH_ENGINE_ID = os.environ.get("SEARCH_ENGINE_ID", "")

In [34]:
def google_search(query, api_key, search_engine_id, num_results=3, timeout=10):
    """Query the Google Custom Search JSON API and return the decoded response.

    Args:
        query: Search string, sent as the ``q`` parameter.
        api_key: API key, sent as the ``key`` parameter.
        search_engine_id: Search engine ID, sent as the ``cx`` parameter.
        num_results: Number of results to request. Clamped to 1-10, the range
            the API accepts for ``num``; ``None`` omits the parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed JSON response, or ``None`` if anything went wrong (network
        failure, timeout, HTTP error status, undecodable body). The error is
        printed rather than raised.
    """
    url = "https://www.googleapis.com/customsearch/v1"

    try:
        # Out-of-range values would make the API reject the whole request,
        # so clamp them instead.
        if num_results is not None:
            num_results = max(1, min(int(num_results), 10))

        params = {
            'key': api_key,
            'cx': search_engine_id,
            'q': query,
            'num': num_results
        }

        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Deliberately best-effort: report the problem and signal "no results".
        print(f"Error searching for {query}: {e}")
        return None

In [50]:
# Filler words ignored by the offline fallback extractor below.
_FALLBACK_STOPWORDS = frozenset(
    "a an and are as at be by for from has have in is it its of on or that the "
    "this to was were will with you your our we can more all new about".split()
)


def _extract_keywords_simple(text, max_keywords=10):
    """Return the most frequent non-stopword words in ``text`` (no API call)."""
    words = re.findall(r"[a-z]{3,}", text.lower())
    counts = Counter(w for w in words if w not in _FALLBACK_STOPWORDS)
    return [word for word, _ in counts.most_common(max_keywords)]


def extract_keywords_with_gemini(text, model_name='gemini-2.5-pro-preview-05-06'):
    """Extract up to 10 lowercase keywords from ``text`` using the Gemini API.

    Uses the notebook-level ``genai`` import and ``GEMINI_API_KEY`` constant.

    Args:
        text: Free text to summarize; only the first 2000 characters are sent.
        model_name: Gemini model identifier passed to ``GenerativeModel``.

    Returns:
        A list of at most 10 distinct, lowercased keywords. Blank input returns
        ``[]`` without calling the API. If the API call fails for any reason the
        error is printed and a word-frequency fallback is returned instead.
    """
    # Nothing to analyze: skip the API round-trip entirely.
    if not text or not text.strip():
        return []

    # Limit text length for API efficiency. This is done outside the f-string so
    # the explanatory comment is not sent to the model as part of the prompt.
    snippet = text[:2000]

    prompt = f"""
Analyze the following text and extract the 10 most relevant keywords and topics.

Focus on:
- Technical or specialized subject areas discussed
- Industry sectors or application domains referenced
- Major themes, challenges, or areas of focus
- Important methods, tools, or approaches mentioned

Return only a concise, comma-separated list of 5-10 highly relevant keywords or topics that best summarize the content. Do not include explanations.

Text to analyze:
{snippet}
"""

    try:
        # Configuration and model construction are inside the try so that a bad
        # key or model name also degrades to the fallback instead of crashing.
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(model_name)

        response = model.generate_content(prompt)
        keywords_text = response.text.strip()

        # The prompt asks for a comma-separated list, but tolerate answers that
        # come back one-per-line or as a bulleted list.
        keywords = []
        for piece in re.split(r"[,\n;]+", keywords_text):
            cleaned = piece.strip().strip("-*•\"'` ").lower()
            if cleaned and cleaned not in keywords:
                keywords.append(cleaned)
        return keywords[:10]  # Limit to 10 keywords max

    except Exception as e:
        print(f"Error with Gemini API: {e}")
        # Fallback to simple keyword extraction
        return _extract_keywords_simple(text)

In [51]:
def _keywords_from_search(query):
    """Run one Google search for ``query`` and turn the result text into keywords."""
    search_results = google_search(query, GOOGLE_SEARCH_API_KEY, SEARCH_ENGINE_ID)
    if not search_results or 'items' not in search_results:
        return []

    # Combine the title and snippet of every hit into one text blob.
    all_text = " ".join(
        f"{item.get('title', '')} {item.get('snippet', '')}"
        for item in search_results['items']
    )
    if not all_text.strip():
        return []

    # Extract keywords from the text using Gemini
    return extract_keywords_with_gemini(all_text)


def analyze_conference_keywords(df, delay=1):
    """Look up every conference in ``df`` and build a keyword string for each.

    For each row, in order:
      1. search for "<Title> conference agenda topics" and extract keywords;
      2. if that yields nothing, search again with the title alone;
      3. if that also yields nothing, use the row's ``Type`` (lowercased) when
         the column exists and holds a non-empty string.

    Args:
        df: DataFrame with a ``Title`` column and, optionally, a ``Type`` column.
        delay: Seconds to pause between rows to stay within API quotas.

    Returns:
        A list with one entry per row of ``df``, in row order: the keywords
        joined with ", ", or ``'No keywords found'`` when every step came up empty.
    """
    keywords_list = []
    total = len(df)

    # Count rows with enumerate so the progress line is right for any index,
    # not only the default 0..n-1 integer index.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        title = row['Title']
        print(f"Processing {position}/{total}: {title}")

        # Search for conference information
        keywords = _keywords_from_search(f"{title} conference agenda topics")

        # If no keywords found, try searching with just the title
        if not keywords:
            keywords = _keywords_from_search(f"{title}")

        # Add conference type as keyword if no other keywords found.
        # A missing Type is read by pandas as a float NaN, which has no .lower().
        if not keywords and 'Type' in row:
            conference_type = row['Type']
            if isinstance(conference_type, str) and conference_type.strip():
                keywords = [conference_type.lower()]

        keywords_list.append(', '.join(keywords) if keywords else 'No keywords found')

        # Rate limiting to avoid API quota issues (no need to wait after the last row)
        if position < total:
            time.sleep(delay)

    return keywords_list

In [52]:
# Run the pipeline over every row of df. Each row issues network requests and is followed
# by a pause, so this cell is slow for a large frame.
print("\nStarting keyword extraction...")
keywords = analyze_conference_keywords(df)
# Attach the results as a new column; the list holds one entry per row, in row order.
df['Keywords'] = keywords


Starting keyword extraction...
Processing 1/237: 2025 PCI SSC North America Community Meeting
Processing 2/237: 2025 PCI SSC Europe Community Meeting
Processing 3/237: 2025 PCI SSC Asia-Pacific Community Meeting
Processing 4/237: RSA Conference 2025
Processing 5/237: Innovate Cybersecurity Summit
Processing 6/237: SAINTCON
Processing 7/237: Cloud & Cyber Security Expo
Processing 8/237: HIMSS
Processing 9/237: HIMSS Europe
Processing 10/237: WiCyS
Processing 11/237: Gartner Security & Risk Management Summit
Processing 12/237: ISACA North America Conference
Processing 13/237: GRC Conference 2025
Processing 14/237: ISACA Europe Conference
Processing 15/237: The Diana Initiative
Processing 16/237: Apres-Cyber Slopes Summit
Processing 17/237: DEF CON
Processing 18/237: Association of Payment Professionals
Processing 19/237: Black Hat
Processing 20/237: Zero Trust World (ThreatLocker)
Processing 21/237: ICCWS
Processing 22/237: HACKMIAMI
Processing 23/237: Techno Security & Digital Forensic

In [53]:
# Save the frame to disk without writing the row index.
df.to_csv("With_Keywords.csv", index=False)