In [1]:
!pip install requests beautifulsoup4 pandas




In [None]:
import requests
import pandas as pd
from time import sleep
from datetime import datetime
import random
from google.colab import files

# YouTube Data API v3 credentials and endpoints.
# `key` is a placeholder: replace it with a real API key before running,
# otherwise this literal string is what gets sent as the `key` parameter.
key = "Insert Google API"
# Endpoint requested by scrape_youtube() to find videos for a search term.
search_url = "https://www.googleapis.com/youtube/v3/search"
# Endpoint requested by fetch_video() to get per-video details.
video_url = "https://www.googleapis.com/youtube/v3/videos"

# Search terms to collect videos for. The collection loop further down calls
# scrape_youtube() once per entry, so each string here becomes one search
# request. Entries are grouped by topic purely for readability; the grouping
# comments have no effect at runtime.
queries = [
    # General AI & ML
    "artificial intelligence",
    "machine learning",
    "deep learning",
    "reinforcement learning",
    "supervised learning",
    "unsupervised learning",
    "semi-supervised learning",
    "self-supervised learning",
    "neural networks",
    "transformer models",
    "generative AI",
    "AI applications",
    "AI in healthcare",
    "AI in finance",
    "AI in robotics",
    "AI in education",
    "machine learning algorithms",
    "automated machine learning",
    "AI ethics",
    "explainable AI",
    "AI safety",
    "AI governance",

    # Core Algorithms & Techniques
    # Supervised learning
    "linear regression",
    "logistic regression",
    "decision trees",
    "random forest classifier",
    "support vector machines",
    "naive Bayes classifier",
    "k-nearest neighbors classifier",
    "gradient boosting machines",
    "XGBoost classifier",
    "LightGBM classifier",
    "CatBoost classifier",
    "ridge regression",
    "lasso regression",
    "elastic net regression",
    "multi-class classification",
    "ordinal regression",
    "ensemble methods in machine learning",
    "stacking ensemble learning",
    "bagging and boosting",
    "regression trees",
    "classification algorithms",
    "cross-validation techniques",

    # Unsupervised learning
    "k-means clustering",
    "hierarchical clustering",
    "density-based clustering",
    "DBSCAN algorithm",
    "Gaussian mixture models",
    "mean shift clustering",
    "spectral clustering",
    "affinity propagation",
    "self-organizing maps",
    "principal component analysis PCA",
    "independent component analysis ICA",
    "t-SNE visualization",
    "UMAP dimensionality reduction",
    "autoencoders for representation learning",
    "deep clustering methods",
    "latent Dirichlet allocation LDA",
    "topic modeling",
    "anomaly detection",
    "outlier detection algorithms",

    # Deep Learning & Neural Architectures
    # NOTE(review): short generic terms such as "transformers" may also match
    # non-ML videos on YouTube — confirm the results are what you expect.
    "convolutional neural networks",
    "recurrent neural networks",
    "long short-term memory",
    "transformers",
    "attention mechanisms",
    "vision transformers",
    "GANs generative adversarial networks",
    "BERT model",
    "GPT models",
    "diffusion models",
    "multi-modal learning",
    "zero-shot learning",
    "few-shot learning",
    "meta learning",
    "neural architecture search",

    # Libraries & Frameworks
    "TensorFlow machine learning",
    "PyTorch deep learning",
    "Scikit-learn algorithms",
    "Keras deep learning",
    "Hugging Face transformers",
    "JAX ML library",
    "ONNX AI models",

    # Evaluation, Fairness, and Interpretability
    "model evaluation in machine learning",
    "model interpretability",
    "model explainability",
    "fairness in machine learning",
    "bias in AI models",
    "AUC ROC evaluation",
    "precision recall tradeoff",
    "SHAP values",
    "LIME explainability",

    # AI Research Topics & Trends
    "foundation models",
    "large language models",
    "AI and climate change",
    "AI for social good",
    "neurosymbolic AI",
    "human-in-the-loop learning",
    "online learning",
    "continual learning",
    "federated learning",
    "privacy preserving machine learning",
    "causal inference in ML",
    "contrastive learning",
    "representation learning"
]

def fetch_video(video_ids, *, timeout=30):
    """Fetch snippet, content details and statistics for the given videos.

    Args:
        video_ids: Iterable of YouTube video ID strings. Falsy entries
            (``None``, ``""``) are ignored.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of the ``items`` entries from the Videos endpoint responses.
        Empty when no usable IDs are supplied; no request is made then.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a response does not arrive within ``timeout``.
        KeyError: If a successful response carries no ``items`` field.
    """
    ids = [video_id for video_id in video_ids if video_id]
    if not ids:
        # Nothing to look up; avoid sending a request with an empty `id` filter.
        return []

    # NOTE(review): the Videos endpoint is understood to accept at most 50 IDs
    # per call — confirm against the API reference. Batching keeps larger
    # inputs working instead of failing on a single oversized request.
    batch_size = 50
    items = []
    for start in range(0, len(ids), batch_size):
        params = {
            "part": "snippet,contentDetails,statistics",
            "id": ",".join(ids[start:start + batch_size]),
            "key": key,
        }
        # A timeout stops one stalled connection from hanging the whole run.
        response = requests.get(video_url, params=params, timeout=timeout)
        response.raise_for_status()
        items.extend(response.json()["items"])
    return items

def scrape_youtube(query, *, max_results=10, timeout=30):
    """Search YouTube for ``query`` and return one row per video found.

    The Search endpoint is asked for videos matching ``query``; the IDs it
    returns are then passed to ``fetch_video`` for duration and statistics.

    Args:
        query: Search term sent as the ``q`` parameter.
        max_results: Value sent as ``maxResults`` on the search request.
        timeout: Seconds to wait for the search response before giving up.

    Returns:
        A list of dicts with the keys ``query``, ``video_id``, ``title``,
        ``channel``, ``publish_date``, ``duration``, ``views``, ``url`` and
        ``timestamp_scraped``. Empty when the search yields no video IDs.

    Raises:
        requests.HTTPError: If the search request (or the follow-up details
            request) returns an error status.
        requests.Timeout: If the search response does not arrive in time.
    """
    params = {
        "part": "snippet",
        "q": query,
        "type": "video",
        "maxResults": max_results,
        "key": key,
    }
    response = requests.get(search_url, params=params, timeout=timeout)
    response.raise_for_status()
    search_results = response.json()["items"]

    # Skip any result that carries no video ID rather than failing the query.
    video_ids = [
        item["id"]["videoId"]
        for item in search_results
        if item.get("id", {}).get("videoId")
    ]
    if not video_ids:
        # No hits: do not ask the Videos endpoint about an empty ID list.
        return []

    # One timestamp per query so every row of the same scrape agrees.
    scraped_at = datetime.now().isoformat()

    results = []
    for video in fetch_video(video_ids):
        snippet = video["snippet"]
        results.append({
            "query": query,
            "video_id": video["id"],
            "title": snippet["title"],
            "channel": snippet["channelTitle"],
            "publish_date": snippet["publishedAt"],
            # Same "N/A" fallback as views, so one incomplete item does not
            # discard every other row of this query.
            "duration": video.get("contentDetails", {}).get("duration", "N/A"),
            "views": video.get("statistics", {}).get("viewCount", "N/A"),
            "url": f"https://www.youtube.com/watch?v={video['id']}",
            "timestamp_scraped": scraped_at,
        })

    return results

# Collect results for all queries.
# NOTE(review): the YouTube Data API quota documentation lists a search call
# as far more expensive than a videos call — confirm the project's daily
# quota covers len(queries) searches, otherwise the later queries will fail.
all_results = []
for index, q in enumerate(queries):
    if index:
        # Random pause between consecutive queries to space out API calls;
        # there is nothing to wait for before the first one.
        sleep(random.uniform(1.5, 3.0))
    try:
        all_results.extend(scrape_youtube(q))
    except Exception as e:
        # Best-effort collection: report the failure and continue with the
        # remaining queries instead of aborting the whole run.
        print(f"Error with query '{q}': {e}")

if not all_results:
    # Every query failed or returned nothing; the CSV below will be empty.
    print("Warning: no videos were collected; check the API key and quota.")

# Remove duplicates (by video_id): the same video can match several queries.
df = pd.DataFrame(all_results).drop_duplicates(subset="video_id")

# Save to file with timestamp
file = datetime.now().strftime("%Y%m%d_%H%M%S")
file_name = f"youtube AI&ML {file}.csv"
df.to_csv(file_name, index=False)

# For Google Colab users: offer the CSV as a browser download.
# The import sits inside the guarded block because that is the statement that
# raises ImportError outside Colab; files.download() itself does not.
# Note that the top-of-file `from google.colab import files` must also be
# guarded or removed for the script to reach this point outside Colab.
try:
    from google.colab import files
except ImportError:
    print(f"CSV file saved locally as {file_name}.")
else:
    files.download(file_name)

df.head()

