In [None]:
import requests
import pandas as pd
import time
from pytrends.request import TrendReq

# --- Mediastack API Setup ---
MEDIASTACK_API_KEY = "27c110d2e323cfac65172c9617b73ed8"
MEDIASTACK_API_URL = "http://api.mediastack.com/v1/news"
MEDIASTACK_QUERY = "AI jobs, automation, artificial intelligence workforce"
TOTAL_ARTICLES_NEEDED = 100  
PAGE_SIZE = 50  

# --- NewsAPI Setup ---
NEWS_API_KEY = "35ad2c01903e4ef191d28e5a747d7bd6"
NEWS_API_URL = "https://newsapi.org/v2/everything"
NEWS_QUERY = "AI jobs OR automation OR artificial intelligence workforce"

LABEL_KEYWORDS = {
    "Pro-AI": ["AI creates jobs", "AI improves productivity", "AI job growth"],
    "Anti-AI": ["AI replaces jobs", "AI causes unemployment", "AI job loss"]
}

def fetch_mediastack_news():
    all_articles = []
    offset = 0

    while len(all_articles) < TOTAL_ARTICLES_NEEDED:
        params = {
            "access_key": MEDIASTACK_API_KEY,
            "keywords": MEDIASTACK_QUERY,
            "languages": "en",
            "sort": "published_desc",
            "limit": PAGE_SIZE,
            "offset": offset
        }
        response = requests.get(MEDIASTACK_API_URL, params=params)

        print(f"Mediastack API Request Status: {response.status_code}")

        if response.status_code != 200:
            print("❌ Error:", response.json())
            break  

        # Access the raw JSON data
        raw_json_data = response.json()
        print("\nRaw JSON Data from Mediastack:")
        print(raw_json_data)

        articles = raw_json_data.get("data", [])

        if not articles:
            print("⚠️ No more articles available.")
            break  

        for article in articles:
            label = classify_article(article["title"], article["description"])
            all_articles.append((article["title"], article["description"], article["url"], label))

        offset += PAGE_SIZE
        time.sleep(1)  

    df = pd.DataFrame(all_articles[:TOTAL_ARTICLES_NEEDED], columns=["Title", "Description", "URL", "Label"])

    print("\nMediastack Articles Preview:")
    print(df.head())

    return df

# --- Function to Fetch News from NewsAPI ---
def fetch_newsapi_news():
    all_articles = []
    page = 1

    while len(all_articles) < TOTAL_ARTICLES_NEEDED:
        params = {
            "q": NEWS_QUERY,
            "apiKey": NEWS_API_KEY,
            "language": "en",
            "sortBy": "relevancy",
            "pageSize": PAGE_SIZE,
            "page": page
        }
        response = requests.get(NEWS_API_URL, params=params)

        print(f"NewsAPI Request Status: {response.status_code}")

        if response.status_code != 200:
            print("❌ Error:", response.json())
            break  

        # Access the raw JSON data
        raw_json_data = response.json()
        print("\nRaw JSON Data from NewsAPI:")
        print(raw_json_data)

        articles = raw_json_data.get("articles", [])

        if not articles:
            print("⚠️ No more articles available.")
            break  

        for article in articles:
            label = classify_article(article["title"], article["description"])
            all_articles.append((article["title"], article["description"], article["url"], label))

        page += 1
        time.sleep(1)  

    df = pd.DataFrame(all_articles[:TOTAL_ARTICLES_NEEDED], columns=["Title", "Description", "URL", "Label"])

    print("\nNewsAPI Articles Preview:")
    print(df.head())

    return df

def classify_article(title, description):
    for key, keywords in LABEL_KEYWORDS.items():
        if any(keyword in title or keyword in description for keyword in keywords):
            return key
    return "Neutral"

# --- Function to Fetch Google Trends Data ---
def fetch_google_trends():
    pytrends = TrendReq()
    keywords = ["AI job automation", "AI replacing jobs", "AI job opportunities", "Future of work AI"]
    pytrends.build_payload(kw_list=keywords, timeframe='today 12-m', geo='US')
    trends_data = pytrends.interest_over_time()
    
    if 'isPartial' in trends_data.columns:
        trends_data = trends_data.drop(columns=['isPartial'])  

    print("\nGoogle Trends Data Preview:")
    print(trends_data.head())

    return trends_data

if __name__ == "__main__":
    mediastack_df = fetch_mediastack_news()
    newsapi_df = fetch_newsapi_news()
    trends_df = fetch_google_trends()

    if not mediastack_df.empty:
        mediastack_df.to_csv("mediastack_news.csv", index=False)
        print("✅ Mediastack Data Saved: mediastack_news.csv")

    if not newsapi_df.empty:
        newsapi_df.to_csv("newsapi_news.csv", index=False)
        print("✅ NewsAPI Data Saved: newsapi_news.csv")

    if not trends_df.empty:
        trends_df.to_csv("google_trends_data.csv", index=True)
        print("✅ Google Trends Data Saved: google_trends_data.csv")


Mediastack API Request Status: 200

Raw JSON Data from Mediastack:
{'pagination': {'limit': 50, 'offset': 0, 'count': 0, 'total': 0}, 'data': []}
⚠️ No more articles available.

Mediastack Articles Preview:
Empty DataFrame
Columns: [Title, Description, URL, Label]
Index: []
NewsAPI Request Status: 200

Raw JSON Data from NewsAPI:
{'status': 'ok', 'totalResults': 920, 'articles': [{'source': {'id': None, 'name': 'Harvard Business Review'}, 'author': None, 'title': 'Managing the Future of Work: Microsoft’s AI Perspective', 'description': 'Microsoft executive Jared Spataro discusses adoption of artificial intelligence.', 'url': 'https://hbr.org/podcast/2025/01/managing-the-future-of-work-microsofts-ai-perspective', 'urlToImage': 'https://hbr.org/resources/images/article_assets/2024/11/wide-cold-call-hbr-24.png', 'publishedAt': '2025-01-21T14:22:00Z', 'content': 'BRIAN KENNY: If you’re a regular listener, you know that we’ve tackled the topic of AI multiple times over the past year. And fo

TooManyRequestsError: The request failed: Google returned a response with code 429

In [None]:
import nltk
nltk.download('punkt')
import requests
import pandas as pd
import time
from datetime import datetime, timedelta

# --- NewsAPI Setup ---
NEWS_API_KEY = "72bd1ccbcc75426c89a21dc3f7c52eab"
NEWS_API_URL = "https://newsapi.org/v2/everything"
NEWS_QUERY = "AI jobs OR automation OR artificial intelligence workforce"

TOTAL_ARTICLES_NEEDED = 100  
PAGE_SIZE = 50  


DAYS_BACK_1 = 0   # Fetch latest articles
DAYS_BACK_2 = 30  # Fetch older articles (from 30-60 days ago)

DATE_FROM_OLD = (datetime.now() - timedelta(days=DAYS_BACK_2 + 30)).strftime("%Y-%m-%d")
DATE_TO_OLD = (datetime.now() - timedelta(days=DAYS_BACK_2)).strftime("%Y-%m-%d")


def fetch_newsapi_news(date_from=None, date_to=None, filename="newsapi_news.csv"):
    all_articles = []
    page = 1

    while len(all_articles) < TOTAL_ARTICLES_NEEDED:
        params = {
            "q": NEWS_QUERY,
            "apiKey": NEWS_API_KEY,
            "language": "en",
            "sortBy": "publishedAt",
            "pageSize": PAGE_SIZE,
            "page": page
        }

        if date_from and date_to:
            params["from"] = date_from
            params["to"] = date_to

        response = requests.get(NEWS_API_URL, params=params)

        print(f"Fetching NewsAPI Articles for {filename} | Status: {response.status_code}")

        if response.status_code != 200:
            print("❌ Error:", response.json())
            break 

        data = response.json()
        articles = data.get("articles", [])

        if not articles:
            print("⚠️ No more articles available.")
            break 

        for article in articles:
            all_articles.append((article["title"], article["description"], article["url"]))

        page += 1
        time.sleep(1)

    df = pd.DataFrame(all_articles[:TOTAL_ARTICLES_NEEDED], columns=["Title", "Description", "URL"])

    print(f"\nPreview of {filename}:")
    print(df.head())

    if not df.empty:
        df.to_csv(filename, index=False)
        print(f"✅ Data Saved: {filename}")

    return df



if __name__ == "__main__":
    
    latest_newsapi_df = fetch_newsapi_news(filename="newsapi_news.csv")

    second_newsapi_df = fetch_newsapi_news(date_from=DATE_FROM_OLD, date_to=DATE_TO_OLD, filename="secondnewsapi_news.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saahi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching NewsAPI Articles for newsapi_news.csv | Status: 200
Fetching NewsAPI Articles for newsapi_news.csv | Status: 200

Preview of newsapi_news.csv:
                                               Title  \
0  International universities give graduates an edge   
1                   Growing Malaysia’sAI talent pool   
2  Only 9% of Japanese people have used generativ...   
3  As Meta and tech giants trim jobs, will 2025 w...   
4  The Rise Of The Hybrid Workforce: Humans And A...   

                                         Description  \
0  KUALA LUMPUR: Students should choose an intern...   
1  WHILE Malaysia strives to embed artificial int...   
2  While 61 percent of Japanese people are aware ...   
3  Meta eliminated 3,000 positions, while Workday...   
4  We are moving from the hybrid workplace, with ...   

                                                 URL  
0  https://www.thestar.com.my/news/nation/2025/02...  
1  https://www.thestar.com.my/news/education/2025...  
2  https:

In [27]:
import nltk
nltk.download('punkt')       # Tokenizer models
nltk.download('wordnet')     # Lemmatization models
nltk.download('omw-1.4')     # WordNet data
nltk.download('stopwords')   # Stopwords (optional)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saahi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saahi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saahi\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saahi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
import nltk
import os

# Set the NLTK data path
nltk.data.path.append(os.path.expanduser("~/nltk_data"))
nltk.download('punkt', download_dir=os.path.expanduser("~/nltk_data"))
nltk.download('wordnet', download_dir=os.path.expanduser("~/nltk_data"))
nltk.download('omw-1.4', download_dir=os.path.expanduser("~/nltk_data"))


[nltk_data] Downloading package punkt to C:\Users\saahi/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to C:\Users\saahi/nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\saahi/nltk_data...


True

In [31]:
import nltk
nltk.data.find('tokenizers/punkt')


FileSystemPathPointer('C:\\Users\\saahi\\nltk_data\\tokenizers\\punkt')

In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    return text.lower() if isinstance(text, str) else ""

def merge_csv_files(file1, file2, output_file):
    
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    
    
    if "Label" in df2.columns:
        df1["Label"] = df2["Label"].reindex(df1.index, fill_value="Neutral")
    else:
        df1["Label"] = "Neutral"
    
     
    merged_df = pd.concat([df1, df2], ignore_index=True)
    
    labeled_count = merged_df['Label'].replace("Neutral", np.nan).count()
    if labeled_count / len(merged_df) < 0.5:
        merged_df['Label'] = np.where(merged_df['Label'] == "Neutral", "Unknown", merged_df['Label'])
    
    merged_df['Processed_Description'] = merged_df['Description'].apply(preprocess_text)
    
    merged_df['Stemmed'] = merged_df['Processed_Description'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
    
    merged_df['Lemmatized'] = merged_df['Processed_Description'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
    
    vectorizer = CountVectorizer(max_df=0.85, min_df=2, max_features=1000)
    count_vectorized = vectorizer.fit_transform(merged_df['Processed_Description'])
    count_df = pd.DataFrame(count_vectorized.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Create a DF using TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorized = tfidf_vectorizer.fit_transform(merged_df['Processed_Description'])
    tfidf_df = pd.DataFrame(tfidf_vectorized.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    
    # Save the merged dataframe as a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged file saved as: {output_file}")

file1_path = "newsapi_news.csv"
file2_path = "newsapi_news_data.csv"
output_file_path = "merged_newsapi_data.csv"

merge_csv_files(file1_path, file2_path, output_file_path)

Merged file saved as: merged_newsapi_data.csv
