In [None]:
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Browser-like User-Agent header passed to every requests.get call below.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Search phrases; the main loop scrapes each one in turn.
KEYWORDS = [
    "climate change",
    "carbon emission",
    "carbon tax",
    "carbon price",
    "EU ETS",
    "greenhouse gas",
]

# Scraping of a keyword stops at the first article dated before this.
DATE_LIMIT = datetime(2020, 1, 1)

# Search URL template; "{}" receives the keyword with spaces replaced by "_".
BASE_URL = "https://oilprice.com/search/tab/news/{}"

# Output file; results for every keyword are appended to it.
CSV_FILE = "oilprice_news.csv"

def parse_date(date_str):
    """Parse a date written as day, full month name and year (e.g. "14 March 2021").

    Args:
        date_str: Text to parse; leading and trailing whitespace is ignored.

    Returns:
        The parsed ``datetime``, or ``None`` when ``date_str`` is not a string
        or does not match the ``"%d %B %Y"`` format.
    """
    if not isinstance(date_str, str):
        return None
    try:
        return datetime.strptime(date_str.strip(), "%d %B %Y")
    except ValueError:
        # Only a format mismatch is expected here; anything else (including
        # KeyboardInterrupt) must not be swallowed.
        return None

def scrape_keyword(keyword):
    """Collect search results for one keyword, page by page.

    Every ``<li>`` on a result page is inspected; those without an
    ``<h3><a href=...>`` heading are ignored. Scraping stops when a request
    fails, when an article has a missing/unparseable date or a date earlier
    than ``DATE_LIMIT``, or when a page contributes no new article.

    Args:
        keyword: Search phrase; spaces are replaced by underscores in the URL.

    Returns:
        A list of dicts with the keys ``keyword``, ``title``, ``url``,
        ``date`` (``YYYY-MM-DD``) and ``snippet``.
    """
    print(f"\n🔍 Searching for: {keyword}")
    search_term = keyword.replace(" ", "_")
    first_page_url = BASE_URL.format(search_term)
    all_results = []
    seen_links = set()
    page = 1

    while True:
        url = first_page_url if page == 1 else f"{first_page_url}/Page-{page}.html"

        print(f"   ➤ Fetching page {page}")
        try:
            res = requests.get(url, headers=HEADERS, timeout=10)
            res.raise_for_status()
        except requests.RequestException as e:
            print(f"   ❌ Request failed: {e}")
            break

        soup = BeautifulSoup(res.text, "html.parser")
        new_on_page = 0
        reached_cutoff = False

        for item in soup.find_all("li"):
            # Not every <li> is an article: skip the ones without a linked heading.
            heading = item.find("h3")
            a_tag = heading.find("a") if heading is not None else None
            link = a_tag.get("href") if a_tag is not None else None
            if not link or link in seen_links:
                continue

            date_tag = item.find("div", class_="dateadded")
            pub_date = parse_date(date_tag.text if date_tag is not None else "")
            if not pub_date or pub_date < DATE_LIMIT:
                reached_cutoff = True
                break

            snippet_tag = item.find("p")
            seen_links.add(link)
            new_on_page += 1
            all_results.append({
                "keyword": keyword,
                "title": a_tag.text.strip(),
                "url": link,
                "date": pub_date.strftime("%Y-%m-%d"),
                "snippet": snippet_tag.text.strip() if snippet_tag is not None else "",
            })

        # A page with no new article means the results are exhausted (or the
        # site is repeating a page); without this check the loop never ends,
        # because unrelated <li> elements keep the item list non-empty.
        if reached_cutoff or new_on_page == 0:
            break

        page += 1
        time.sleep(random.uniform(2, 4))  # respectful delay before the next page

    return all_results

# Main execution loop
import os  # needed only here, to decide whether the CSV still needs a header row

for position, kw in enumerate(KEYWORDS, start=1):
    results = scrape_keyword(kw)
    if results:
        df = pd.DataFrame(results)
        # Append to the shared CSV; emit the header only for a new or empty
        # file so repeated appends still form a single table.
        needs_header = not os.path.isfile(CSV_FILE) or os.path.getsize(CSV_FILE) == 0
        df.to_csv(CSV_FILE, mode='a', index=False, header=needs_header)
        print(f"✅ Saved {len(results)} articles for '{kw}'")
    else:
        print(f"⚠️ No results for '{kw}'")

    # Pause between keywords only; nothing follows the last one.
    if position < len(KEYWORDS):
        print("⏳ Waiting 10 seconds before next keyword...\n")
        time.sleep(10)



🔍 Searching for: climate change
   ➤ Fetching page 1
   ➤ Fetching page 2
   ➤ Fetching page 3
   ➤ Fetching page 4
   ➤ Fetching page 5
   ➤ Fetching page 6
   ➤ Fetching page 7
   ➤ Fetching page 8
   ➤ Fetching page 9
   ➤ Fetching page 10
   ➤ Fetching page 11
   ➤ Fetching page 12
   ➤ Fetching page 13
   ➤ Fetching page 14
   ➤ Fetching page 15
   ➤ Fetching page 16
   ➤ Fetching page 17
   ➤ Fetching page 18
   ➤ Fetching page 19
   ➤ Fetching page 20
   ➤ Fetching page 21
   ➤ Fetching page 22
   ➤ Fetching page 23
   ➤ Fetching page 24
   ➤ Fetching page 25
   ➤ Fetching page 26
   ➤ Fetching page 27
   ➤ Fetching page 28
   ➤ Fetching page 29
   ➤ Fetching page 30
   ➤ Fetching page 31
   ➤ Fetching page 32
   ➤ Fetching page 33
   ➤ Fetching page 34
   ➤ Fetching page 35
✅ Saved 689 articles for 'climate change'
⏳ Waiting 10 seconds before next keyword...


🔍 Searching for: carbon emission
   ➤ Fetching page 1
   ➤ Fetching page 2
   ➤ Fetching page 3
   ➤ Fetching page 4
 

In [None]:
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Browser-like User-Agent header passed to every requests.get call below.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Search phrases; the main loop scrapes each one in turn.
KEYWORDS = ["EU ETS"]

# Scraping of a keyword stops at the first article dated before this (10-year limit).
DATE_LIMIT = datetime(2015, 1, 1)

# Search URL template; "{}" receives the keyword with spaces replaced by "_".
BASE_URL = "https://oilprice.com/search/tab/news/{}"

# Output file; results for every keyword are appended to it.
CSV_FILE = "oilprice_news_with_content.csv"

def parse_date(date_str):
    """Parse a date written as day, full month name and year (e.g. "14 March 2021").

    Args:
        date_str: Text to parse; leading and trailing whitespace is ignored.

    Returns:
        The parsed ``datetime``, or ``None`` when ``date_str`` is not a string
        or does not match the ``"%d %B %Y"`` format.
    """
    if not isinstance(date_str, str):
        return None
    try:
        return datetime.strptime(date_str.strip(), "%d %B %Y")
    except ValueError:
        # Only a format mismatch is expected here; anything else (including
        # KeyboardInterrupt) must not be swallowed.
        return None

def get_article_content(article_url):
    """Download an article page and return the text of its body paragraphs.

    The body is taken from the ``<p>`` elements inside
    ``<div class="article-content">``, joined with single spaces.

    Args:
        article_url: URL of the article page.

    Returns:
        The article text, or ``""`` when the request fails or the page has no
        ``article-content`` container. The function always returns a string.
    """
    try:
        res = requests.get(article_url, headers=HEADERS, timeout=10)
        res.raise_for_status()
    except requests.RequestException:
        # Best effort: a missing article body must not abort the whole scrape.
        return ""

    soup = BeautifulSoup(res.text, "html.parser")
    content_div = soup.find("div", class_="article-content")
    if content_div is None:
        # Previously this path fell through and returned None.
        return ""

    paragraphs = content_div.find_all("p")
    return " ".join(p.get_text(strip=True) for p in paragraphs).strip()

def scrape_keyword(keyword):
    """Collect search results, including article text, for one keyword.

    Every ``<li>`` on a result page is inspected; those without an
    ``<h3><a href=...>`` heading are ignored. For each accepted article the
    article page itself is fetched via ``get_article_content``. Scraping stops
    when a request for a result page fails, when an article has a
    missing/unparseable date or a date earlier than ``DATE_LIMIT``, or when a
    page contributes no new article.

    Args:
        keyword: Search phrase; spaces are replaced by underscores in the URL.

    Returns:
        A list of dicts with the keys ``keyword``, ``title``, ``url``,
        ``date`` (``YYYY-MM-DD``), ``snippet`` and ``content``.
    """
    print(f"\n🔍 Searching for: {keyword}")
    search_term = keyword.replace(" ", "_")
    first_page_url = BASE_URL.format(search_term)
    all_results = []
    seen_links = set()
    page = 1

    while True:
        url = first_page_url if page == 1 else f"{first_page_url}/Page-{page}.html"

        print(f"   ➤ Fetching page {page}: {url}")
        try:
            res = requests.get(url, headers=HEADERS, timeout=10)
            res.raise_for_status()
        except requests.RequestException as e:
            print(f"   ❌ Request failed: {e}")
            break

        soup = BeautifulSoup(res.text, "html.parser")
        new_on_page = 0
        reached_cutoff = False

        for item in soup.find_all("li"):
            # Not every <li> is an article: skip the ones without a linked heading.
            heading = item.find("h3")
            a_tag = heading.find("a") if heading is not None else None
            link = a_tag.get("href") if a_tag is not None else None
            if not link or link in seen_links:
                continue

            date_tag = item.find("div", class_="dateadded")
            pub_date = parse_date(date_tag.text if date_tag is not None else "")
            if not pub_date or pub_date < DATE_LIMIT:
                reached_cutoff = True
                break

            snippet_tag = item.find("p")
            # Normalise a missing body to "" so the CSV column stays textual.
            content = get_article_content(link) or ""

            seen_links.add(link)
            new_on_page += 1
            all_results.append({
                "keyword": keyword,
                "title": a_tag.text.strip(),
                "url": link,
                "date": pub_date.strftime("%Y-%m-%d"),
                "snippet": snippet_tag.text.strip() if snippet_tag is not None else "",
                "content": content,
            })
            time.sleep(random.uniform(1.5, 3))  # respectful delay per article

        # A page with no new article means the results are exhausted (or the
        # site is repeating a page); without this check the loop never ends,
        # because unrelated <li> elements keep the item list non-empty.
        if reached_cutoff or new_on_page == 0:
            break

        page += 1
        time.sleep(random.uniform(2, 4))  # respectful delay per page

    return all_results

# Main execution loop
import os  # needed only here, to decide whether the CSV still needs a header row

for position, kw in enumerate(KEYWORDS, start=1):
    results = scrape_keyword(kw)
    if results:
        df = pd.DataFrame(results)
        # Append to the shared CSV; emit the header only for a new or empty
        # file so repeated appends still form a single table.
        needs_header = not os.path.isfile(CSV_FILE) or os.path.getsize(CSV_FILE) == 0
        df.to_csv(CSV_FILE, mode='a', index=False, header=needs_header)
        print(f"✅ Saved {len(results)} articles for '{kw}'")
    else:
        print(f"⚠️ No results for '{kw}'")

    # Pause between keywords only; nothing follows the last one.
    if position < len(KEYWORDS):
        print("⏳ Waiting 10 seconds before next keyword...\n")
        time.sleep(10)



🔍 Searching for: EU ETS
   ➤ Fetching page 1: https://oilprice.com/search/tab/news/EU_ETS
   ➤ Fetching page 2: https://oilprice.com/search/tab/news/EU_ETS/Page-2.html
   ➤ Fetching page 3: https://oilprice.com/search/tab/news/EU_ETS/Page-3.html
   ➤ Fetching page 4: https://oilprice.com/search/tab/news/EU_ETS/Page-4.html
   ➤ Fetching page 5: https://oilprice.com/search/tab/news/EU_ETS/Page-5.html
   ➤ Fetching page 6: https://oilprice.com/search/tab/news/EU_ETS/Page-6.html
   ➤ Fetching page 7: https://oilprice.com/search/tab/news/EU_ETS/Page-7.html
   ➤ Fetching page 8: https://oilprice.com/search/tab/news/EU_ETS/Page-8.html
   ➤ Fetching page 9: https://oilprice.com/search/tab/news/EU_ETS/Page-9.html
   ➤ Fetching page 10: https://oilprice.com/search/tab/news/EU_ETS/Page-10.html
   ➤ Fetching page 11: https://oilprice.com/search/tab/news/EU_ETS/Page-11.html
   ➤ Fetching page 12: https://oilprice.com/search/tab/news/EU_ETS/Page-12.html
   ➤ Fetching page 13: https://oilprice.com/s

Exploring feature similarity using KL divergence, in an attempt to use Yahoo Finance data for transfer learning

In [None]:
# transfer learning attempt

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import yfinance as yf
from scipy.stats import entropy
from sklearn.metrics import r2_score

# ==== CONFIGURATION ====
# Number of full-batch optimisation steps run by pretrain_model().
PRETRAIN_EPOCHS = 500
# Number of full-batch optimisation steps run by fine_tune_model().
FINE_TUNE_EPOCHS = 100

# ==== Load Public Financial Dataset (Yahoo Finance) for Pretraining ====
def load_yahoo_finance_data():
    """Download SPY/GLD/USO closing prices and derive the pretraining table.

    Columns are added in a fixed order (returns, then 5-day rolling
    statistics per ticker, then cross-ticker features, then ``target``);
    callers turn the frame into a positional feature matrix, so that order
    matters.

    Returns:
        A DataFrame of closing prices plus derived features and a ``target``
        column holding the next row's SPY close; rows containing any NaN
        (from the rolling windows and the shifts) are removed.
    """
    symbols = ['SPY', 'GLD', 'USO']
    downloaded = yf.download(symbols, start='2015-01-01', end='2022-01-01')
    frame = downloaded['Close'].dropna()

    # One-day percentage change of each close.
    for symbol in symbols:
        frame[f'{symbol}_pct_change'] = frame[symbol].pct_change()

    # Five-row rolling mean/std and five-row momentum of each close.
    for symbol in symbols:
        close = frame[symbol]
        window = close.rolling(5)
        frame[f'{symbol}_rolling_mean_5'] = window.mean()
        frame[f'{symbol}_rolling_std_5'] = window.std()
        frame[f'{symbol}_momentum_5'] = close - close.shift(5)

    # Relationships between tickers.
    spy = frame['SPY']
    frame['SPY_GLD_ratio'] = spy / frame['GLD']
    frame['SPY_USO_spread'] = spy - frame['USO']

    # Prediction target: the SPY close of the following row.
    frame['target'] = frame['SPY'].shift(-1)

    return frame.dropna()


# ==== Load Your Dataset ====
def load_your_data(path):
    """Read the project CSV and split it into a feature matrix and a target vector.

    Rows containing any missing value are dropped before the split.

    Args:
        path: Location of a CSV file that has a ``date`` column and every
            column named in the feature list below.

    Returns:
        A ``(features, target)`` pair of NumPy arrays: the 17 feature columns
        in the order listed, and the ``label`` column.
    """
    # NOTE(review): 'label' is both a feature and the target, so the model
    # sees the value it is asked to predict. Removing it would change the
    # input width (17) that the pretrained weights expect — confirm intent.
    target_column = 'label'
    feature_columns = [
        'Crude Oil Prices: Brent - Europe',
        'Closing Coal Prices',
        'global_clean_index',
        'production volume in industry',
        ' Price Index for Consumer: Energy(electricity, heat, fuel) ',
        'euro_stoxx',
        'ecb_interest_rate',
        'label',
        ' Price Index for Consumer: Energy(electricity, heat, fuel) _lag1',
        'Crude Oil Prices: Brent - Europe_lag1',
        ' Price Index for Consumer: Energy(electricity, heat, fuel) _lag2',
        'Crude Oil Prices: Brent - Europe_lag2',
        ' Price Index for Consumer: Energy(electricity, heat, fuel) _lag3',
        'Crude Oil Prices: Brent - Europe_lag3',
        'label_lag1',
        'label_lag2',
        'label_lag3',
    ]

    complete_rows = pd.read_csv(path, parse_dates=['date']).dropna()
    return complete_rows[feature_columns].values, complete_rows[target_column].values

# ==== Define Model ====
# Disabled feed-forward variant of the model. It is wrapped in a bare string
# literal, so the class is never defined; FinancialLSTM below is the model
# that pretrain_model() and fine_tune_model() instantiate.
'''
class FinancialRegressor(nn.Module):
    def __init__(self, input_dim):
        super(FinancialRegressor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)
'''
class FinancialLSTM(nn.Module):
    """LSTM followed by a linear head, applied to single-step sequences.

    Args:
        input_size: Number of features per sample.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # Attribute names double as state_dict prefixes for saved weights.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a ``(batch, features)`` tensor to ``(batch, output_size)``."""
        # Insert a sequence axis of length 1: (batch, features) -> (batch, 1, features).
        as_sequence = x.unsqueeze(1)
        hidden_states, _ = self.lstm(as_sequence)
        # Keep only the hidden state of the final (here: only) time step.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

# ==== Pretrain Model ====
def pretrain_model():
    """Pretrain a FinancialLSTM on the Yahoo Finance table and save its weights.

    Features are standardised, the model is trained full-batch with Adam
    (lr 0.001) on an MSE loss for ``PRETRAIN_EPOCHS`` steps, and the resulting
    ``state_dict`` is written to ``pretrained_model.pt``.

    Returns:
        The ``StandardScaler`` fitted on the pretraining features.
    """
    frame = load_yahoo_finance_data()
    targets = frame['target'].values
    raw_features = frame.drop(columns='target').values

    scaler = StandardScaler()
    features = scaler.fit_transform(raw_features)

    model = FinancialLSTM(
        input_size=features.shape[1],
        hidden_size=64,
        num_layers=2,
        output_size=1,
    )
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    inputs = torch.tensor(features, dtype=torch.float32)
    # Column vector so the shape matches the model output (N, 1).
    expected = torch.tensor(targets, dtype=torch.float32).view(-1, 1)

    for epoch in range(PRETRAIN_EPOCHS):
        optimizer.zero_grad()
        loss = criterion(model(inputs), expected)
        loss.backward()
        optimizer.step()
        if epoch % 5 == 0:
            print(f"[Pretrain] Epoch {epoch}: Loss = {loss.item():.4f}")

    torch.save(model.state_dict(), 'pretrained_model.pt')
    return scaler

# ==== Fine-Tune Model ====
def _to_column_distributions(values):
    """Shift each column to be strictly positive and scale it to sum to 1."""
    shifted = values - values.min(axis=0) + 1e-8  # avoid negatives and zeros
    return shifted / shifted.sum(axis=0)


def _report_kl_divergence(reference, candidate):
    """Print the per-feature and mean KL divergence of two equally shaped arrays."""
    # NOTE(review): values are normalised per column and compared row by row,
    # so the result depends on row order rather than only on the value
    # distributions. A histogram over shared bins may match the intent
    # better — confirm before drawing conclusions from these numbers.
    p = _to_column_distributions(reference)
    q = _to_column_distributions(candidate)

    kl_divergences = [entropy(p[:, i], q[:, i]) for i in range(p.shape[1])]
    print("\n📊 KL Divergence (Pretraining vs Fine-tuning dataset):")
    for i, kl in enumerate(kl_divergences):
        print(f"  Feature {i}: KL = {kl:.4f}")
    print(f"  Mean KL Divergence: {np.mean(kl_divergences):.4f}\n")


def fine_tune_model(scaler, your_data_path):
    """Fine-tune the pretrained LSTM on the project dataset and report R².

    Steps: print a KL-divergence comparison between the pretraining features
    and the project features, split the project data chronologically (last
    20% for validation), standardise it, load ``pretrained_model.pt`` and
    train full-batch with Adam (lr 0.0001) for ``FINE_TUNE_EPOCHS`` steps.

    Args:
        scaler: Scaler fitted on the pretraining features (as returned by
            ``pretrain_model``); used only for the KL comparison.
        your_data_path: CSV path passed on to ``load_your_data``.

    Returns:
        None. Losses and the final validation R² are printed.
    """
    X, y = load_your_data(your_data_path)

    # === KL Divergence Measurement ===
    pretrain_df = load_yahoo_finance_data()
    pretrain_features = scaler.transform(pretrain_df.drop(columns='target').values)

    # The comparison needs equally long arrays, so truncate *copies* here.
    # Truncating X itself (as before) left y at full length and made
    # train_test_split fail with "inconsistent numbers of samples".
    min_rows = min(pretrain_features.shape[0], X.shape[0])
    _report_kl_divergence(pretrain_features[:min_rows], X[:min_rows])

    # Chronological split: no shuffling, the last 20% is held out.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Fit the fine-tuning scaler on the training rows only, so validation
    # statistics do not leak into preprocessing.
    fine_tune_scaler = StandardScaler()
    X_train = fine_tune_scaler.fit_transform(X_train)
    X_val = fine_tune_scaler.transform(X_val)

    model = FinancialLSTM(input_size=X_train.shape[1], hidden_size=64, num_layers=2, output_size=1)
    model.load_state_dict(torch.load('pretrained_model.pt'))

    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.MSELoss()

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

    for epoch in range(FINE_TUNE_EPOCHS):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val_tensor), y_val_tensor)

        if epoch % 10 == 0:
            print(f"[Fine-Tune] Epoch {epoch}: Train Loss = {loss.item():.4f}, Val Loss = {val_loss.item():.4f}")

    # === Final R² Score ===
    # Predict once more after training so the score does not rely on a
    # loop-local variable (undefined when FINE_TUNE_EPOCHS is 0).
    model.eval()
    with torch.no_grad():
        val_output = model(X_val_tensor)
    r2 = r2_score(y_val_tensor.numpy(), val_output.numpy())
    print(f"\n📈 Final R² Score on Validation Set: {r2:.4f}")


if __name__ == "__main__":
    # Stage 1: pretrain on the Yahoo Finance table; keeps the fitted scaler.
    scaler = pretrain_model()
    # Stage 2: compare feature distributions and fine-tune on the project CSV.
    fine_tune_model(
        scaler,
        your_data_path='/content/final_dataset (1).csv',
    )


[*********************100%***********************]  3 of 3 completed


[Pretrain] Epoch 0: Loss = 70986.6094
[Pretrain] Epoch 5: Loss = 70967.5000
[Pretrain] Epoch 10: Loss = 70947.0938
[Pretrain] Epoch 15: Loss = 70923.5938
[Pretrain] Epoch 20: Loss = 70894.6953
[Pretrain] Epoch 25: Loss = 70857.4141
[Pretrain] Epoch 30: Loss = 70808.0391
[Pretrain] Epoch 35: Loss = 70742.2344
[Pretrain] Epoch 40: Loss = 70655.6797
[Pretrain] Epoch 45: Loss = 70545.1641
[Pretrain] Epoch 50: Loss = 70409.3984
[Pretrain] Epoch 55: Loss = 70248.9688
[Pretrain] Epoch 60: Loss = 70065.2969
[Pretrain] Epoch 65: Loss = 69859.5625
[Pretrain] Epoch 70: Loss = 69632.4062
[Pretrain] Epoch 75: Loss = 69384.1328
[Pretrain] Epoch 80: Loss = 69115.4453
[Pretrain] Epoch 85: Loss = 68828.0234
[Pretrain] Epoch 90: Loss = 68524.6641
[Pretrain] Epoch 95: Loss = 68209.0391
[Pretrain] Epoch 100: Loss = 67885.2578
[Pretrain] Epoch 105: Loss = 67557.5547
[Pretrain] Epoch 110: Loss = 67229.9688
[Pretrain] Epoch 115: Loss = 66906.0547
[Pretrain] Epoch 120: Loss = 66588.7734
[Pretrain] Epoch 125: 

[*********************100%***********************]  3 of 3 completed


📊 KL Divergence (Pretraining vs Fine-tuning dataset):
  Feature 0: KL = 0.2148
  Feature 1: KL = 0.3761
  Feature 2: KL = 1.1188
  Feature 3: KL = 0.0425
  Feature 4: KL = 0.7219
  Feature 5: KL = 0.0803
  Feature 6: KL = 13.4092
  Feature 7: KL = 0.5722
  Feature 8: KL = 0.6914
  Feature 9: KL = 0.2283
  Feature 10: KL = 0.8446
  Feature 11: KL = 0.0957
  Feature 12: KL = 1.0818
  Feature 13: KL = 0.4316
  Feature 14: KL = 0.8028
  Feature 15: KL = 0.3188
  Feature 16: KL = 0.2597
  Mean KL Divergence: 1.2524






ValueError: Found input variables with inconsistent numbers of samples: [1757, 2370]

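Feature 6 is very different: its KL divergence (13.4092) is more than ten times that of any other feature (the next highest is feature 2 at 1.1188), and it alone pulls the mean up to 1.2524. On the fine-tuning side, index 6 corresponds to `ecb_interest_rate`.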
