# Configuration
## General Imports

In [1]:
# General Tools/Utilities
from pathlib import Path
import os
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date
import warnings
from multiprocessing import Pool, cpu_count
import re
from typing import List, Tuple
import buckaroo
from tqdm import tqdm
import gc
from fuzzywuzzy import fuzz, process
# Warnings
# NOTE(review): the 'ignore' action silences every warning for the rest of the
# session, including deprecation and numerical warnings raised by the
# libraries imported above. Consider narrowing it with `category=` / `module=`.
import warnings
warnings.filterwarnings('ignore')

Buckaroo has been enabled as the default DataFrame viewer.  To return to default dataframe visualization use `from buckaroo import disable; disable()`
must be running inside ipython to enable default display via enable()


## ML Imports

In [6]:
# Machine Learning Imports
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb

## Custom Imports

In [7]:
# Custom Imports
from data_loader import DataLoader
from jse_companies import jse_company_names
from technical_indicators import add_all_indicators, filter_buy_signals, get_signals_summary
from data import MarketDataPipeline, create_pipeline

ModuleNotFoundError: No module named 'config'

# Get Data

In [None]:
# df = pipeline.load_stock_data(
#     start_date='2017-01-01', 
#     end_date='2025-03-31'
# )

In [4]:
# Read the joined newspaper parquet file into a polars DataFrame.
# NOTE(review): absolute, machine-specific path - the notebook will not run on
# another machine without editing this line.
fileLoc = 'C:/Users/Joshh/Projects/Stocks/Data/joined_newspaper_data_v5.parquet'
df = pl.read_parquet(fileLoc)

In [5]:
# Preview the first rows of the frame loaded above.
df.head()

PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…

In [None]:
# Location of the parquet file handed to the pipeline as its sentiment source.
# NOTE(review): absolute, machine-specific path.
sent_fileLoc = 'C:/Users/Joshh/Projects/Stocks/Data/joined_newspaper_data_v5.parquet'

# Build the pipeline through the `create_pipeline` factory function.
# The sentiment path must be `sent_fileLoc` (defined just above); the previous
# argument, `df_sentiLoc`, was never defined and raised NameError when run.
integrated_data = create_pipeline(
    data_loader=DataLoader(),
    jse_companies=jse_company_names,
    sentiment_file_path=sent_fileLoc,
)



In [18]:
import polars as pl

def load_sentiment(fileLoc: str) -> pl.DataFrame:
    """Read the sentiment parquet file and collapse its finance scores.

    Two columns are added to the loaded frame:

    * ``avg_polarity`` - the mean of ``title_fin_polarity`` and
      ``text_fin_polarity`` when both are non-zero, otherwise
      ``text_fin_polarity`` unchanged.
    * ``avg_subjectivity`` - the same rule applied to
      ``title_fin_subjectivity`` and ``text_fin_subjectivity``.

    Afterwards the raw score columns, ``publication_date`` and
    ``organizations`` are removed, skipping any that are not present.

    Args:
        fileLoc: Path of the parquet file to read.

    Returns:
        The loaded frame with the two averaged columns appended and the
        listed source columns dropped.
    """

    def _title_text_mean(title_name: str, text_name: str, out_name: str) -> pl.Expr:
        # Average the pair only when both scores are non-zero; in every other
        # case the text score is used as-is.
        title_score, text_score = pl.col(title_name), pl.col(text_name)
        both_nonzero = (title_score != 0) & (text_score != 0)
        return (
            pl.when(both_nonzero)
            .then((title_score + text_score) / 2)
            .otherwise(text_score)
            .alias(out_name)
        )

    scored = pl.read_parquet(fileLoc).with_columns([
        _title_text_mean("title_fin_polarity", "text_fin_polarity", "avg_polarity"),
        _title_text_mean("title_fin_subjectivity", "text_fin_subjectivity", "avg_subjectivity"),
    ])

    # Columns that are no longer needed once the averages exist.
    obsolete = [
        'publication_date',
        'organizations',
        'sentiment_polarity',
        'sentiment_subjectivity',
        'title_polarity',
        'title_subjectivity',
        'title_fin_polarity',
        'title_fin_subjectivity',
        'text_fin_polarity',
        'text_fin_subjectivity',
    ]
    # Only drop what actually exists so a missing column is not an error.
    present = [name for name in obsolete if name in scored.columns]
    return scored.drop(present)


In [20]:
# Run the loader on the sentiment parquet file and preview the result.
# Relies on `sent_fileLoc` already being defined in the kernel.
df_test = load_sentiment(sent_fileLoc)
df_test.head()

PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…

In [10]:
# List the column names of the raw frame `df`.
df.columns

['publication_date',
 'organizations',
 'title',
 'sentiment_polarity',
 'sentiment_subjectivity',
 'title_polarity',
 'title_subjectivity',
 'title_fin_polarity',
 'title_fin_subjectivity',
 'text_fin_polarity',
 'text_fin_subjectivity',
 'institution',
 'date',
 'symbol']

In [9]:
# Sentiment parquet file location (same literal path as `fileLoc` earlier in
# the notebook).
sent_fileLoc = 'C:/Users/Joshh/Projects/Stocks/Data/joined_newspaper_data_v5.parquet'

# Reload the raw frame and show its first ten rows.
df = pl.read_parquet(sent_fileLoc)
df.head(n=10)

PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…

In [None]:
# Show the rows where `sentiment_polarity` and `text_fin_polarity` hold
# different values.
df.filter(pl.col('sentiment_polarity') != pl.col('text_fin_polarity'))

PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…