In [None]:
# data processing
import pandas as pd
import sqlite3

# Build the filtered feature table from the raw hackathon sample and persist it
# to SQLite (table `filtered_hackathon_sample_v2`) and to
# `filtered_stock_data.csv`.

# Columns that get a lagged copy (`prev_<col>`) and a relative change
# (`<col>_change`) against the previous row of the same ticker.
CHANGE_COLUMNS = ['bvps', 'at_turnover', 'ni_me', 'fcf_me']


def add_lagged_change(frame, column, group_key='stock_ticker'):
    """Add `prev_<column>` and `<column>_change` columns to `frame`.

    `prev_<column>` is the value of `column` on the preceding row within the
    same `group_key` group (NaN for the first row of each group), so `frame`
    must already be sorted chronologically within each group.
    `<column>_change` is `(current - previous) / previous`.

    Parameters:
        frame: DataFrame to extend; it is modified in place.
        column: name of the numeric column to lag.
        group_key: column identifying the series each row belongs to.

    Returns:
        The same `frame` object, for convenience.
    """
    previous = frame.groupby(group_key)[column].shift(1)
    frame[f'prev_{column}'] = previous
    # A previous value of 0 yields +/-inf (or NaN for 0/0) rather than an error.
    frame[f'{column}_change'] = (frame[column] - previous) / previous
    return frame


# Read the raw CSV before opening the database so an unreadable file cannot
# leave a connection open.
df = pd.read_csv('data/hackathon_sample_v2.csv')

conn = sqlite3.connect('stock_dataset.db')
try:
    # Mirror the raw data into SQLite, replacing any earlier copy of the table.
    df.to_sql('hackathon_sample_v2', conn, if_exists='replace', index=False)

    filtered_df = (
        pd.read_sql_query("SELECT year, month, intrinsic_value,stock_exret, stock_ticker,comp_name,be_me,ni_me,fcf_me,betadown_252d, ni_ar1, z_score, ebit_sale, at_turnover, market_equity FROM hackathon_sample_v2", conn)
        # Remove rows with a null in any of the selected columns.
        .dropna()
        .assign(
            roic=lambda d: d['ebit_sale'] * d['at_turnover'],
            # NOTE(review): be_me * market_equity looks like total book equity
            # rather than a per-share figure, despite the `bvps` name - confirm.
            bvps=lambda d: d['be_me'] * d['market_equity'],
        )
        # Chronological order within each ticker is required by the lagging below.
        .sort_values(by=['stock_ticker', 'year', 'month'])
    )

    # NOTE(review): the lag is the previous *row* of the ticker. It equals the
    # previous calendar month only when no months are missing, and the dropna
    # above may have removed some - confirm this is acceptable.
    for column in CHANGE_COLUMNS:
        add_lagged_change(filtered_df, column)

    # Save back to the database and to a flat file.
    filtered_df.to_sql('filtered_hackathon_sample_v2', conn, if_exists='replace', index=False)
    filtered_df.to_csv('filtered_stock_data.csv', index=False)

    conn.commit()
finally:
    # Always release the database handle, even when a step above fails.
    conn.close()


In [None]:
# load data into a pandas dataframe
import pandas as pd
import sqlite3

# Read the filtered table back from SQLite. The connection is closed even if
# the query fails (for example when the table has not been created yet).
conn = sqlite3.connect('stock_dataset.db')
try:
    df = pd.read_sql_query("SELECT * FROM filtered_hackathon_sample_v2", conn)
finally:
    conn.close()

# Preview the first rows of the loaded table.
print(df.head())

In [None]:
import pandas as pd
import os

# Split the filtered dataset into one CSV file per ticker.

# Folder that receives the per-company files.
COMPANY_DIR = "data"

# Load the full dataset
df = pd.read_csv('filtered_stock_data.csv')

# Create a folder for company files
os.makedirs(COMPANY_DIR, exist_ok=True)

# Save each company separately as CSV.
# One groupby pass partitions the rows once instead of building a boolean mask
# over the whole frame for every ticker. sort=False keeps tickers in order of
# first appearance, and rows keep their original order within each group.
# Rows whose ticker is missing are skipped, since groupby drops NaN keys by
# default.
for company, company_df in df.groupby('stock_ticker', sort=False):
    # '/' would be read as a path separator, so replace it in the file name.
    safe_company_name = str(company).replace("/", "_")

    # Save as CSV instead of Parquet
    company_df.to_csv(os.path.join(COMPANY_DIR, f'{safe_company_name}.csv'), index=False)

print("All company data saved separately as CSV.")