In [8]:
import duckdb
import pandas as pd
# scikit-learn provides the regression model fitted later in this notebook
from sklearn.linear_model import LinearRegression
import numpy as np

In [10]:
# Open the Rossmann DuckDB database file (it is created if it does not exist yet)
db_path = '../db/processed/rossmann.duckdb'
conn = duckdb.connect(db_path)



In [11]:
# Option 1: let DuckDB do the aggregation in SQL
# One row per store, computed over the rows where Open = 1:
# mean / total / min / max / standard deviation of Sales plus the row count,
# ordered so the stores with the highest total sales come first.
store_summary_sql = '''
                         SELECT Store,
                                AVG(Sales)    as avg_sales,
                                SUM(Sales)    as total_sales,
                                COUNT(*)      as days_open,
                                MIN(Sales)    as min_sales,
                                MAX(Sales)    as max_sales,
                                STDDEV(Sales) as sales_std
                         FROM rossmann_sales
                         WHERE Open = 1
                         GROUP BY Store
                         ORDER BY total_sales DESC
                         '''

# Run the query and materialize the result as a pandas DataFrame
result_df = conn.execute(store_summary_sql).fetchdf()

# Show the first rows of the summary
print(result_df.head())



   Store     avg_sales  total_sales  days_open  min_sales  max_sales  \
0    262  20426.218954   18751269.0        918      13210      37376   
1    817  21757.483418   17057867.0        784       6052      38025   
2    562  17958.629630   16486022.0        918      11024      28680   
3   1114  20666.562500   16202585.0        784       8880      35697   
4    251  19123.068036   14896870.0        779       8373      35350   

     sales_std  
0  4253.285380  
1  4674.803920  
2  2898.286359  
3  3452.938601  
4  3547.641612  


In [12]:
# Option 2: pull the rows into pandas and do the modeling in Python
# Every column of the rows where Open = 1 is fetched from DuckDB.
open_days_sql = '''
                          SELECT *
                          FROM rossmann_sales
                          WHERE Open = 1
                          '''
sales_data = conn.execute(open_days_sql).fetchdf()

# Predictors and target for a simple illustrative regression
feature_columns = ['Promo', 'DayOfWeek', 'SchoolHoliday']
X = sales_data[feature_columns]
y = sales_data['Sales']

# Ordinary least squares fit; fit() returns the fitted estimator itself
model = LinearRegression().fit(X, y)

# Coefficients are reported in the same order as feature_columns
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")




Coefficients: [2158.53530848 -137.49911265  138.3081866 ]
Intercept: 6447.435725763866


In [13]:
# Option 3: Use DuckDB's data registration to work with pandas DataFrames

# Build the synthetic features from a seeded generator so that the values
# (and therefore the aggregates printed below) are identical on every
# Restart & Run All. Like np.random.rand, Generator.random draws uniformly
# from the half-open interval [0.0, 1.0).
rng = np.random.default_rng(42)
feature_df = pd.DataFrame({
    'feature1': rng.random(100),
    'feature2': rng.random(100)
})

# Register the DataFrame with DuckDB so SQL can refer to it by the name 'features'
conn.register('features', feature_df)

# Now you can use this DataFrame in SQL queries
result = conn.execute('''
                      SELECT AVG(feature1) as avg_f1, MAX(feature2) as max_f2
                      FROM features
                      ''').fetchdf()

print(result)

# Don't forget to close the connection when done.
# After this line `conn` can no longer run queries; re-run the connect cell
# before re-running any cell that uses it.
conn.close()

     avg_f1    max_f2
0  0.450337  0.994549
