In [None]:
# Bind `session` to the Snowpark session that is already active for this notebook;
# later cells use it to create DataFrames and write temporary tables.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Only let WARNING and above through from the "cmdstanpy" logger
# (presumably the backend that Prophet drives in the forecasting cells -- TODO confirm).
import logging
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
# Hide FutureWarning messages for the remainder of the kernel session.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
-- Peek at ten rows of the TPC-H sample ORDERS table: customer key, order date, order total.
SELECT
    o_custkey,
    o_orderdate,
    o_totalprice
FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS
LIMIT 10

In [None]:
import numpy as np
from datetime import datetime, timedelta
import pandas as pd

class OrderGenerator:
    """Generate synthetic order rows shaped like the TPC-H ORDERS table.

    The number of orders created for a day is ``target_daily_orders`` scaled by
    three multiplicative effects: a holiday bump (``seasonal_effect``), a
    weekday/weekend factor (``weekly_effect``) and compound annual growth
    (``trend_effect``). Each order value is drawn from a Pareto-shaped
    distribution, grown over time and perturbed with Gaussian noise.

    All randomness comes from NumPy's global ``np.random`` state; passing
    ``random_seed`` seeds that global state.
    """

    # Columns produced per order, in output order (``o_orderkey`` is appended last).
    _ORDER_COLUMNS = ['o_orderdate', 'o_totalprice', 'o_orderstatus', 'o_clerk', 'o_custkey']
    # Added to the running row number so generated keys start at 1,500,000
    # (offset chosen by the original author "to avoid conflicts").
    _ORDER_KEY_OFFSET = 1_500_000

    def __init__(
        self,
        # Basic parameters
        start_date='1992-01-01',
        end_date='1998-12-31',
        target_daily_total=100_000_000,
        target_daily_orders=500,
        
        # Trend parameters
        annual_growth_rate=0.15,        # 15% annual growth
        order_value_growth_rate=0.05,   # 5% annual growth in order values
        
        # Seasonal parameters
        holiday_peak_day=350,           # Peak shopping day (Dec 16)
        holiday_effect_magnitude=1.0,   # Strength of holiday effect
        seasonal_baseline=0.8,          # Minimum seasonal multiplier
        seasonal_spread=1000,           # Controls how spread out the holiday effect is
        
        # Weekly parameters
        weekend_dip=0.85,              # Weekend order multiplier
        weekday_boost=1.1,             # Weekday order multiplier
        
        # Value distribution parameters
        pareto_shape=2.0,              # Shape parameter for order values
        min_value_factor=0.3,          # Minimum order value as fraction of average
        value_noise_stddev=0.15,       # Standard deviation for order value noise
        
        # Random seed for reproducibility
        random_seed=None
    ):
        """Store the configuration and derive the average / minimum order value.

        Raises:
            ZeroDivisionError: if ``target_daily_orders`` is 0, because the
                average order value is ``target_daily_total / target_daily_orders``.
        """
        self.start_date = pd.to_datetime(start_date)
        self.end_date = pd.to_datetime(end_date)
        self.target_daily_total = target_daily_total
        self.target_daily_orders = target_daily_orders
        
        # Store all other parameters
        self.annual_growth_rate = annual_growth_rate
        self.order_value_growth_rate = order_value_growth_rate
        self.holiday_peak_day = holiday_peak_day
        self.holiday_effect_magnitude = holiday_effect_magnitude
        self.seasonal_baseline = seasonal_baseline
        self.seasonal_spread = seasonal_spread
        self.weekend_dip = weekend_dip
        self.weekday_boost = weekday_boost
        self.pareto_shape = pareto_shape
        self.min_value_factor = min_value_factor
        self.value_noise_stddev = value_noise_stddev
        
        # Derived parameters
        self.avg_order_value = target_daily_total / target_daily_orders
        self.min_order_value = self.avg_order_value * self.min_value_factor
        
        # Seeds NumPy's *global* generator, so it also affects other np.random users.
        if random_seed is not None:
            np.random.seed(random_seed)
    
    def seasonal_effect(self, day_of_year):
        """Return the seasonal multiplier for a day of the year.

        A Gaussian-shaped bump centred on ``holiday_peak_day`` is added to
        ``seasonal_baseline``; the result is clamped at 0.

        Note: the distance to the peak is the plain difference of day numbers,
        so the bump does not wrap around the year boundary (early-January days
        are treated as far from a December peak).
        """
        holiday_effect = np.exp(
            -((day_of_year - self.holiday_peak_day) ** 2) / 
            self.seasonal_spread
        ) * self.holiday_effect_magnitude
        return np.maximum(self.seasonal_baseline + holiday_effect, 0)
    
    def weekly_effect(self, day_of_week):
        """Return ``weekend_dip`` for Saturday/Sunday (5, 6), else ``weekday_boost``."""
        return self.weekend_dip if day_of_week in (5, 6) else self.weekday_boost
    
    def trend_effect(self, years_passed):
        """Return the compound growth factor ``(1 + annual_growth_rate) ** years_passed``."""
        return np.power(1 + self.annual_growth_rate, years_passed)
    
    def generate_order_value(self, years_passed):
        """Draw one order value.

        Uses inverse-transform sampling of a Pareto distribution whose scale is
        ``min_order_value`` and shape is ``pareto_shape``, applies compound
        order-value growth for ``years_passed``, multiplies by Gaussian noise
        centred on 1, and rounds to the nearest whole number.
        """
        u = np.random.random()  # u is in [0, 1), so 1 - u is never 0
        value = self.min_order_value / np.power(1 - u, 1/self.pareto_shape)
        value = value * np.power(1 + self.order_value_growth_rate, years_passed)
        noise = np.random.normal(1, self.value_noise_stddev)
        return round(value * noise)
    
    def generate_clerk(self):
        """Return a clerk id such as ``Clerk#000000951`` with a number in 1..1000.

        TPC-H numbers clerks from 1, so the draw is ``randint(1, 1001)``
        rather than ``randint(1000)`` (which produced 0..999).
        """
        clerk_id = np.random.randint(1, 1001)
        return f"Clerk#{clerk_id:09d}"
    
    def generate_customer(self, num_customers=149999):
        """Return a random customer key in ``1..num_customers`` (inclusive).

        TPC-H customer keys start at 1; drawing with ``randint(num_customers)``
        would emit the non-existent key 0 and never emit ``num_customers``.
        """
        return np.random.randint(1, num_customers + 1)
    
    def generate_orders(self):
        """Generate one row per synthetic order between ``start_date`` and ``end_date``.

        Returns:
            pandas.DataFrame with columns ``o_orderdate``, ``o_totalprice``,
            ``o_orderstatus`` (always ``'O'``), ``o_clerk``, ``o_custkey`` and
            ``o_orderkey``, sorted by order date. ``o_orderkey`` is a running
            number starting at 1,500,000. When no orders are produced (for
            example ``end_date`` before ``start_date``) an empty frame with the
            same columns is returned instead of raising.
        """
        orders = []
        
        # One iteration per calendar day, both endpoints included.
        for current_date in pd.date_range(self.start_date, self.end_date, freq='D'):
            day_of_year = current_date.dayofyear
            years_passed = (current_date - self.start_date).days / 365
            
            seasonal = self.seasonal_effect(day_of_year)
            weekly = self.weekly_effect(current_date.weekday())
            trend = self.trend_effect(years_passed)
            
            # int() guards against NumPy versions where round() returns a float.
            target_orders = int(round(
                self.target_daily_orders * 
                seasonal * weekly * trend
            ))
            
            for _ in range(target_orders):
                # Keep this evaluation order: it fixes the sequence of random draws.
                orders.append({
                    'o_orderdate': current_date,
                    'o_totalprice': self.generate_order_value(years_passed),
                    'o_orderstatus': 'O',
                    'o_clerk': self.generate_clerk(),
                    'o_custkey': self.generate_customer()
                })
        
        # Passing the column list keeps the schema stable even when `orders` is empty,
        # so the sort below cannot fail with a KeyError.
        df = pd.DataFrame(orders, columns=self._ORDER_COLUMNS)
        df = df.sort_values('o_orderdate')
        df['o_orderkey'] = range(
            self._ORDER_KEY_OFFSET, self._ORDER_KEY_OFFSET + len(df)
        )
        
        return df

def generate_and_save_synthetic_data():
    """Generate synthetic orders, write them to CSV and load them into a temporary table.

    Side effects:
        * writes ``synthetic_orders.csv`` to the current working directory;
        * overwrites the temporary table ``synthetic_orders`` through the
          notebook-level Snowpark ``session``.

    Returns:
        None
    """
    # Roughly six and a half years of data (1992-01-01 .. 1998-08-02) with
    # pronounced holiday, weekend and growth patterns.
    params = {
        'start_date': '1992-01-01',
        'end_date': '1998-08-02',
        'target_daily_total': 100_000_000,
        'target_daily_orders': 500,
        'holiday_effect_magnitude': 1.2,
        'weekend_dip': 0.8,
        'annual_growth_rate': 0.15,
        'value_noise_stddev': 0.15
    }
    
    generator = OrderGenerator(**params)
    df = generator.generate_orders()

    # Save the synthetic data to CSV first, then to a temporary table.
    filename = 'synthetic_orders'
    csv_path = filename + '.csv'
    df.to_csv(csv_path, index=False)
    print(f"Orders saved to CSV {filename}.csv")

    csv_df = pd.read_csv(csv_path)
    # Parse the dates of the frame that was just read. The previous code took the
    # column from `df`, which only lined up because pandas aligned the two
    # differently-indexed frames on their index labels.
    csv_df['o_orderdate'] = pd.to_datetime(csv_df['o_orderdate'])

    table_df = session.create_dataframe(csv_df)
    table_df.write.mode("overwrite").save_as_table(filename, table_type="temporary")
    print(f"Order saved to temporary table {filename}")

# Run the generator now: this writes synthetic_orders.csv and (re)creates the
# temporary table synthetic_orders that the SQL cells below read from.
generate_and_save_synthetic_data()

# Growth Accounting

In [None]:
-- Stack the generated orders on top of the TPC-H sample orders using one shared
-- column layout: id, o_orderdate, o_totalprice.
WITH synthetic AS (

    -- The generated table is addressed with quoted lower-case column names.
    SELECT
        "o_custkey" AS id,
        TO_DATE("o_orderdate") AS o_orderdate,
        CAST("o_totalprice" AS NUMERIC) AS o_totalprice
    FROM synthetic_orders
    --SAMPLE (1000000 rows)

),

original AS (

    SELECT
        o_custkey AS id,
        o_orderdate,
        o_totalprice
    FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS
    --SAMPLE (1000000 rows)

)

SELECT id, o_orderdate, o_totalprice FROM synthetic
UNION ALL
SELECT id, o_orderdate, o_totalprice FROM original

In [None]:
-- Revenue per customer per calendar year, taken from the combined orders cell.
SELECT
    id,
    DATE_TRUNC(year, o_orderdate) AS order_year,
    SUM(o_totalprice) AS total
FROM {{ orders }}
GROUP BY ALL
ORDER BY id, order_year

In [None]:
# Bring the per-customer yearly totals from the SQL cell into pandas.
annual_customer_orders_df = annual_customer_orders.to_pandas()

# Densify: go wide (one column per year, gaps filled with 0) and then back to
# long, so every id/year pair has a row even when that customer spent nothing.
id_by_year = annual_customer_orders_df.pivot_table(
    values='TOTAL',
    index='ID',
    columns='ORDER_YEAR',
    fill_value=0,
)
result = id_by_year.reset_index().melt(
    id_vars='ID',
    var_name='ORDER_YEAR',
    value_name='TOTAL',
)

# Persist the long-format frame as a temporary table so SQL cells can query it.
df = session.create_dataframe(result)
df.write.mode("overwrite").save_as_table("annual_customer_orders", table_type="temporary")

In [None]:
-- Spot-check the densified table: first ten id/year rows.
SELECT *
FROM annual_customer_orders
ORDER BY id, order_year
LIMIT 10

In [None]:
-- Growth accounting: label every customer-year by how spend moved against the
-- previous row for the same customer, and compute the resulting revenue change.
with windowed as (
    
    select
        *,
        -- running total of spend for this id up to and including this year
        sum(total) over(partition by id order by order_year asc) as lifetime_spend,
        -- total of the preceding row for this id; 0 when there is no preceding row
        coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,
    from annual_customer_orders

)

select *,
  -- Branches are evaluated top to bottom; the first match wins.
  --   retained    : same positive spend as the previous year
  --   new         : spend this year, none the previous year, and no earlier spend at all
  --   churned     : no spend this year after spending the previous year
  --   expanded    : spend went up from a positive previous year
  --   contracted  : spend went down from a positive previous year
  --   resurrected : spend this year, none the previous year, but earlier spend exists
  --   irrelevant  : anything else (e.g. zero this year and zero the previous year)
  case
    when total = previous_year_total and total > 0 then 'retained'
    when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'
    when total = 0 and previous_year_total > 0 then 'churned'
    when total > previous_year_total and previous_year_total > 0 then 'expanded'
    when total < previous_year_total and previous_year_total > 0 then 'contracted'
    when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'
  else 'irrelevant' end as category,
  -- Signed revenue change implied by the label; reuses the `category` alias defined just above.
  case category
    when 'retained' then 0
    when 'new' then total
    when 'churned' then (-1 * previous_year_total)
    when 'expanded' then total - previous_year_total
    when 'contracted' then (-1 * (previous_year_total - total))
    when 'resurrected' then total
  else 0 end as net_change
from windowed
order by id, order_year

In [None]:
-- Roll the labelled customer-years up to one row per year and category.
SELECT
    DATE_PART(year, order_year) AS order_year,
    category,
    ROUND(SUM(total)) AS total,
    ROUND(SUM(net_change)) AS net_change
FROM {{ labeled_annual_customer_orders }}
GROUP BY ALL

In [None]:
import streamlit as st

# Net change per year, one coloured series per category.
# Ideas for later: a dictionary that pins a colour to each category (may need
# matplotlib), or altair for finer control over the Y-axis ticks.
st.bar_chart(
    annual_growth_labels,
    x='ORDER_YEAR',
    y='NET_CHANGE',
    color='CATEGORY',
    height=750,
)

In [None]:
# Offer the labelled customer-year rows as a UTF-8 encoded CSV download.
df = labeled_annual_customer_orders.to_pandas()
button_csv = df.to_csv().encode("utf-8")
st.download_button(
    label="Download",
    data=button_csv,
    file_name="growth_accounting.csv",
    mime="text/csv",
)

# Forecasting

In [None]:
-- Daily revenue and order count across the combined orders.
SELECT
    DATE_TRUNC(day, o_orderdate) AS order_date,
    SUM(o_totalprice) AS sum_revenue,
    COUNT(*) AS num_orders
FROM {{ orders }}
GROUP BY 1
ORDER BY order_date ASC

In [None]:
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

# Load the daily series and rename the date / revenue columns to 'ds' / 'y'
# (presumably the column names Prophet's fit() looks for -- see the Prophet docs).
df = daily_order_data.to_pandas()
prophet_df = df.rename(columns=dict(ORDER_DATE='ds', SUM_REVENUE='y'))

# Plot the raw daily revenue before any modelling.
st.line_chart(prophet_df, x='ds', y='y')

In [None]:
# Fit a default Prophet model on the daily revenue series.
m = Prophet()
try:
    m.fit(prophet_df)
except Exception as err:
    # Report the concrete failure and stop here. Swallowing it would only defer
    # the crash to the calls below, which need a fitted model, and hide the cause.
    print(f"Prophet fit failed: {type(err).__name__}: {err}")
    raise

# Extend the timeline by 365 daily periods past the history and predict over it.
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
fig1 = m.plot(forecast)

In [None]:
# Draw Prophet's component plot for the forecast; the returned figure is kept in fig2.
fig2 = m.plot_components(forecast)

In [None]:
# Chart observed revenue next to the forecast. yhat is blanked out (NaN) for
# every date up to the last observed one, so only the future part is drawn.
last_observed = m.history['ds'].max()
future_only_yhat = np.where(forecast['ds'] > last_observed, forecast['yhat'], np.nan)

df = pd.DataFrame({
    'ds': forecast['ds'],
    'y': m.history['y'],
    'yhat': future_only_yhat,
})

st.line_chart(df, x='ds', y=['y', 'yhat'])

# Customer Segmentation

In [None]:
-- Ten rows from the uploaded S&P 500 company list.
SELECT *
FROM ADHOC_ANALYSIS.USER_UPLOADS.SP500_COMPANY_LIST
LIMIT 10

In [None]:
import requests

def get_wiki_extract(title, timeout=10):
    """Fetch the plain-text intro section of an English Wikipedia page.

    Args:
        title: Page title to look up, passed as the API's ``titles`` parameter.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. Without it a stalled connection would
            block the notebook indefinitely.

    Returns:
        The page's ``extract`` text; ``"No extract available"`` when the
        response carries no page or no extract (for example an API error
        payload); or ``"Error: <status>"`` when the HTTP status is not 200.

    Raises:
        Whatever ``requests.get`` / ``response.json()`` raise (connection
        errors, timeouts, invalid JSON); these are left to the caller.
    """
    # Base URL for Wikipedia's API
    url = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,  # Only get the intro section
        "explaintext": True,  # Get plain text instead of HTML
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    # Navigate defensively: error payloads have no "query"/"pages" keys, and
    # "pages" may be empty, so neither case should raise here.
    pages = response.json().get("query", {}).get("pages", {})
    # Take the first (and normally only) page of the result.
    page = next(iter(pages.values()), {})
    return page.get("extract", "No extract available")

In [None]:
# Pull the company list from the SQL cell and collect the NAME column as a list.
df = sample_company_data.to_pandas()
company_names = df['NAME'].tolist()
# Accumulates one (name, description) tuple per company.
csv_list = []

print("extracting descriptions")

for name in company_names:
    try:
        # Spaces are replaced with underscores before the name is used as the page title.
        extract = get_wiki_extract(name.replace(" ", "_"))
        print(f'extracted description of {name} from Wikipedia')
    except Exception as e:
        # Best effort: a failed lookup is logged and recorded with a placeholder
        # so the loop keeps going for the remaining companies.
        print(f"Error getting Wikipedia extract for {name}: {str(e)}")
        extract = "None available"
        
    csv_list.append((name, extract))

print("finished extracting descriptions")

# Save the collected pairs as the temporary table "prospects" for SQL querying.
# Note that `df` is rebound twice here: first to a pandas frame, then to the Snowpark frame.
df = pd.DataFrame(csv_list, columns=['name', 'description'])
df = session.create_dataframe(df)
df.write.mode("overwrite").save_as_table("prospects", table_type="temporary")

In [None]:
-- Show every scraped company name with its description.
SELECT
    "name",
    "description"
FROM prospects

In [None]:
-- Label ten prospects with Cortex CLASSIFY_TEXT along three dimensions:
-- webinar interest, industry and headquarters region. Only the "label" field of
-- each classification result is kept, cast to STRING.
select 
    "name",
    "description",
    snowflake.cortex.classify_text(
        "description",
        ['extremely likely', 'somewhat likely', 'unlikely'],
        {
            'task_description': 'Return the likelihood that this company would be interested in attending a webinar showcasing the GTM utility of Snowflake Notebooks and Anaconda Python Packages.'
        }
    ):label::STRING as persona_likelihood,
    snowflake.cortex.classify_text(
        "description",
        ['healthcare', 'finance', 'retail', 'technology', 'communication', 'other'],
        {
            'task_description': 'Return the most likely industry of the company based on this description.'
        }
    ):label::STRING as industry,
    snowflake.cortex.classify_text(
        "description",
        -- 'Outside the US' was previously misspelled 'Outsite the US'; the category
        -- text is what the classifier sees and what comes back as the label.
        ['California', 'South', 'Northeast', 'Midatlantic', 'Midwest', 'Pacific Northwest', 'Outside the US'],
        {
            'task_description': 'Return the most likely region the company is headquartered in based on this description.'
        }
    ):label::STRING as region
from prospects
-- skip rows without any description text
where "description" is not null and "description" != ''
limit 10
-- other classification ideas: industry, main product, region