In [0]:
#@title PandasSQLWindow Examples

import pandas as pd

class PandasSQLWindow:
  """
  SQL Window Functions in a unified, simple Pandas API.

  Especially helpful for working with data with numerous 'groups'
  or for those more familiar with
  Window Functions from SQL or Apache Spark.

  The current list only serves to demonstrate a few functionalities
  and is by no means exhaustive. Please feel free to reach out with
  any suggestions or requests.

  Every public method returns a result whose index is the index of
  `data`, sorted, so it can be assigned straight back as a new column.

  Parameters
  ----------

  data: Pandas DataFrame

  partition_by: str or list of str
    Name(s) of groupby column(s)

  order_by: str or list of str
    Name(s) of sorting column(s). For time-based rolling functions,
    the data must be datetime-like

  ascending: bool (default=True)
    Sort ascending vs. descending

  rows_rolling: int (default=None)
    Number of rows to consider for rolling functions
    (e.g. rolling_min, rolling_max, rolling_mean)

  time_rolling: offset (default=None)
    Offset time period (e.g. '10s' for 10 seconds)
    to consider for rolling functions
    (e.g. rolling_min, rolling_max, rolling_mean)

  Raises
  ------
  ValueError
    If both `rows_rolling` and `time_rolling` are given.
  """

  def __init__(self, 
               data, 
               partition_by, 
               order_by, 
               ascending=True,
               rows_rolling=None,
               time_rolling=None):
    """Validate the window spec and build the sorted, grouped view of `data`."""

    if (rows_rolling is not None) and (time_rolling is not None):
      raise ValueError("rows_rolling and time_rolling cannot both be specified")

    self.partition_by = partition_by
    self.order_by = order_by
    self.ascending = ascending
    self.rows_rolling = rows_rolling
    self.time_rolling = time_rolling

    # Remembered so that the group-key index levels which expanding()/rolling()
    # prepend to their results can be counted and stripped again.
    self._index_nlevels = data.index.nlevels

    # sort_values returns a new frame, so `data` itself is left untouched.
    self.window = data.sort_values(order_by, ascending=ascending).groupby(partition_by)

    # Stays None when no rolling spec is given; rolling_* methods check for it.
    self.rolling_window = None
    if rows_rolling is not None:
      self.rolling_window = self.window.rolling(rows_rolling, min_periods=1)
    elif time_rolling is not None:
      # NOTE(review): no `on=` column is passed here, so an offset window
      # presumably needs the frame's *index* to be datetime-like - confirm.
      self.rolling_window = self.window.rolling(time_rolling, min_periods=1)

  @staticmethod
  def postprocess(object, reshape=False, sort_index=True, group_levels=1):
    """
    Bring a grouped result back into the shape of the source frame.

    Parameters
    ----------
    object: Pandas Series or DataFrame
      Result to tidy up. (The name shadows the builtin; it is kept so that
      existing keyword callers continue to work.)

    reshape: bool (default=False)
      Drop the leading `group_levels` index levels.

    sort_index: bool (default=True)
      Sort the result by its index before returning it.

    group_levels: int (default=1)
      Number of leading index levels to drop when `reshape` is True.
    """
    if reshape and group_levels > 0:
      shaped = object.reset_index(level=list(range(group_levels)), drop=True)
    else:
      shaped = object

    if sort_index:
      return shaped.sort_index()
    return shaped

  def _ungroup(self, result):
    """Strip the group-key index levels that expanding()/rolling() prepended."""
    extra_levels = result.index.nlevels - self._index_nlevels
    return self.postprocess(result, reshape=True, group_levels=extra_levels)

  def _rolling(self, column):
    """Return the rolling view of `column`, or fail clearly if none was configured."""
    if self.rolling_window is None:
      raise ValueError("rolling functions require rows_rolling or time_rolling to be specified")
    return self.rolling_window[column]

  def shift(self, column, periods=1):
    """Shift `column` by `periods` rows within each partition, in window order."""
    s = self.window[column].shift(periods=periods)
    return self.postprocess(s)

  def lag(self, column, periods=1):
    """Value of `column` from `periods` rows earlier in the same partition."""
    return self.shift(column, periods=periods)

  def lead(self, column, periods=1):
    """Value of `column` from `periods` rows later in the same partition."""
    return self.shift(column, periods=-periods)

  def last(self, column):
    """
    Previous row's value of `column` within the partition, forward-filled
    over gaps. The fill is done per partition so that a value never
    carries over from one partition into the next.
    """
    s = self.window[column].transform(lambda grp: grp.shift().ffill())
    return self.postprocess(s)

  def rank(self, method='first'):
    """
    Rank of each row's `order_by` value within its partition, as int.

    The ranking direction follows `ascending` when that is a plain bool,
    so the first row in window order gets rank 1. The result is cast to
    int, which truncates fractional ranks (e.g. method='average').
    """
    ascending = self.ascending if pd.api.types.is_bool(self.ascending) else True
    ranked = self.window[self.order_by].rank(method=method, ascending=ascending)
    return self.postprocess(ranked.astype(int))

  def expanding_min(self, column):
    """Running minimum of `column` within each partition."""
    return self._ungroup(self.window[column].expanding().min())

  def expanding_max(self, column):
    """Running maximum of `column` within each partition."""
    return self._ungroup(self.window[column].expanding().max())

  def expanding_mean(self, column):
    """Running mean of `column` within each partition."""
    return self._ungroup(self.window[column].expanding().mean())

  def expanding_sum(self, column):
    """Running sum of `column` within each partition."""
    return self._ungroup(self.window[column].expanding().sum())

  def rolling_min(self, column):
    """Minimum of `column` over the configured rolling window."""
    return self._ungroup(self._rolling(column).min())

  def rolling_max(self, column):
    """Maximum of `column` over the configured rolling window."""
    return self._ungroup(self._rolling(column).max())

  def rolling_mean(self, column):
    """Mean of `column` over the configured rolling window."""
    return self._ungroup(self._rolling(column).mean())

  def rolling_sum(self, column):
    """Sum of `column` over the configured rolling window."""
    return self._ungroup(self._rolling(column).sum())

In [0]:
import pandas as pd
import numpy as np

In [122]:
# Small demo frame: three groups of different sizes, timestamps that are not
# in order within a group, and one missing value.
df = pd.DataFrame(
    dict(
        group=list('abbccc'),
        timestamp=[1, 2, 1, 3, 2, 1],
        value=[1, 2, 3, 4, np.nan, 6],
    )
)

df

Unnamed: 0,group,timestamp,value
0,a,1,1.0
1,b,2,2.0
2,b,1,3.0
3,c,3,4.0
4,c,2,
5,c,1,6.0


In [123]:
# Window spec: partition by 'group', order by 'timestamp', 2-row rolling frame.
data, partition_by, order_by, rows_rolling = df, ['group'], ['timestamp'], 2

w = PandasSQLWindow(data, partition_by, order_by, rows_rolling=rows_rolling)

# Ranking / offset columns
df['count'] = w.rank()  # method defaults to 'first'
df['value_shift'] = w.shift('value')
df['value_last'] = w.last('value')

# Expanding aggregates
df['expanding_sum'] = w.expanding_sum('value')
df['expanding_min'] = w.expanding_min('value')

# Rolling aggregates over the 2-row window
df['rolling_mean'] = w.rolling_mean('value')
df['rolling_sum'] = w.rolling_sum('value')

# Displayed in window order purely for readability; df itself is not reordered.
df.sort_values(by=['group', 'timestamp'])

Unnamed: 0,group,timestamp,value,count,value_shift,value_last,expanding_sum,expanding_min,rolling_mean,rolling_sum
0,a,1,1.0,1,,,1.0,1.0,1.0,1.0
2,b,1,3.0,1,,,3.0,3.0,3.0,3.0
1,b,2,2.0,2,3.0,3.0,5.0,2.0,2.5,5.0
5,c,1,6.0,1,,,6.0,6.0,6.0,6.0
4,c,2,,2,6.0,6.0,6.0,6.0,6.0,6.0
3,c,3,4.0,3,,6.0,10.0,4.0,4.0,4.0
