# Snowpark Pandas API Examples

In [1]:
import modin.pandas as pd
# Import the Snowpark pandas plugin for modin
import snowflake.snowpark.modin.plugin

# Create a Snowpark session with a default connection.
# No connection parameters are passed to the builder here, so the connection
# details have to come from outside this notebook (presumably a default
# connection configuration -- TODO confirm for your environment).
# NOTE(review): `session` is not referenced again in the cells below;
# presumably the Snowpark pandas calls use the active session implicitly.
from snowflake.snowpark.session import Session
session = Session.builder.create()



In [2]:
# Build a small Snowpark pandas DataFrame with one string column, one float
# column (containing a missing value) and one integer column.
sample_rows = [
    ['a', 2.1, 1],
    ['b', None, 2],
    ['c', 6.3, 3],
]
column_names = ["COL_STR", "COL_FLOAT", "COL_INT"]
df = pd.DataFrame(sample_rows, columns=column_names)

In [3]:
df

Unnamed: 0,COL_STR,COL_FLOAT,COL_INT
0,a,2.1,1
1,b,,2
2,c,6.3,3


In [4]:
# Persist the DataFrame to the Snowflake table "pandas_test".
# if_exists='replace' is passed so the cell can be re-run when the table
# already exists; index=False asks to_snowflake not to write the DataFrame
# index (semantics per the to_snowflake docs -- confirm for your version).
df.to_snowflake("pandas_test", if_exists='replace',index=False)

In [5]:
# Create a dataframe out of a Snowflake table.
df = pd.read_snowflake('pandas_test')

In [6]:
df.shape

(3, 3)

In [7]:
df.head(2)

Unnamed: 0,COL_STR,COL_FLOAT,COL_INT
0,a,2.1,1
1,b,,2


In [8]:
# Drop the rows whose COL_FLOAT is missing. `df` is rebound to the returned
# frame instead of using inplace=True: this keeps the call chainable and makes
# the data lineage explicit, while later cells still see the filtered `df`.
df = df.dropna(subset=["COL_FLOAT"])

In [9]:
df

Unnamed: 0,COL_STR,COL_FLOAT,COL_INT
0,a,2.1,1
2,c,6.3,3


In [10]:
df.shape

(2, 3)

In [11]:
df.dtypes

COL_STR       object
COL_FLOAT    float64
COL_INT        int64
dtype: object

In [13]:
# Discard the current row labels in favour of a fresh default index, then
# store the frame in "pandas_test2" with index=True and the index labelled
# "row_pos".
df.reset_index(drop=True).to_snowflake(
    'pandas_test2',
    if_exists='replace',
    index=True,
    index_label=['row_pos'],
)

## IO (Read and Write)

In [14]:
# Round trip, part 1: build a small frame and write it to the Snowflake table
# "test_table" (it is read back in the next cell).
df = pd.DataFrame(
    {
        "fruit": ["apple", "orange"],
        "size": [3.4, 5.4],
        "weight": [1.4, 3.2],
    }
)
df.to_snowflake("test_table", if_exists="replace", index=False)

In [15]:
df_read = pd.read_snowflake("test_table")
df_read

Unnamed: 0,fruit,size,weight
0,apple,3.4,1.4
1,orange,5.4,3.2


In [16]:
# Generate the sample CSV file: a header row followed by two records
# (no trailing newline after the last record).
csv_text = 'fruit,size,weight\napple,3.4,1.4\norange,5.4,3.2'
with open("data.csv", "w") as csv_file:
    csv_file.write(csv_text)

In [17]:
# Read from local CSV file
df_csv = pd.read_csv("data.csv")

In [18]:
# Generate the sample JSON file.
# Each record is written on its own line (newline-delimited JSON), so every
# line is a complete, valid JSON document. Joining the objects with a bare
# comma would produce a file that is neither a valid JSON document nor NDJSON.
json_lines = [
    '{"fruit":"apple", "size":3.4, "weight":1.4}',
    '{"fruit":"orange", "size":5.4, "weight":3.2}',
]
with open("data.json", "w") as f:
    f.write('\n'.join(json_lines))

In [19]:
# Read from local JSON file
df_json = pd.read_json('data.json')

In [20]:
# Prerequisite: data.json and data.csv must already be uploaded to the
# Snowflake stage named @TEST_STAGE -- this cell does not upload them.
# Read the CSV and JSON files from the stage.
# NOTE: this rebinds df_csv and df_json, replacing the frames that the
# earlier cells read from the local files.
df_csv = pd.read_csv('@TEST_STAGE/data.csv')
df_json = pd.read_json('@TEST_STAGE/data.json')

## Indexing

In [58]:
df = pd.DataFrame({"a": [1,2,3], "b": ["x", "y", "z"]})

In [59]:
df.columns

Index(['a', 'b'], dtype='object')

In [60]:
df.index

Index([0, 1, 2], dtype='int64')

In [61]:
df["a"]

0    1
1    2
2    3
Name: a, dtype: int64

In [62]:
df["b"]

0    x
1    y
2    z
Name: b, dtype: object

In [63]:
df.iloc[0,1]

'x'

In [64]:
df.loc[df["a"] > 2]

Unnamed: 0,a,b
2,3,z


In [66]:
df.columns = ["c", "d"]
df

Unnamed: 0,c,d
0,1,x
1,2,y
2,3,z


In [67]:
df = df.set_index("c")
df

Unnamed: 0_level_0,d
c,Unnamed: 1_level_1
1,x
2,y
3,z


In [68]:
df.rename(columns={"d": "renamed"})

Unnamed: 0_level_0,renamed
c,Unnamed: 1_level_1
1,x
2,y
3,z


## Missing values

In [70]:
import numpy as np

# 4x4 frame for the missing-value examples that follow: column C is entirely
# NaN and row 2 is entirely NaN.
nan = np.nan
df = pd.DataFrame(
    [
        [nan, 2, nan, 0],
        [3, 4, nan, 1],
        [nan, nan, nan, nan],
        [nan, 3, nan, 4],
    ],
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,
3,,3.0,,4.0


In [71]:
df.isna()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,True
3,True,False,True,False


In [72]:
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0.0
1,3.0,4.0,0.0,1.0
2,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,4.0


In [73]:
df.dropna(how="all")

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
3,,3.0,,4.0


## Type conversion


In [75]:
df = pd.DataFrame({"int": [1,2,3], "str": ["4", "5", "6"]})
df

Unnamed: 0,int,str
0,1,4
1,2,5
2,3,6


In [76]:
df_float = df.astype(float)
df_float

Unnamed: 0,int,str
0,1.0,4.0
1,2.0,5.0
2,3.0,6.0


In [77]:
df_float.dtypes

int    float64
str    float64
dtype: object

In [78]:
# Convert the string column to numbers. The column is selected with df["str"]
# rather than attribute access: `.str` is also the name of the pandas
# string-method accessor on Series, so `df.str` is ambiguous to readers and
# fragile as a way to reach a column.
pd.to_numeric(df["str"])

0    4.0
1    5.0
2    6.0
Name: str, dtype: float64

In [79]:
# Assemble datetimes from separate year / month / day columns.
df = pd.DataFrame(
    {
        'year': [2015, 2016],
        'month': [2, 3],
        'day': [4, 5],
    }
)
pd.to_datetime(df)

0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

## Binary Operations

In [80]:
# Add a two-row frame and a one-row frame. Row 1 exists only in df_1, so the
# result shown below has missing values in that row.
df_1 = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)
df_2 = pd.DataFrame([[6, 7, 8]])
df_1.add(df_2)

Unnamed: 0,0,1,2
0,7.0,9.0,11.0
1,,,


In [81]:
s1 = pd.Series([1, 2, 3])
s2 = pd.Series([2, 2, 2])
s1 + s2

0    3
1    4
2    5
dtype: int64

In [82]:
df = pd.DataFrame({"A": [1,2,3], "B": [4,5,6]})
df["A+B"] = df["A"] + df["B"]
df

Unnamed: 0,A,B,A+B
0,1,4,5
1,2,5,7
2,3,6,9


## Aggregations

In [83]:
# Frame with a trailing all-NaN row, then two aggregations computed per column.
# `np` comes from the numpy import in an earlier cell.
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [np.nan, np.nan, np.nan],
    ],
    columns=['A', 'B', 'C'],
)
df.agg(['sum', 'min'])

Unnamed: 0,A,B,C
sum,12.0,15.0,18.0
min,1.0,2.0,3.0


In [84]:
df.median()

A    4.0
B    5.0
C    6.0
dtype: float64

## Merge

In [86]:
df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [1, 2, 3, 5]})
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5


In [87]:
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [5, 6, 7, 8]})
df2

Unnamed: 0,rkey,value
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [88]:
df1.merge(df2, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,value_x,rkey,value_y
0,foo,1,foo,5
1,foo,1,foo,8
2,bar,2,bar,6
3,baz,3,baz,7
4,foo,5,foo,5
5,foo,5,foo,8


In [89]:
df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5


In [90]:
# Join `other` onto `df`. No `on=` argument is given, so the frames are
# matched by their index rather than by the "key" column (default
# DataFrame.join behaviour -- see the pandas docs). Both frames have a "key"
# column, so lsuffix/rsuffix rename them to key_caller / key_other.
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'B': ['B0', 'B1', 'B2']})
df.join(other, lsuffix='_caller', rsuffix='_other')

Unnamed: 0,key_caller,A,key_other,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2
3,K3,A3,,
4,K4,A4,,
5,K5,A5,,


## Groupby

In [92]:
df = pd.DataFrame({'Animal': ['Falcon', 'Falcon','Parrot', 'Parrot'],
                   'Max Speed': [380., 370., 24., 26.]})
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [93]:
df.groupby(['Animal']).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


## Pivot

In [94]:
# Input for the pivot_table example: A, B and C are categorical keys,
# D and E are the numeric values.
df = pd.DataFrame(
    {
        "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": [
            "small", "large", "large", "small", "small",
            "large", "small", "small", "large",
        ],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
    }
)
df

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [95]:
pd.pivot_table(df, values='D', index=['A', 'B'],
                   columns=['C'], aggfunc="sum")

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5
bar,two,7.0,6
foo,one,4.0,1
foo,two,,6


In [96]:
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two','two'],
                'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                'baz': [1, 2, 3, 4, 5, 6],
                'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


## MultiIndex

In [97]:
# Two parallel label lists: the outer ("first") and inner ("second") level.
first_level = ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]
second_level = ["one", "two", "one", "two", "one", "two", "one", "two"]
arrays = [first_level, second_level]

# Pair the labels position by position into (first, second) tuples.
tuples = list(zip(first_level, second_level))

# Build the two-level index from those tuples and display it.
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [98]:
import numpy as np

# Draw the eight sample values from a seeded generator so the Series is
# identical on every Restart & Run All, instead of changing on each run.
rng = np.random.default_rng(42)
pd.Series(rng.standard_normal(8), index=index)

first  second
bar    one      -1.881666
       two       0.595870
baz    one      -0.283088
       two      -1.223938
foo    one      -0.102025
       two       0.629615
qux    one      -0.973516
       two       0.750192
dtype: float64