# Chapter 10: Interoperability With Other Python Libraries

## Converting to and from a pandas DataFrame

### How to do it...

In [1]:
import polars as pl

In [2]:
# Small two-column integer frame used by the pandas conversion examples.
df = pl.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

type(df)

polars.dataframe.frame.DataFrame

In [8]:
# Convert the Polars DataFrame to pandas; the type check below confirms the
# result is a pandas DataFrame.
pandas_df = df.to_pandas()

type(pandas_df)

pandas.core.frame.DataFrame

In [12]:
# Ask for PyArrow extension arrays in the pandas result; the column dtypes
# are then reported as int64[pyarrow].
df.to_pandas(use_pyarrow_extension_array=True).dtypes

a    int64[pyarrow]
b    int64[pyarrow]
dtype: object

In [5]:
# Round-trip: build a Polars DataFrame from the pandas one.
# Note: this rebinds `df`, replacing the frame created earlier.
df = pl.from_pandas(pandas_df)
type(df)

polars.dataframe.frame.DataFrame

In [42]:
# The pl.DataFrame constructor also accepts a pandas DataFrame directly.
type(pl.DataFrame(pandas_df))

polars.dataframe.frame.DataFrame

### There is more...

In [43]:
# Three-element Polars Series for the Series-level pandas conversions.
s = pl.Series([1,2,3])

In [44]:
# Converting a Polars Series with to_pandas() yields a pandas Series.
type(s.to_pandas())

pandas.core.series.Series

In [45]:
# pl.from_pandas also accepts a pandas Series and returns a Polars Series.
type(pl.from_pandas(s.to_pandas()))

polars.series.series.Series

## Converting to and from NumPy arrays

### How to do it...

In [18]:
import polars as pl
import numpy as np

In [22]:
# 2x3 integer array: two inner lists of three values each.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

In [28]:
# orient='col' makes each inner array of `arr` one column, so the result has
# columns 'a' and 'b' (named by `schema`) with three rows each.
df = pl.from_numpy(arr, schema=['a', 'b'], orient='col')
df

a,b
i64,i64
1,4
2,5
3,6


In [36]:
# Convert the DataFrame to a plain 2D NumPy array, one row per DataFrame row.
df.to_numpy()

array([[1, 4],
       [2, 5],
       [3, 6]])

In [37]:
# structured=True returns a NumPy structured array whose fields carry the
# column names and dtypes.
df.to_numpy(structured=True)

array([(1, 4), (2, 5), (3, 6)], dtype=[('a', '<i8'), ('b', '<i8')])

In [41]:
# A NumPy ufunc applied to Polars expressions; the result is added as a new
# column named 'gcd'.
df.with_columns(
    np.gcd(pl.col('a'), pl.col('b')).alias('gcd')
)

a,b,gcd
i64,i64,i64
1,4,1
2,5,1
3,6,3


### There is more...

In [49]:
# Three-element Polars Series for the Series-to-NumPy conversion.
s = pl.Series([1,2,3])

In [51]:
# A Polars Series converts to a one-dimensional NumPy array.
s.to_numpy()

array([1, 2, 3])

## Interoperating with PyArrow

In [1]:
import polars as pl

In [23]:
import pyarrow.dataset as ds

# Dataset directory, given relative to the notebook's working directory.
file_path = '../data/venture_funding_deals_partitioned'
# Interpret the directory layout using the hive partitioning flavor.
part = ds.partitioning(flavor='hive')
dataset = ds.dataset(file_path, partitioning=part)

# Preview the first five rows; the result is a pyarrow Table.
dataset.head(5)

pyarrow.Table
Company: large_string
Amount: large_string
Lead investors: large_string
Valuation: large_string
Date reported: large_string
Industry: string
----
Company: [["Restaurant365"],["Madhive"],...,["Indigo"],["Chronosphere"]]
Amount: [["$135,000,000"],["$300,000,000"],...,["$250,000,000"],["$115,000,000"]]
Lead investors: [["KKR, L Catterton"],["Goldman Sachs Asset Management"],...,["Flagship Pioneering, State of Michigan Retirement System, Lingotto"],["GV"]]
Valuation: [["$1,000,000,000"],["$1,000,000,000"],...,["na"],["n/a"]]
Date reported: [["5/19/23"],["6/13/23"],...,["9/15/23"],["1/9/23"]]
Industry: [["Accounting"],["Advertising"],...,["Agriculture"],["Analytics"]]

In [24]:
# Load the whole dataset as a pyarrow Table and wrap it in a Polars DataFrame.
df = pl.from_arrow(
    dataset.to_table()
)
df.head()

Company,Amount,Lead investors,Valuation,Date reported,Industry
str,str,str,str,str,str
"""Restaurant365""","""$135,000,000""","""KKR, L Cattert…","""$1,000,000,000…","""5/19/23""","""Accounting"""
"""Madhive""","""$300,000,000""","""Goldman Sachs …","""$1,000,000,000…","""6/13/23""","""Advertising"""
"""Ursa Major,""","""$100,000,000""","""BlackRock, Spa…","""n/a""","""4/26/23""","""Aerospace"""
"""Indigo""","""$250,000,000""","""Flagship Pione…","""na""","""9/15/23""","""Agriculture"""
"""Chronosphere""","""$115,000,000""","""GV""","""n/a""","""1/9/23""","""Analytics"""


In [25]:
# Export the Polars DataFrame as a pyarrow Table.
df.to_arrow()

pyarrow.Table
Company: large_string
Amount: large_string
Lead investors: large_string
Valuation: large_string
Date reported: large_string
Industry: large_string
----
Company: [["Restaurant365","Madhive","Ursa Major,","Indigo","Chronosphere",...,"Professional Fighters League","Newlight Technologies","Pivotal Commware","Via","Aquaback Technologies"]]
Amount: [["$135,000,000","$300,000,000","$100,000,000","$250,000,000","$115,000,000",...,"$100,000,000","$125,000,000","$102,000,000","$110,000,000","$110,000,000"]]
Lead investors: [["KKR, L Catterton","Goldman Sachs Asset Management","BlackRock, Space Capital","Flagship Pioneering, State of Michigan Retirement System, Lingotto","GV",...,"SRJ Sports Investments","GenZero","Gates Frontier, Tracker Capital","83North","Global Emerging Markets Group"]]
Valuation: [["$1,000,000,000","$1,000,000,000","n/a","na","n/a",...,"n/a","n/a","n/a","$3,500,000,000","n/a"]]
Date reported: [["5/19/23","6/13/23","4/26/23","9/15/23","1/9/23",...,"8/30/23","8/3

In [26]:
# Scan the pyarrow dataset lazily, producing a LazyFrame.
lf = pl.scan_pyarrow_dataset(dataset)
# Apply head() before collect() so the five-row limit is part of the lazy
# query, rather than materialising the entire dataset only to preview it.
lf.head().collect()

Company,Amount,Lead investors,Valuation,Date reported,Industry
str,str,str,str,str,str
"""Restaurant365""","""$135,000,000""","""KKR, L Cattert…","""$1,000,000,000…","""5/19/23""","""Accounting"""
"""Madhive""","""$300,000,000""","""Goldman Sachs …","""$1,000,000,000…","""6/13/23""","""Advertising"""
"""Ursa Major,""","""$100,000,000""","""BlackRock, Spa…","""n/a""","""4/26/23""","""Aerospace"""
"""Indigo""","""$250,000,000""","""Flagship Pione…","""na""","""9/15/23""","""Agriculture"""
"""Chronosphere""","""$115,000,000""","""GV""","""n/a""","""1/9/23""","""Analytics"""


In [29]:
# Select one column lazily, collect it, take it as a Series, and export that
# Series as a PyArrow array.
lf.select('Company').collect().to_series().to_arrow()

<pyarrow.lib.LargeStringArray object at 0x12fd3d9c0>
[
  "Restaurant365",
  "Madhive",
  "Ursa Major,",
  "Indigo",
  "Chronosphere",
  "AlphaSense",
  "Skims",
  "SandboxAQ",
  "Humane",
  "OpenAI",
  ...
  "Eagle Eye Networks",
  "Enfabrica",
  "Axiom Space",
  "Sierra Space",
  "Astranis",
  "Professional Fighters League",
  "Newlight Technologies",
  "Pivotal Commware",
  "Via",
  "Aquaback Technologies"
]

## Integration with DuckDB

### How to do it...

In [2]:
import polars as pl

In [3]:
import duckdb

# Single-column frame; the SQL below refers to it by its Python variable name.
df = pl.DataFrame({'a': [1, 2, 3]})

# Query the DataFrame with DuckDB and print the resulting relation.
rel = duckdb.sql('SELECT * FROM df')
rel.show()

┌───────┐
│   a   │
│ int64 │
├───────┤
│     1 │
│     2 │
│     3 │
└───────┘



In [4]:
# Convert the DuckDB relation back into a Polars DataFrame.
rel.pl()

a
i64
1
2
3
