# [Go to "I/O tools" in Pandas Docs](https://pandas.pydata.org/docs/user_guide/io.html)

In [1]:
import json
import pprint
import sqlite3
from io import StringIO

import numpy as np
import pandas as pd

# 1. CSV & text files

In [2]:
# Load the sample CSV; the first column of the file supplies the row labels.
df = pd.read_csv(
    '../sample_data/data.csv',
    index_col=0,
)
df.head(n=5)

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


In [3]:
# Write the frame out as CSV text at the given path.
df.to_csv(path_or_buf='../sample_data/data.csv')

# 2. JSON

In [4]:
# writing/converting to JSON: serialize the first three rows to a JSON string
# (no path is given, so to_json returns the document instead of writing a file)
json_df = df.iloc[:3].to_json()

# Parse the string back with the json module purely so that pprint can lay
# the nested mapping out readably; `json` and `pprint` are imported in the
# notebook's top import cell.
pprint.pprint(json.loads(json_df))

{'A': {'2000-01-01': -1.6623337071,
       '2000-01-02': -3.1703279261,
       '2000-01-03': -4.9055156169},
 'B': {'2000-01-01': 1.7543746416,
       '2000-01-02': 2.6405996729,
       '2000-01-03': 3.1878605419},
 'C': {'2000-01-01': 2.2778541809,
       '2000-01-02': 3.2529501215,
       '2000-01-03': 1.7209538329},
 'D': {'2000-01-01': -1.2220504344,
       '2000-01-02': 0.3256715133,
       '2000-01-03': -1.2097228282}}


In [5]:
# reading from JSON
# `json_df` is a JSON document held in a str. Passing a literal JSON string
# straight to read_json is deprecated (pandas >= 2.1), so hand it over as a
# file-like object instead. `StringIO` comes from the top import cell.
pd.read_json(StringIO(json_df))

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723


In [6]:
# reading from HTML: the result of read_html is indexed with [0] to keep the
# first table it returns for this page
df_html = pd.read_html(
    io='https://en.wikipedia.org/wiki/International_wheat_production_statistics'
)[0]
df_html.head(n=5)

Unnamed: 0,Country,2017[1],2016[1],2015[2],2014[3],2013[4],2012[5],2011[5],2010[5],2009[5],...,2005 [6],2004[6],2003[6],2002,2001,2000,1999,1998,1997,1996
0,European Union,150.2,142.7,161.4,157.2,143.3,134.5,140.0,136.5,138.5,...,135.4,149.4,111.7,133.6,126.6,132.4,123.1,134.1,126.4,124.3
1,China,134.3,131.7,130.2,126.2,121.7,125.6,117.4,115.2,115.1,...,96.3,91.6,86.5,90.3,93.9,99.7,113.9,109.7,123.3,110.6
2,India,98.5,93.5,86.5,94.5,93.5,94.9,86.9,80.7,80.7,...,72.0,72.1,65.1,72.8,69.7,76.4,70.8,65.9,69.4,62.6
3,Russia,85.9,73.3,61.8,59.7,52.1,37.7,56.2,41.5,61.7,...,47.6,45.4,34.1,50.6,47.0,34.5,31.0,27.0,44.3,34.9
4,United States,47.3,62.9,55.8,55.4,60.0,61.8,54.4,60.1,60.3,...,57.1,58.7,63.8,44.1,53.3,60.8,62.7,69.4,67.5,62.0


In [7]:
# writing to HTML: render a throwaway 2x2 frame of random numbers as an
# HTML <table> string and print the markup
print(
    pd.DataFrame(
        data=np.random.randn(2, 2),
        columns=['A', 'B'],
    ).to_html()
)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>A</th>
      <th>B</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0.234269</td>
      <td>0.121848</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1.184361</td>
      <td>-0.989412</td>
    </tr>
  </tbody>
</table>


# 4. Excel files

In [8]:
# reading from Excel: load sheet 'Sheet1', taking row labels from the first column
df_excel = pd.read_excel(
    '../sample_data/data.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
df_excel.head(n=5)

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


In [9]:
# writing to Excel: store the frame on sheet 'Sheet1' of the workbook at this path
df_excel.to_excel(
    excel_writer='../sample_data/data.xlsx',
    sheet_name='Sheet1',
)

# 5. OpenDocument Spreadsheets

In [10]:
# reading from ODS: same read_excel entry point, with the 'odf' engine selected
df_ods = pd.read_excel(
    '../sample_data/data.ods',
    index_col=0,
    engine='odf',
)
df_ods.head(n=5)

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


# 6. Binary Excel

In [11]:
# reading from Binary Excel (.xlsb) through the 'pyxlsb' engine
# NOTE(review): the recorded output of this cell shows integer row labels
# (36526, 36527, ...) where the CSV/XLSX/ODS cells show dates - presumably
# Excel serial day numbers; confirm, and convert the index if dates are needed.
df_be = pd.read_excel(
    '../sample_data/data.xlsb',
    index_col=0,
    engine='pyxlsb',
)
df_be.head(n=5)

Unnamed: 0,A,B,C,D
36526,-1.662334,1.754375,2.277854,-1.22205
36527,-3.170328,2.6406,3.25295,0.325672
36528,-4.905516,3.187861,1.720954,-1.209723
36529,-5.109194,3.717737,2.365512,0.338022
36530,-5.705884,2.990343,1.962035,0.355969


# 7. Clipboard

Copy this table, 

Y | x1 | x2 | x3
--- |--- | --- | ---
 0.5 | 7.2 | 5.1 | 0.9 
 1.1 | 8.4 | 3.7 | 2.5 
 
 then run the cell below.

In [12]:
# Parse the text currently on the system clipboard into a DataFrame.
# Copy the table from the markdown cell above before running this.
pd.read_clipboard()

Unnamed: 0,Y,x1,x2,x3
0,0.5,7.2,5.1,0.9
1,1.1,8.4,3.7,2.5


# 8. Pickling

All pandas objects are equipped with `to_pickle` methods, which use Python’s `pickle` module to save data structures to disk in the pickle format.

In [13]:
# reading from disk: restore the pickled object and preview its first rows
pd.read_pickle(
    '../sample_data/data.pkl'
).head(n=5)

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


In [14]:
# saving to disk: pickle the current frame to the given path
df.to_pickle(path='../sample_data/data.pkl')

> **Note:** Loading pickled data received from untrusted sources can be unsafe.

## Compressed pickle files
The compression types of `gzip`, `bz2`, `xz` are supported for reading and writing. The `zip` file format only supports reading and must contain only one data file to be read.

In [15]:
# Rebind `df` to a 1000-row demo frame: random floats, a constant string
# column, and timestamps spaced one second apart.
df = pd.DataFrame(
    {
        'A': np.random.randn(1000),
        'B': 'foo',
        'C': pd.date_range('20130101', periods=1000, freq='s'),
    }
)

# no compression argument: the format is inferred from the extension
df.to_pickle("../sample_data/data.pkl.gz")

# 'infer' is the default, so this behaves like the call above
df.to_pickle("../sample_data/data.pkl.xz", compression="infer")

# compression format named explicitly rather than inferred
df.to_pickle("../sample_data/data.pkl.compress", compression="gzip")

In [16]:
# Read the .xz pickle back and preview it; only the path is passed here,
# no compression argument.
pd.read_pickle("../sample_data/data.pkl.xz").head()

Unnamed: 0,A,B,C
0,0.988821,foo,2013-01-01 00:00:00
1,0.39204,foo,2013-01-01 00:00:01
2,0.4536,foo,2013-01-01 00:00:02
3,-0.658292,foo,2013-01-01 00:00:03
4,-0.719088,foo,2013-01-01 00:00:04


# 9. HDF5

`HDFStore` is a dict-like object that reads and writes pandas objects in the high-performance `HDF5` format using the excellent `PyTables` library.

Objects can be written to the file just like adding key-value pairs to a `dict`.

In [17]:
# Open an HDFStore on the sample file and print its summary representation.
store = pd.HDFStore(path='../sample_data/store.h5')
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: ../sample_data/store.h5



In [18]:
# saving to HDF5: item assignment and .put() are two spellings of the same write
store['s'] = df['A']  # == store.put('s', df['A'])
store.put('df2', df[['B', 'C']])

In [19]:
# reading from HDF5: fetch the stored frame by key and keep a short preview
hdf_preview = store['df2'].head()

# `store` is not used again further down, so close it here to release the
# open HDF5 file handle instead of leaving it open for the kernel's lifetime.
# (Re-open the store with the cell above before re-running this cell.)
store.close()

hdf_preview

Unnamed: 0,B,C
0,foo,2013-01-01 00:00:00
1,foo,2013-01-01 00:00:01
2,foo,2013-01-01 00:00:02
3,foo,2013-01-01 00:00:03
4,foo,2013-01-01 00:00:04


# 10. Feather

Feather is designed to faithfully serialize and de-serialize `DataFrames`, supporting all of the pandas dtypes, including extension dtypes such as `categorical` and `datetime` with tz.

In [20]:
# Build a three-row frame with one column per kind of value (text, ints,
# unsigned ints, floats, booleans, categoricals, naive / tz-aware /
# millisecond-spaced timestamps).
df_f = pd.DataFrame(
    {
        'a': ['a', 'b', 'c'],
        'b': [1, 2, 3],
        'c': np.arange(3, 6).astype(np.uint8),
        'd': np.arange(4.0, 7.0, dtype=np.float64),
        'e': [True, False, True],
        'f': pd.Categorical(['a', 'b', 'c']),
        'g': pd.date_range('20130101', periods=3),
        'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
        'i': pd.date_range('20130101', periods=3, freq='ms'),
    }
)
print(df_f.dtypes)
df_f.head(n=5)

a                        object
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[ns]
h    datetime64[ns, US/Eastern]
i                datetime64[ns]
dtype: object


Unnamed: 0,a,b,c,d,e,f,g,h,i
0,a,1,3,4.0,True,a,2013-01-01,2013-01-01 00:00:00-05:00,2013-01-01 00:00:00.000
1,b,2,4,5.0,False,b,2013-01-02,2013-01-02 00:00:00-05:00,2013-01-01 00:00:00.001
2,c,3,5,6.0,True,c,2013-01-03,2013-01-03 00:00:00-05:00,2013-01-01 00:00:00.002


In [21]:
# writing to feather: dump df_f to a Feather file on disk
df_f.to_feather(
    '../sample_data/data.feather'
)

# reading from feather: reload the file and list the column dtypes so they
# can be compared with the dtypes printed for df_f (data types preserved)
pd.read_feather(
    '../sample_data/data.feather'
).dtypes

a                        object
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[ns]
h    datetime64[ns, US/Eastern]
i                datetime64[ns]
dtype: object

# 11. Parquet

Parquet too is designed to faithfully serialize and de-serialize `DataFrame`s, supporting all of the pandas dtypes.

In [22]:
# writing to Parquet
# (the file name keeps its 'paraquet' spelling because the following cell
# reads the same path)
df_f.to_parquet(
    '../sample_data/data.paraquet'
)

# reading from Parquet: reload the file and preview the rows
pd.read_parquet(
    '../sample_data/data.paraquet'
).head(n=5)

Unnamed: 0,a,b,c,d,e,f,g,h,i
0,a,1,3,4.0,True,a,2013-01-01,2013-01-01 00:00:00-05:00,2013-01-01 00:00:00.000
1,b,2,4,5.0,False,b,2013-01-02,2013-01-02 00:00:00-05:00,2013-01-01 00:00:00.001
2,c,3,5,6.0,True,c,2013-01-03,2013-01-03 00:00:00-05:00,2013-01-01 00:00:00.002


In [23]:
# Column dtypes of the frame loaded back from the Parquet file written in
# the previous cell (the path keeps the 'paraquet' spelling used there).
pd.read_parquet('../sample_data/data.paraquet').dtypes

a                        object
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[ns]
h    datetime64[ns, US/Eastern]
i                datetime64[ns]
dtype: object

# 12. SQL

In [24]:
# creating an SQLite database in RAM (`sqlite3` is imported in the
# notebook's top import cell)
conn = sqlite3.connect(':memory:')

# one-column frame of user names to store
users = pd.DataFrame({'name': ['User 1', 'User 2', 'User 3']})

# writing a DataFrame to an SQL database table named 'users'; the frame's
# index is stored alongside the data (see the (0, 'User 1') rows read back)
users.to_sql('users', con=conn)

In [25]:
# reading data from an SQL database table: run the query on the raw
# connection and collect every row the cursor yields into a list
list(conn.execute("SELECT * FROM users"))

[(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]

In [26]:
# creating a DataFrame from an SQL database table, using the stored
# 'index' column as the frame's row labels
pd.read_sql(
    sql="SELECT * FROM users",
    con=conn,
    index_col='index',
)

Unnamed: 0_level_0,name
index,Unnamed: 1_level_1
0,User 1
1,User 2
2,User 3


# 13. STATA

In [27]:
# writing to STATA: export the current frame as a .dta file
df.to_stata(
    '../sample_data/stata.dta'
)

# reading from STATA: load the file back and preview the rows
pd.read_stata(
    '../sample_data/stata.dta'
).head(n=5)

Unnamed: 0,index,A,B,C
0,0,0.988821,foo,2013-01-01 00:00:00
1,1,0.39204,foo,2013-01-01 00:00:01
2,2,0.4536,foo,2013-01-01 00:00:02
3,3,-0.658292,foo,2013-01-01 00:00:03
4,4,-0.719088,foo,2013-01-01 00:00:04


In [28]:
# Specifying a chunksize yields a StataReader instance that can be used as an iterator.
# The reader is opened in a `with` block so the underlying file is closed once
# iteration finishes, and the loop variable is called `chunk` so the loop does
# not overwrite the notebook-level `df` with the last 250-row piece.
with pd.read_stata('../sample_data/stata.dta', chunksize=250) as reader:
    for chunk in reader:
        print(chunk.shape)


(250, 4)
(250, 4)
(250, 4)
(250, 4)


# 14. SAS

Only reading from SAS is supported.

In [29]:
#  df = pd.read_sas('sas_data.sas7bdat')



#  def do_something(chunk):
#      pass
#
#  rdr = pd.read_sas('sas_xport.xpt', chunksize=100000)
#  for chunk in rdr:
#      do_something(chunk)

# 15. SPSS
Only reading from SPSS files is supported.

In [30]:
#  df = pd.read_spss('spss_data.sav')