# [Go to "I/O tools" in Pandas Docs](https://pandas.pydata.org/docs/user_guide/io.html)

In [1]:
import pandas as pd
import numpy as np

# 1. CSV & text files

## 1.1 Reading from CSV

The multipurpose [read_csv][1] function can read in data from `csv`, `tsv` and many other text file formats, with dozens of customizable options.

[1]: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#pandas-read-csv

In [2]:
df = pd.read_csv('../sample_data/data.csv', index_col=0)
df.head()

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


## 1.2 Writing to CSV

In [3]:
print(df[:7].to_csv(index=False))

A,B,C,D
-1.6623337070901472,1.7543746415932018,2.2778541809023,-1.2220504343864012
-3.1703279261274835,2.6405996729048367,3.2529501215137784,0.3256715133000285
-4.905515616900304,3.1878605419487296,1.7209538329181118,-1.209722828232634
-5.1091938002074695,3.717736594124175,2.365512307709186,0.3380217834306869
-5.705883612289831,2.9903432462557484,1.962034673597272,0.35596870564787664
-5.866768004749828,2.342381096273649,2.552775433719344,-2.0546820192594577
-7.567873385486995,3.9830143895917134,2.7706380131859936,-3.684084472249247



In [4]:
# writing to a file
df.to_csv('../sample_data/data.csv')

# 2. JSON

## 2.1. Reading from JSON

In [5]:
df = pd.read_json('../sample_data/data.json')
df.head()

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


## 2.2 Writing to JSON

In [6]:
print(df[:3].to_json(orient='records', indent=2))

[
  {
    "A":-1.6623337071,
    "B":1.7543746416,
    "C":2.2778541809,
    "D":-1.2220504344
  },
  {
    "A":-3.1703279261,
    "B":2.6405996729,
    "C":3.2529501215,
    "D":0.3256715133
  },
  {
    "A":-4.9055156169,
    "B":3.1878605419,
    "C":1.7209538329,
    "D":-1.2097228282
  }
]


In [7]:
# Writing to a file
df.to_json('../sample_data/data.json')

# 3. HTML

## 3.1 Reading from HTML

In [8]:
# reading from HTML
df_html = pd.read_html('https://en.wikipedia.org/wiki/International_wheat_production_statistics')[0]
df_html.head()

Unnamed: 0,Country,2018[1],2017[1],2016[1],2015[2],2014[3],2013[4],2012[5],2011[5],2010[5],...,2005 [6],2004[6],2003[6],2002,2001,2000,1999,1998,1997,1996
0,China,131.4,134.3,131.7,130.2,126.2,121.7,125.6,117.4,115.2,...,96.3,91.6,86.5,90.3,93.9,99.7,113.9,109.7,123.3,110.6
1,India,99.7,98.5,93.5,86.5,94.5,93.5,94.9,86.9,80.7,...,72.0,72.1,65.1,72.8,69.7,76.4,70.8,65.9,69.4,62.6
2,Russia,72.1,85.9,73.3,61.8,59.7,52.1,37.7,56.2,41.5,...,47.6,45.4,34.1,50.6,47.0,34.5,31.0,27.0,44.3,34.9
3,United States,51.3,47.3,62.9,55.8,55.4,60.0,61.8,54.4,60.1,...,57.1,58.7,63.8,44.1,53.3,60.8,62.7,69.4,67.5,62.0
4,France,35.8,36.9,29.5,42.8,39.0,38.6,40.3,38.0,38.2,...,36.9,39.7,30.5,38.9,31.5,37.5,37.2,39.8,33.9,35.9


## 3.2 Writing to HTML

In [9]:
print(pd.DataFrame(np.random.randn(2, 2), columns=['A', 'B']).to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>A</th>
      <th>B</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>2.050395</td>
      <td>1.465852</td>
    </tr>
    <tr>
      <th>1</th>
      <td>0.273766</td>
      <td>-0.688069</td>
    </tr>
  </tbody>
</table>


In [10]:
# Writing to file
df.to_html('../sample_data/data.html')

# 4. Excel files

## 4.1 Reading from Excel

In [11]:
df = pd.read_excel('../sample_data/data.xlsx', sheet_name='Sheet I', index_col=0)
df.head()

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


In [12]:
# Reading in multiple sheets
with pd.ExcelFile('../sample_data/data.xlsx') as xls:
    df1 = pd.read_excel(xls, 'Sheet I')
    df2 = pd.read_excel(xls, 'Sheet II')

In [13]:
## Another way to read multiple sheets
dfx = pd.read_excel('../sample_data/data.xlsx', sheet_name =['Sheet I', 'Sheet II'],
                   index_col=0)
dfx['Sheet I'].head()

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


## 4.2 Writing to Excel

In [14]:
# writing to Excel
df.to_excel('../sample_data/data.xlsx', sheet_name='Sheet1')

In [15]:
# To write to multiple sheets of the same file
with pd.ExcelWriter('../sample_data/data.xlsx') as file:
    df.to_excel(file, sheet_name='Sheet I')
    df['A'].to_excel(file, sheet_name='Sheet II')
    df['C'].to_excel(file, sheet_name='Sheet III')

# 5. OpenDocument Spreadsheets

In [16]:
# Only reading is currently supported
df = pd.read_excel('../sample_data/data.ods', engine='odf', index_col=0)
df.head()

Unnamed: 0,A,B,C,D
2000-01-01,-1.662334,1.754375,2.277854,-1.22205
2000-01-02,-3.170328,2.6406,3.25295,0.325672
2000-01-03,-4.905516,3.187861,1.720954,-1.209723
2000-01-04,-5.109194,3.717737,2.365512,0.338022
2000-01-05,-5.705884,2.990343,1.962035,0.355969


# 6. Binary Excel

In [17]:
# Only reading is currently supported
df = pd.read_excel('../sample_data/data.xlsb', engine='pyxlsb', index_col=0)
df.head()

Unnamed: 0,A,B,C,D
36526,-1.662334,1.754375,2.277854,-1.22205
36527,-3.170328,2.6406,3.25295,0.325672
36528,-4.905516,3.187861,1.720954,-1.209723
36529,-5.109194,3.717737,2.365512,0.338022
36530,-5.705884,2.990343,1.962035,0.355969


# 7. Clipboard

Copy this table, 

Y | x1 | x2 | x3
--- |--- | --- | ---
 0.5 | 7.2 | 5.1 | 0.9 
 1.1 | 8.4 | 3.7 | 2.5 
 
 then run the cell below.

In [18]:
pd.read_clipboard()

Unnamed: 0,Y,x1,x2,x3
0,0.5,7.2,5.1,0.9
1,1.1,8.4,3.7,2.5


In [19]:
df.to_clipboard()  # writes df's contents to clipboard. Can be pasted elsewhere.
pd.read_clipboard().head()

Unnamed: 0,A,B,C,D
36526,-1.662334,1.754375,2.277854,-1.22205
36527,-3.170328,2.6406,3.25295,0.325672
36528,-4.905516,3.187861,1.720954,-1.209723
36529,-5.109194,3.717737,2.365512,0.338022
36530,-5.705884,2.990343,1.962035,0.355969


# 8. Pickling

All pandas objects are equipped with `to_pickle` methods which use Python’s `cPickle` module to save data structures to disk using the `pickle` format.

In [20]:
# reading from disk
pd.read_pickle('../sample_data/data.pkl').head()

Unnamed: 0,A,B,C,D
36526,-1.662334,1.754375,2.277854,-1.22205
36527,-3.170328,2.6406,3.25295,0.325672
36528,-4.905516,3.187861,1.720954,-1.209723
36529,-5.109194,3.717737,2.365512,0.338022
36530,-5.705884,2.990343,1.962035,0.355969


In [21]:
# saving to disk
df.to_pickle('../sample_data/data.pkl')

> **Note:** Loading pickled data received from untrusted sources can be unsafe.

## 8.1 Compressed pickle files
The compression types of `gzip`, `bz2`, `xz` are supported for reading and writing. The `zip` file format only supports reading and must contain only one data file to be read.

In [22]:
df2 = pd.DataFrame({
        'A': np.random.randn(1000),
        'B': 'foo',
        'C': pd.date_range('20130101', periods=1000, freq='s')})
# inferring compression format from extension
df2.to_pickle("../sample_data/data.pkl.gz")

#'infer' is default, as above
df2.to_pickle("../sample_data/data.pkl.xz", compression="infer")

# explicitly setting compression format
df2.to_pickle("../sample_data/data.pkl.compress", compression="gzip") 

In [23]:
pd.read_pickle("../sample_data/data.pkl.xz").head()

Unnamed: 0,A,B,C
0,-1.356762,foo,2013-01-01 00:00:00
1,-0.226914,foo,2013-01-01 00:00:01
2,0.497019,foo,2013-01-01 00:00:02
3,1.721662,foo,2013-01-01 00:00:03
4,0.349897,foo,2013-01-01 00:00:04


# 9. HDF5

`HDFStore` is a dict-like object which reads and writes pandas using the high performance `HDF5` format using the excellent `PyTables` library.

Objects can be written to the file just like adding key-value pairs to a `dict`.

## 9.1 Writing to HDF5

In [24]:
store = pd.HDFStore('../sample_data/store.h5')
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: ../sample_data/store.h5



In [25]:
# Various methods to save data
store.put('A', df2.A)
store['BC'] = df2[['B','C']]
df2.to_hdf(store, 'df2')

store.close()

## 9.2 Reading from HDF5

In [26]:
store = pd.HDFStore('../sample_data/store.h5')

store['df2'].head()

Unnamed: 0,A,B,C
0,-1.356762,foo,2013-01-01 00:00:00
1,-0.226914,foo,2013-01-01 00:00:01
2,0.497019,foo,2013-01-01 00:00:02
3,1.721662,foo,2013-01-01 00:00:03
4,0.349897,foo,2013-01-01 00:00:04


In [27]:
store.df2.head()

Unnamed: 0,A,B,C
0,-1.356762,foo,2013-01-01 00:00:00
1,-0.226914,foo,2013-01-01 00:00:01
2,0.497019,foo,2013-01-01 00:00:02
3,1.721662,foo,2013-01-01 00:00:03
4,0.349897,foo,2013-01-01 00:00:04


In [28]:
store.get('df2').head()

Unnamed: 0,A,B,C
0,-1.356762,foo,2013-01-01 00:00:00
1,-0.226914,foo,2013-01-01 00:00:01
2,0.497019,foo,2013-01-01 00:00:02
3,1.721662,foo,2013-01-01 00:00:03
4,0.349897,foo,2013-01-01 00:00:04


In [29]:
pd.read_hdf(store, 'df2').head()

Unnamed: 0,A,B,C
0,-1.356762,foo,2013-01-01 00:00:00
1,-0.226914,foo,2013-01-01 00:00:01
2,0.497019,foo,2013-01-01 00:00:02
3,1.721662,foo,2013-01-01 00:00:03
4,0.349897,foo,2013-01-01 00:00:04


# 10. Feather

Feather is designed to faithfully serialize and de-serialize `DataFrames`, supporting all of the pandas dtypes, including extension dtypes such as `categorical` and `datetime` with tz.

In [30]:
df3 = pd.DataFrame({'a': list('abc'),
                     'b': list(range(1, 4)),
                     'c': np.arange(3, 6).astype('u1'),
                     'd': np.arange(4.0, 7.0, dtype='float64'),
                     'e': [True, False, True],
                     'f': pd.Categorical(list('abc')),
                     'g': pd.date_range('20130101', periods=3),
                     'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
                     'i': pd.date_range('20130101', periods=3, freq='ms')})
print(df3.dtypes)
df3.head()

a                        object
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[ns]
h    datetime64[ns, US/Eastern]
i                datetime64[ns]
dtype: object


Unnamed: 0,a,b,c,d,e,f,g,h,i
0,a,1,3,4.0,True,a,2013-01-01,2013-01-01 00:00:00-05:00,2013-01-01 00:00:00.000
1,b,2,4,5.0,False,b,2013-01-02,2013-01-02 00:00:00-05:00,2013-01-01 00:00:00.001
2,c,3,5,6.0,True,c,2013-01-03,2013-01-03 00:00:00-05:00,2013-01-01 00:00:00.002


In [31]:
# writing to feather
df3.to_feather('../sample_data/data.feather') 

# reading from feather
pd.read_feather('../sample_data/data.feather').dtypes # data types preserved

a                        object
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[ns]
h    datetime64[ns, US/Eastern]
i                datetime64[ns]
dtype: object

# 11. Paraquet

Parquet too is designed to faithfully serialize and de-serialize `DataFrame`s, supporting all of the pandas dtypes.

In [32]:
# writing to paraquet
df3.to_parquet('../sample_data/data.paraquet')

# reading from paraquet
df3 = pd.read_parquet('../sample_data/data.paraquet')
df3.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i
0,a,1,3,4.0,True,a,2013-01-01,2013-01-01 00:00:00-05:00,2013-01-01 00:00:00.000
1,b,2,4,5.0,False,b,2013-01-02,2013-01-02 00:00:00-05:00,2013-01-01 00:00:00.001
2,c,3,5,6.0,True,c,2013-01-03,2013-01-03 00:00:00-05:00,2013-01-01 00:00:00.002


In [33]:
df3.dtypes

a                        object
b                         int64
c                         uint8
d                       float64
e                          bool
f                      category
g                datetime64[ns]
h    datetime64[ns, US/Eastern]
i                datetime64[ns]
dtype: object

# 12. SQL

## 12.1 Writing to SQL

In [34]:
import sqlite3

# creating an SQLite database in RAM
# conn = sqlite3.connect(':memory:')

conn = sqlite3.connect('../sample_data/data.db')
users = pd.DataFrame({'name' : [f'User {i + i}' for i in range(10)],
                      'email': [f'user{i + 1}@email' for i in range(10)]})

# writing a DataFrame to an SQL database table
users.to_sql('users', con=conn, if_exists='replace')

## 12.2 Reading from SQL

In [35]:
# reading data from an SQLite database table
conn.execute("SELECT name FROM users LIMIT 5").fetchall()

[('User 0',), ('User 2',), ('User 4',), ('User 6',), ('User 8',)]

In [36]:
# creating a DataFrame from an SQL database table
pd.read_sql("SELECT name, email FROM users", con=conn)

Unnamed: 0,name,email
0,User 0,user1@email
1,User 2,user2@email
2,User 4,user3@email
3,User 6,user4@email
4,User 8,user5@email
5,User 10,user6@email
6,User 12,user7@email
7,User 14,user8@email
8,User 16,user9@email
9,User 18,user10@email


# 13. STATA

In [37]:
# writing to STATA
df.to_stata('../sample_data/stata.dta')

# reading from STATA
pd.read_stata('../sample_data/stata.dta').head()

Unnamed: 0,index,A,B,C,D
0,36526,-1.662334,1.754375,2.277854,-1.22205
1,36527,-3.170328,2.6406,3.25295,0.325672
2,36528,-4.905516,3.187861,1.720954,-1.209723
3,36529,-5.109194,3.717737,2.365512,0.338022
4,36530,-5.705884,2.990343,1.962035,0.355969


In [38]:
# Specifying a chunksize yields a StataReader instance that can be used as an iterator.
reader = pd.read_stata('../sample_data/stata.dta', chunksize=250)

for df in reader:
    print(df.shape)

(250, 5)
(250, 5)
(250, 5)
(250, 5)


# 14. SAS

Only reading from SAS is supported.

In [None]:
#  df = pd.read_sas('sas_data.sas7bdat')


#  def do_something(chunk):
#      pass
#
#  rdr = pd.read_sas('sas_xport.xpt', chunk=100000)
#  for chunk in rdr:
#      do_something(chunk)

# 15. SPSS
Only reading from SPSS files is supported.

In [None]:
#  df = pd.read_spss('spss_data.sav')