In [123]:
import pandas as pd
import numpy as np
import sys
import csv
import json
from lxml import objectify
import requests

In [2]:
df = pd.read_csv('examples/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
# Using read_tables method.
pd.read_table('examples/ex1.csv',delimiter=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
pd.read_table('examples/ex1.csv')

Unnamed: 0,"a,b,c,d,message"
0,"1,2,3,4,hello"
1,"5,6,7,8,world"
2,"9,10,11,12,foo"


In [5]:
pd.read_csv('examples/ex2.csv',header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
pd.read_csv('examples/ex2.csv',names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Suppose you wanted the message column to be the index of the returned DataFrame.You can either indicate you want the column at index 4 or named 'message' using the index_col argument.

In [7]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col=names[len(names)-1])

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In the event that you want to form a hierarchical index from multiple columns, pass a
list of column numbers or names.

In [8]:
parsed = pd.read_csv('examples/csv_mindex.csv',index_col=['key1', 'key2'])
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [9]:
list(open('examples/ex3.txt'))

["' A B C\\n'\n",
 "'aaa -0.264438 -1.026059 -0.619500\\n'\n",
 "'bbb 0.927272 0.302904 -0.032399\\n'\n",
 "'ccc -0.264273 -0.386314 -0.217601\\n'\n",
 "'ddd -0.871858 -0.348382 1.100491\\n'"]

In [10]:
 #\s - Matches any whitespace character; this is equivalent to the set 
result = pd.read_table('examples/ex3.txt', sep='\s+')
result

Unnamed: 0,',A,B,C\n'
0,'aaa,-0.264438,-1.026059,-0.619500\n'
1,'bbb,0.927272,0.302904,-0.032399\n'
2,'ccc,-0.264273,-0.386314,-0.217601\n'
3,'ddd,-0.871858,-0.348382,1.100491\n'


In [11]:
pd.read_csv('examples/ex4.csv',skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Sentinal values in csv file.

In [12]:
result = pd.read_csv('examples/ex5.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [13]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


### The na_values option can take either a list or set of strings to consider missing values.

In [14]:
result = pd.read_csv('examples/ex5.csv',na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


### Different NA sentinels can be specified for each column in a dict.

In [15]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two','three']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,,9,10,11.0,12,


# Reading Text Files in Pieces

In [16]:
pd.options.display.max_rows = 10

In [17]:
result = pd.read_csv('data-files/data.csv')
result

Unnamed: 0,Student Code,Degree,Student Name,Mid,Quiz 1,Quiz 2,Best of Quizzes,Assignment 1,Assignment 2,Best of Assignments,Total Sessional (50),Final (50),Total (100),Grade
0,022-14-19987,BS(CS),Abdul Basit,28,8.0,3.0,8,7.0,9.0,9,45,25.0,70,B
1,022-14-110233,BS(CS),Adeel Ahmed,17,,5.0,5,8.0,10.0,10,32,18.0,50,F
2,022-14-110585,BS(CS),Afrah Zareen,18,5.0,2.0,5,8.0,10.0,10,33,30.0,63,C
3,022-14-19718,BS(CS),Ahmed Ali Raza,14,7.0,2.0,7,,2.0,2,23,23.0,46,F
4,022-14-110648,BS(CS),Ahsan Ali Vohra,27,7.0,6.0,7,7.0,9.0,9,43,34.0,77,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,022-14-110451,BS(CS),Syed Faizan Uddin,28,9.0,4.0,9,6.0,8.0,8,45,34.0,79,B
44,022-14-110589,BS(CS),Syed Sohaib,25,7.0,5.0,7,9.0,,9,41,22.0,63,C
45,022-14-110400,BS(CS),Syeda Sabika Raza,27,9.0,6.0,9,9.0,,9,45,35.0,80,A
46,022-14-19911,BS(CS),Usman Khan,25,8.0,5.0,8,8.0,10.0,10,43,22.0,65,C


In [18]:
# Loading dataset.
babies_data = pd.read_csv("data-files/yob2010.csv",names=['Name','Gender','Reg No'])
babies_data

Unnamed: 0,Name,Gender,Reg No
0,Isabella,F,22731
1,Sophia,F,20477
2,Emma,F,17179
3,Olivia,F,16860
4,Ava,F,15300
...,...,...,...
33833,Zymaire,M,5
33834,Zyonne,M,5
33835,Zyquarius,M,5
33836,Zyran,M,5


In [19]:
# Reading a file in pieces.
babies_data_chunk = pd.read_csv("data-files/yob2010.csv",names=['Name','Gender','Reg No'],chunksize=1000)
babies_data_chunk

<pandas.io.parsers.TextFileReader at 0x115bd52d0>

### Working with chunks.

In [20]:
tot = pd.Series([])
for piece in babies_data_chunk:
#     print(piece['Name'].value_counts())
    tot = tot.add(piece['Name'].value_counts(), fill_value=0)
tot.sort_values(ascending=False)    

Michele      2.0
Amaziah      2.0
Amaurie      2.0
Aven         2.0
Gabryel      2.0
            ... 
Mariea       1.0
Mariel       1.0
Mariela      1.0
Marielena    1.0
Aaban        1.0
Length: 31432, dtype: float64

In [21]:
tot[:10]

Aaban       1.0
Aadam       1.0
Aadan       1.0
Aaden       1.0
Aadhav      1.0
Aadhavan    1.0
Aadhya      1.0
Aadi        1.0
Aadil       1.0
Aadin       1.0
dtype: float64

In [22]:
'''TextParser is also equipped with a get_chunk method that enables you to read
pieces of an arbitrary size.'''
# babies_data_chunk.get_chunk(size=3)

'TextParser is also equipped with a get_chunk method that enables you to read\npieces of an arbitrary size.'

# Writing Data to Text Format

In [23]:
data = pd.read_csv('examples/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [24]:
data.isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


#### Using DataFrame’s to_csv method, we can write the data out to a comma-separated file.

In [25]:
data.to_csv('examples/out.csv')
# Checking generated csv file.
out_csv = pd.read_csv('examples/out.csv')
out_csv

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,0,one,1,2,3.0,4,
1,1,two,5,6,,8,world
2,2,three,9,10,11.0,12,foo


In [26]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [27]:
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [28]:
#With no other options specified, both the row and column labels are written. Both of these can be disabled.
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [29]:
#You can also write only a subset of the columns, and in an order of your choosing.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [30]:
### Series also have a to_csv method.
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
# print(dates)
ts.to_csv('examples/tseries.csv')

  """


# Working with Delimited Formats

In [31]:
f = open('examples/ex7.csv')
reader = csv.reader(f)
reader

<_csv.reader at 0x115aefdd0>

In [32]:
#Iterating over reader csv.
for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [33]:
with open('examples/ex7.csv') as f:
     lines = list(csv.reader(f))
print(lines)        

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3']]


In [34]:
header, values = lines[0], lines[1:]
print(header)
print(values)

['a', 'b', 'c']
[['1', '2', '3'], ['1', '2', '3']]


In [35]:
# Performing dict comprehension.
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [36]:
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
reader = csv.reader(data_dict, dialect=my_dialect)
reader

<_csv.reader at 0x115badad0>

In [37]:
with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

# JSON Data

In [38]:
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
{"name": "Katie", "age": 38,
"pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [42]:
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [43]:
type(result)

dict

In [44]:
# Converting back to JSON.
asjson = json.dumps(result)
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [45]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [46]:
siblings_v2 = pd.DataFrame(result['siblings'], columns=['name', 'age', 'pets'])
siblings_v2

Unnamed: 0,name,age,pets
0,Scott,30,"[Zeus, Zuko]"
1,Katie,38,"[Sixes, Stache, Cisco]"


### Reading from JSON File.

In [49]:
data = pd.read_json('examples/example.json') #List of dictionaries was passed.
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
data.to_json

In [50]:
print(data.to_json())

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [53]:
print(data.to_json(orient='records'))

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


In [54]:
print(data.to_json(orient='columns'))

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [55]:
print(data.to_json(orient='index'))

{"0":{"a":1,"b":2,"c":3},"1":{"a":4,"b":5,"c":6},"2":{"a":7,"b":8,"c":9}}


In [56]:
print(data.to_json(orient='values'))

[[1,2,3],[4,5,6],[7,8,9]]


In [57]:
print(data.to_json(orient='table'))

{"schema": {"fields":[{"name":"index","type":"integer"},{"name":"a","type":"integer"},{"name":"b","type":"integer"},{"name":"c","type":"integer"}],"primaryKey":["index"],"pandas_version":"0.20.0"}, "data": [{"index":0,"a":1,"b":2,"c":3},{"index":1,"a":4,"b":5,"c":6},{"index":2,"a":7,"b":8,"c":9}]}


# XML and HTML: Web Scraping

In [58]:
table = pd.read_html('examples/fdic_failed_bank_list.html')
table

[                             Bank Name             City  ST   CERT  \
 0                          Allied Bank         Mulberry  AR     91   
 1         The Woodbury Banking Company         Woodbury  GA  11297   
 2               First CornerStone Bank  King of Prussia  PA  35312   
 3                   Trust Company Bank          Memphis  TN   9956   
 4           North Milwaukee State Bank        Milwaukee  WI  20364   
 ..                                 ...              ...  ..    ...   
 542                 Superior Bank, FSB         Hinsdale  IL  32646   
 543                Malta National Bank            Malta  OH   6629   
 544    First Alliance Bank & Trust Co.       Manchester  NH  34264   
 545  National State Bank of Metropolis       Metropolis  IL   3815   
 546                   Bank of Honolulu         Honolulu  HI  21029   
 
                    Acquiring Institution        Closing Date  \
 0                           Today's Bank  September 23, 2016   
 1              

In [59]:
# Returns
# -------
# dfs : list of DataFrames
len(table)

1

In [60]:
failures = table[0]
failures

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"
...,...,...,...,...,...,...,...
542,"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB","July 27, 2001","August 19, 2014"
543,Malta National Bank,Malta,OH,6629,North Valley Bank,"May 3, 2001","November 18, 2002"
544,First Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,"February 2, 2001","February 18, 2003"
545,National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,"December 14, 2000","March 17, 2005"


In [61]:
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [62]:
close_timestamps = pd.to_datetime(failures['Closing Date'])

In [63]:
close_timestamps

0     2016-09-23
1     2016-08-19
2     2016-05-06
3     2016-04-29
4     2016-03-11
         ...    
542   2001-07-27
543   2001-05-03
544   2001-02-02
545   2000-12-14
546   2000-10-13
Name: Closing Date, Length: 547, dtype: datetime64[ns]

### Parsing XML with lxml.objectify

In [76]:
path = 'examples/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
root

<Element INDICATOR at 0x118495dc0>

In [74]:
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ','DESIRED_CHANGE', 'DECIMAL_PLACES']

In [77]:
for elt in root:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
        data.append(el_data)

In [79]:
df_xml = pd.DataFrame(data)
df_xml

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
1,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
2,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
3,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
4,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
7,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
8,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
9,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
10,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,


In [80]:
df_xml.head()

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
1,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
2,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
3,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,
4,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,12,Service Indicators,M,%,97.0,,97.0,


XML data can get much more complicated than this example. Each tag can have
metadata, too. Consider an HTML link tag, which is also valid XML

In [84]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

In [85]:
root.text

'Google'

In [86]:
root.get('href')

'http://www.google.com'

In [87]:
root

<Element a at 0x1187397d0>

# Binary Data Formats

In [88]:
frame = pd.read_csv('examples/ex1.csv')
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [90]:
frame.to_pickle('examples/to_pickle_dest/frame_pickle')

In [91]:
# Returns
# -------
# unpickled : same type as object stored in file
pd.read_pickle('examples/to_pickle_dest/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## Using HDF5 Format

In [92]:
frame = pd.DataFrame({'a': np.random.randn(100)})
frame

Unnamed: 0,a
0,1.394936
1,0.242719
2,-1.912851
3,-1.156873
4,-0.281443
...,...
95,-1.554981
96,-0.023633
97,-1.222536
98,-1.201213


In [109]:
store = pd.HDFStore('mydata.h5')
type(store)

pandas.io.pytables.HDFStore

In [94]:
store['obj1'] = frame
store['obj1_col'] = frame['a']

In [95]:
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

### Objects contained in the HDF5 file can then be retrieved with the same dict-like API.

In [96]:
store['obj1'] #Getting DataFrame.

Unnamed: 0,a
0,1.394936
1,0.242719
2,-1.912851
3,-1.156873
4,-0.281443
...,...
95,-1.554981
96,-0.023633
97,-1.222536
98,-1.201213


In [97]:
store['obj1_col'] #Getting Series.

0     1.394936
1     0.242719
2    -1.912851
3    -1.156873
4    -0.281443
        ...   
95   -1.554981
96   -0.023633
97   -1.222536
98   -1.201213
99    0.868085
Name: a, Length: 100, dtype: float64

In [98]:
store.put('obj2', frame, format='table')

In [99]:
frame2 = pd.DataFrame({'a': np.random.randn(10)})
store.put('obj3',frame2, format='fixed')

In [100]:
store['obj3']

Unnamed: 0,a
0,-0.726839
1,0.602447
2,-0.108572
3,-0.846637
4,0.50158
5,-0.81332
6,0.294383
7,-1.06136
8,1.517113
9,0.697157


In [101]:
store.get('obj2')

Unnamed: 0,a
0,1.394936
1,0.242719
2,-1.912851
3,-1.156873
4,-0.281443
...,...
95,-1.554981
96,-0.023633
97,-1.222536
98,-1.201213


In [102]:
store.select('obj2', where=['index >= 10 and index <= 15'])

Unnamed: 0,a
10,-0.964477
11,0.959474
12,-0.121422
13,0.422363
14,0.143847
15,-0.676105


In [104]:
'''
TypeError: cannot pass a where specification when reading from a Fixed format store. 
this store must be selected in its entirety
'''
# store.select('obj3', where=['index >= 10 and index <= 15'])

'\nTypeError: cannot pass a where specification when reading from a Fixed format store. \nthis store must be selected in its entirety\n'

In [105]:
store.close()

The put is an explicit version of the store['obj2'] = frame method but allows us to
set other options like the storage format.
The pandas.read_hdf function gives you a shortcut to these tools.

In [107]:
frame.to_hdf('mydata.h5', 'obj4', format='table')

In [108]:
pd.read_hdf('mydata.h5', 'obj4', where=['index < 5'])

Unnamed: 0,a
0,1.394936
1,0.242719
2,-1.912851
3,-1.156873
4,-0.281443


## Reading Microsoft Excel Files

In [111]:
# xlsx = pd.ExcelFile('examples/ex1.xlsx')
# xlsx

<pandas.io.excel._base.ExcelFile at 0x1161ceb50>

In [115]:
pd.read_excel(xlsx,'Sheet1')

Unnamed: 0,PIAIC AIC ONSITE & ONLINE TEST SCHEDULE,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,,,,,,
1,Roll #,Type,Day,Date,Time,Location,Quiz Description
2,AIC000054,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
3,AIC000088,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
4,AIC000409,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
...,...,...,...,...,...,...,...
3336,AIC008147,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3337,AIC005193,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3338,AIC007356,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3339,AIC010653,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)


In [116]:
# Just using read_excel method.
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame

Unnamed: 0,PIAIC AIC ONSITE & ONLINE TEST SCHEDULE,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,,,,,,
1,Roll #,Type,Day,Date,Time,Location,Quiz Description
2,AIC000054,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
3,AIC000088,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
4,AIC000409,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
...,...,...,...,...,...,...,...
3336,AIC008147,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3337,AIC005193,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3338,AIC007356,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3339,AIC010653,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)


To write pandas data to Excel format, you must first create an ExcelWriter, then
write data to it using pandas objects’ to_excel method.

In [119]:
writer = pd.ExcelWriter('examples/ex2.xlsx') 
frame.to_excel(writer,'Sheet1')
writer.save()

In [120]:
pd.read_excel('examples/ex2.xlsx')

Unnamed: 0.1,Unnamed: 0,PIAIC AIC ONSITE & ONLINE TEST SCHEDULE,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,0,,,,,,,
1,1,Roll #,Type,Day,Date,Time,Location,Quiz Description
2,2,AIC000054,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
3,3,AIC000088,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
4,4,AIC000409,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
...,...,...,...,...,...,...,...,...
3336,3336,AIC008147,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3337,3337,AIC005193,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3338,3338,AIC007356,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3339,3339,AIC010653,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)


In [121]:
# Writing excel just using file path.
frame.to_excel('examples/ex2.xlsx')

In [122]:
pd.read_excel('examples/ex2.xlsx')

Unnamed: 0.1,Unnamed: 0,PIAIC AIC ONSITE & ONLINE TEST SCHEDULE,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,0,,,,,,,
1,1,Roll #,Type,Day,Date,Time,Location,Quiz Description
2,2,AIC000054,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
3,3,AIC000088,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
4,4,AIC000409,Onsite,Monday,30th Dec,4pm - 5pm,Saylani Gulshan,Pandas Test (AI Q2 Quiz 2)
...,...,...,...,...,...,...,...,...
3336,3336,AIC008147,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3337,3337,AIC005193,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3338,3338,AIC007356,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)
3339,3339,AIC010653,Online,Friday,3rd Jan,8pm - 9pm,Saylani Bahadurabad,Pandas Test (AI Q2 Quiz 2)


# Interacting with Web APIs

In [126]:
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp

<Response [200]>

In [127]:
data = resp.json()
data

[{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/30587',
  'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
  'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/30587/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/30587/comments',
  'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/30587/events',
  'html_url': 'https://github.com/pandas-dev/pandas/pull/30587',
  'id': 544240527,
  'node_id': 'MDExOlB1bGxSZXF1ZXN0MzU4Mzk4OTAz',
  'number': 30587,
  'title': 'REF: share code between DatetimeIndex and TimedeltaIndex',
  'user': {'login': 'jbrockmendel',
   'id': 8078968,
   'node_id': 'MDQ6VXNlcjgwNzg5Njg=',
   'avatar_url': 'https://avatars1.githubusercontent.com/u/8078968?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/jbrockmendel',
   'html_url': 'https://github.com/jbrockmendel',
   'followers_url': 'https://api.github.com/users/jbrockmendel/followers',

In [128]:
issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
issues

Unnamed: 0,number,title,labels,state
0,30587,REF: share code between DatetimeIndex and Time...,[],open
1,30586,REF: separate casting out of Index.__new__,[],open
2,30585,BUG: Disable parallel cythonize on Windows (GH...,"[{'id': 57186974, 'node_id': 'MDU6TGFiZWw1NzE4...",open
3,30584,ENH: Add dropna in groupby to allow NaN in keys,[],open
4,30583,WIP: Restructuring all builds (NOT TO MERGE),"[{'id': 48070600, 'node_id': 'MDU6TGFiZWw0ODA3...",open
...,...,...,...,...
25,30533,Inconsistent index in result of groupby apply,[],open
26,30531,Performance of maybe_box_datetimelike #30520,"[{'id': 8935311, 'node_id': 'MDU6TGFiZWw4OTM1M...",open
27,30526,BUG: pct_change wrong result when there are du...,[],open
28,30520,Performance issue with pandas/core/common.py -...,"[{'id': 8935311, 'node_id': 'MDU6TGFiZWw4OTM1M...",open


# Interacting with Databases

In [129]:
import sqlite3

In [130]:
query = ''' 
CREATE TABLE test(a VARCHAR(20),b VARCHAR(20),c REAL, d INTEGER);
'''

In [131]:
#Now connection creation.
con = sqlite3.connect('mydata.sqlite')

In [132]:
con.execute(query)

<sqlite3.Cursor at 0x11a776490>

In [133]:
con.commit()

### Insert a few rows of data

In [135]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
insert_statement = '''INSERT INTO test VALUES(?, ?, ?, ?)'''
con.executemany(insert_statement, data)

<sqlite3.Cursor at 0x11a935b20>

In [136]:
con.commit()

In [137]:
# Selecting rows from database.
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [139]:
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [140]:
import sqlalchemy as sqla 

In [144]:
db = sqla.create_engine('sqlite:///mydata.sqlite')

In [145]:
pd.read_sql('select * from test', db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [None]:
db.execute