In [1]:
import pandas as pd
from sodapy import Socrata

In [2]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cdc.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cdc.gov,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("fcqm-xrf4", limit=2000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

client.close()



In [3]:
Socrata.get?

[0;31mSignature:[0m [0mSocrata[0m[0;34m.[0m[0mget[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mdataset_identifier[0m[0;34m,[0m [0mcontent_type[0m[0;34m=[0m[0;34m'json'[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Read data from the requested resource. Options for content_type are json,
csv, and xml. Optionally, specify a keyword arg to filter results:

    select : the set of columns to be returned, defaults to *
    where : filters the rows to be returned, defaults to limit
    order : specifies the order of results
    group : column to group results on
    limit : max number of results to return, defaults to 1000
    offset : offset, used for paging. Defaults to 0
    q : performs a full text search for a value
    query : full SoQL query string, all as one parameter
    exclude_system_fields : defaults to true. If set to false, the
        response will include system fields (:id, :created_at, and
        :upd

In [4]:
results_df

Unnamed: 0,year,date,statefips,countyfips,ctfips,latitude,longitude,ds_pm_pred,ds_pm_stdd
0,2014,13JAN2014,6,6067,6067007421,38.71559,-121.3348,12.4809,7.1488
1,2014,13JAN2014,29,29189,29189212102,38.71566,-90.27569,5.6266,3.1003
2,2014,13JAN2014,24,24033,24033801308,38.71611,-77.00048,8.9377,5.367
3,2014,13JAN2014,51,51061,51061930205,38.71735,-77.8965,8.0598,5.6433
4,2014,13JAN2014,29,29135,29135385100,38.71752,-92.50101,5.8187,5.0234
...,...,...,...,...,...,...,...,...,...
1995,2014,13JAN2014,29,29095,29095007802,39.02446,-94.53214,4.3534,2.3999
1996,2014,13JAN2014,24,24033,24033807304,39.02504,-76.95937,11.9968,6.5554
1997,2014,13JAN2014,51,51107,51107611019,39.0252,-77.49299,10.8999,6.8784
1998,2014,13JAN2014,51,51107,51107611205,39.0254,-77.37886,11.31,6.8032


In [5]:
results_df.dtypes

year          object
date          object
statefips     object
countyfips    object
ctfips        object
latitude      object
longitude     object
ds_pm_pred    object
ds_pm_stdd    object
dtype: object

In [6]:
import numpy as np


def np_dtypes():
    dtypes = {
        'float': [np.float16, np.float32, np.float64],
        'int': [np.int8, np.int16, np.int32, np.int64],
    }
    
    df = pd.DataFrame()
    
    for k, v in dtypes.items():
        
        for dtype in v:
            if k=='float':
                dtype_info = np.finfo(dtype)

                df_dtype = pd.DataFrame(
                    {
                        'type': k,
                        'bits': dtype_info.bits,
                        'n_decimals': dtype_info.precision,
                        'min': dtype_info.min,
                        'max': dtype_info.max,
                    },
                    index=[k]
                )

            if k=='int':
                dtype_info = np.iinfo(dtype)

                df_dtype = pd.DataFrame(
                    {
                        'type': k,
                        'bits': dtype_info.bits,
                        'n_decimals': 0,
                        'min': dtype_info.min,
                        'max': dtype_info.max,
                    },
                    index=[k]
                )


            if df.shape[0]==0:
                df = df_dtype.copy()
            else:
                df = pd.concat([df, df_dtype])
    
    df = df.reset_index()
    df = df.drop(columns=['index'])
    df = df.set_index(['type', 'bits'])
    
    return df


np_dtypes()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_decimals,min,max
type,bits,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
float,16,3,-65504.0,65504.0
float,32,6,-3.4028230000000003e+38,3.4028230000000003e+38
float,64,15,-1.797693e+308,1.797693e+308
int,8,0,-128.0,127.0
int,16,0,-32768.0,32767.0
int,32,0,-2147484000.0,2147484000.0
int,64,0,-9.223372e+18,9.223372e+18


In [7]:
results_df.head(4)

Unnamed: 0,year,date,statefips,countyfips,ctfips,latitude,longitude,ds_pm_pred,ds_pm_stdd
0,2014,13JAN2014,6,6067,6067007421,38.71559,-121.3348,12.4809,7.1488
1,2014,13JAN2014,29,29189,29189212102,38.71566,-90.27569,5.6266,3.1003
2,2014,13JAN2014,24,24033,24033801308,38.71611,-77.00048,8.9377,5.367
3,2014,13JAN2014,51,51061,51061930205,38.71735,-77.8965,8.0598,5.6433


In [8]:
# s = pd.to_datetime(results_df['date'], '%d%b%Y')
s = results_df['date']
set(s)

{'13JAN2014'}

In [9]:
t = '15NOV2013'
t.title()

'15Nov2013'

In [10]:
pd.to_datetime(t.title(), format='%d%b%Y')

Timestamp('2013-11-15 00:00:00')

In [11]:
pd.to_datetime(t, format='%d%b%Y')

Timestamp('2013-11-15 00:00:00')

In [12]:
df_dtypes = {
    'year': 'int16', # can be dropped
    'date': 'category', 
    'statefips': 'int8', 
    'countyfips': 'int32', 
    'ctfips': 'int64', 
    'latitude': 'float32',
    'longitude': 'float32', 
    'ds_pm_pred': 'float32', 
    'ds_pm_stdd': 'float32',
}

for k, v in df_dtypes.items():
    if k == 'date':
        # continue
        results_df[k] = pd.to_datetime(results_df[k], format='%d%b%Y')
    else:
        results_df[k] = results_df[k].astype(v)

results_df.dtypes

year                   int16
date          datetime64[ns]
statefips               int8
countyfips             int32
ctfips                 int64
latitude             float32
longitude            float32
ds_pm_pred           float32
ds_pm_stdd           float32
dtype: object

In [1]:
client.close()

NameError: name 'client' is not defined