### Augmenting Pandas With SQLite
https://app.dataquest.io/m/166/augmenting-pandas-with-sqlite

In [None]:
import sqlite3
import pandas as pd
import numpy as np
conn = sqlite3.connect('Raw_data\moma.db')
moma_iter = pd.read_csv('Raw_data\moma.csv', chunksize=1000)
for chunk in moma_iter:
    chunk.to_sql("exhibitions", conn, if_exists='append', index=False)

Type	Description
NULL	The value is a NULL value
INTEGER	The value is a signed integer, stored in 1, 2, 3, 4, 6, or 8 bytes, depending on the magnitude of the value
REAL	The value is a floating point value, stored as an 8-byte IEEE floating point number
TEXT	The value is a text string, stored using the database encoding (UTF-8, UTF-16BE or UTF-16LE)
BLOB	The value is a blob of data, stored exactly as it was entered

In [None]:
# query moma.db and return the column types for the exhibitions table
results_df = pd.read_sql('PRAGMA table_info(exhibitions);', conn)
print(results_df.head())

In [None]:
results_df.info()

In [None]:
moma_iter = pd.read_csv('Raw_data\moma.csv', chunksize=1000)
for chunk in moma_iter:
    chunk['ExhibitionSortOrder'] = chunk['ExhibitionSortOrder'].astype('int16')
    chunk.to_sql("exhibitions", conn, if_exists='append', index=False)
results_df = pd.read_sql('PRAGMA table_info(exhibitions);', conn)
print(results_df.head())

In [None]:
# Query the exhibitions table in moma.db to return both the unique values in the ExhibitionID column and the counts
q = 'select exhibitionid, count(*) as counts from exhibitions group by exhibitionid order by counts desc;'
eid_counts = pd.read_sql(q, conn)
print(eid_counts.head())

In [None]:
# From the exhibitions table in moma.db, return the ExhibitionID column as a dataframe
q = 'select exhibitionid from exhibitions;'
eid_df = pd.read_sql(q, conn)
eid_pandas_counts = eid_df['ExhibitionID'].value_counts()
print(eid_pandas_counts.head())

In [None]:
# Read results in as dataframe chunks and then batch process the chunks
import collections as coll
q = 'select exhibitionid from exhibitions;'
chunk_iter = pd.read_sql(q, conn, chunksize=100)
dict = {} 
for chunk in chunk_iter:
    c = chunk['ExhibitionID'].value_counts()
    for inx, value in c.iteritems():
        if inx in dict:
            dict[inx] += value
        else:
            dict[inx] = value
s = coll.Counter(dict)
s.most_common(5)

### Guided Project: Analyzing Startup Fundraising Deals from Crunchbase
https://app.dataquest.io/m/167/guided-project%3A-analyzing-startup-fundraising-deals-from-crunchbase

In [1]:
import pandas as pd
pd.options.display.max_columns = 99
chunk_iter = pd.read_csv('Raw_data\crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

In [2]:
total_mem = 0
for chunk in chunk_iter:
    mem = chunk.memory_usage(deep=True).sum()/(1024*1024)
    print(mem)
    total_mem += mem
print('total_mem:', total_mem)

5.579195022583008
5.528186798095703
5.535004615783691
5.528155326843262
5.524299621582031
5.553397178649902
5.531391143798828
5.509613037109375
5.396082878112793
4.63945198059082
2.663668632507324
total_mem: 56.98844623565674


In [9]:
chunk_iter = pd.read_csv('Raw_data\crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
ls = []
for chunk in chunk_iter:
    c = chunk.isnull().sum()
    ls.append(c)
cc = pd.concat(ls)
unique_cc = cc.groupby(cc.index).sum()
unique_cc.sort_values()
print(unique_cc)

company_category_code       643
company_city                533
company_country_code          1
company_name                  1
company_permalink             1
company_region                1
company_state_code          492
funded_at                     3
funded_month                  3
funded_quarter                3
funded_year                   3
funding_round_type            3
investor_category_code    50427
investor_city             12480
investor_country_code     12001
investor_name                 2
investor_permalink            2
investor_region               2
investor_state_code       16809
raised_amount_usd          3599
dtype: int64


In [5]:
%history -p

>>> l=[1,2,3]
... for i in l:
...     i=i+2
...
>>> l=[1,2,3]
... for i in l:
...     i=i+2
... l
...
>>> l=[1,2,3]
... for i in l:
...     print(i+2)
...
>>> original_prices = [1.25, -9.45, 10.22, 3.78, -5.92, 1.16]
... prices = [i if i > 0 else 0 for i in original_prices]
... prices
...
>>> %history -p


### Querying SQLite from Python
https://app.dataquest.io/m/462/querying-sqlite-from-python/6/creating-a-cursor-and-running-a-query

In [10]:
import sqlite3
conn = sqlite3.connect("Raw_data\jobs.db")
cursor = conn.cursor()

query = "select Major from recent_grads;"
cursor.execute(query)
majors = cursor.fetchall()
print(majors[0:5])

[('PETROLEUM ENGINEERING',), ('MINING AND MINERAL ENGINEERING',), ('METALLURGICAL ENGINEERING',), ('NAVAL ARCHITECTURE AND MARINE ENGINEERING',), ('CHEMICAL ENGINEERING',)]


In [4]:
query = "select Major from recent_grads;"
majors = conn.execute(query).fetchall()
print(majors[0:5])

[('PETROLEUM ENGINEERING',), ('MINING AND MINERAL ENGINEERING',), ('METALLURGICAL ENGINEERING',), ('NAVAL ARCHITECTURE AND MARINE ENGINEERING',), ('CHEMICAL ENGINEERING',)]


In [7]:
query = "select Major,Major_category from recent_grads;"
cursor.execute(query)
cursor.fetchmany(5)

[('PETROLEUM ENGINEERING', 'Engineering'),
 ('MINING AND MINERAL ENGINEERING', 'Engineering'),
 ('METALLURGICAL ENGINEERING', 'Engineering'),
 ('NAVAL ARCHITECTURE AND MARINE ENGINEERING', 'Engineering'),
 ('CHEMICAL ENGINEERING', 'Engineering')]

In [12]:
query = "select Major from recent_grads order by Major desc;"
reverse_alphabetical = conn.cursor().execute(query).fetchall()
reverse_alphabetical[0:5]

[('ZOOLOGY',),
 ('VISUAL AND PERFORMING ARTS',),
 ('UNITED STATES HISTORY',),
 ('TREATMENT THERAPY PROFESSIONS',),
 ('TRANSPORTATION SCIENCES AND TECHNOLOGIES',)]

In [None]:
SELECT c.*, f.name country_name FROM facts f
INNER JOIN cities c ON c.facts_id = f.id
LIMIT 5;

In [None]:
SELECT f.name country, c.name capital_city FROM cities c
INNER JOIN facts f ON f.id = c.facts_id
WHERE c.capital = 1

In [None]:
# Using a join and a subquery, write a query that returns capital cities with populations of over 10 million ordered from largest to smallest
SELECT c.name capital_city, f.name country, c.population population
FROM facts f
INNER JOIN (
            SELECT * FROM cities
            WHERE capital = 1
            AND population > 10000000
           ) c ON c.facts_id = f.id
ORDER BY 3 DESC;