In [1]:
# You have the following dataset of chocolate bar ratings. You can assume you
# have this data in a table called InternationalChocolateRatings.

# Can you write a SQL query to summarize the BroadBeanOrigin for US manufactured 
# chocolate bars and provide the number of reviews, average rating, and average cocoa percent?


# Column Name in CSV	Column Name for SQL	Column Type	Short description
# Company	Company	string	Name of the company manufacturing the bar
# Specific Bean Origin or Bar Name	SpecificBeanOrigin	string	The specific geo-region of origin for the bar.
# REF value	REFValue	int	Value linked to when the review was entered in the database. Higher = more recent.
# Review Date	ReviewDate	int	Year of publication of the review.
# Cocoa Percent	CocoaPercent	double	Cocoa percentage (darkness) of the chocolate bar being reviewed.
# Company Location	CompanyLocation	string	Manufacturer base country.
# Rating	Rating	int	Expert rating for the bar.
# Bean Type	BeanType	string	The variety (breed) of bean used, if provided.
# Broad Bean Origin	BroadBeanOrigin	string	The broad geo-region of origin for the bean.

In [2]:
import pandas as pd
import numpy as np
import sqlite3

In [3]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)
 
    return conn



In [4]:
def create_table():
    '''Function to create tables
    Args:
        None
    Returns:
        None
    '''

    conn = create_connection('problem17.db')
    cur = conn.cursor()
    
    cur.execute('DROP TABLE IF EXISTS InternationalChocolateRatings')
    
    create_table = '''
    CREATE TABLE IF NOT EXISTS InternationalChocolateRatings(
    Company varchar,
    SpecificBeanOrigin varchar,
    REFValue int,
    ReviewDate int,
    CocoaPercent real,
    CompanyLocation varchar,
    Rating int,
    BeanType varchar,
    BroadBeanOrigin varchar);
    '''
    
    cur.execute(create_table)
    
    conn.commit()
    cur.close()
    conn.close()


In [13]:
def get_data():
    col_names = ['Company','SpecificBeanOrigin','REFValue','ReviewDate','CocoaPercent',
                 'CompanyLocation','Rating','BeanType','BroadBeanOrigin']
    df = pd.read_csv('flavors_of_cacao.csv', skiprows=1, names=col_names)
    df['CocoaPercent'] = df['CocoaPercent'].str[:-1].astype('float64')/100.0
    df['Rating'] = df['Rating'].astype('int64')
    df.head()
    
    return df
    
df = get_data()
# print(df.dtypes)
df['CompanyLocation'].unique()
df.head()

Unnamed: 0,Company,SpecificBeanOrigin,REFValue,ReviewDate,CocoaPercent,CompanyLocation,Rating,BeanType,BroadBeanOrigin
0,A. Morin,Agua Grande,1876,2016,0.63,France,3,,Sao Tome
1,A. Morin,Kpime,1676,2015,0.7,France,2,,Togo
2,A. Morin,Atsane,1676,2015,0.7,France,3,,Togo
3,A. Morin,Akata,1680,2015,0.7,France,3,,Togo
4,A. Morin,Quilla,1704,2015,0.7,France,3,,Peru


In [6]:
#df.columns

In [7]:
def insert_data():
    df = get_data()
    
    insert_query = '''
        INSERT INTO InternationalChocolateRatings(
        'Company', 'SpecificBeanOrigin', 'REFValue', 'ReviewDate',
       'CocoaPercent', 'CompanyLocation', 'Rating', 'BeanType',
       'BroadBeanOrigin') VALUES(?,?,?,?,?,?,?,?,?)
    '''
    
    conn = create_connection('problem17.db')
    cur = conn.cursor()
    
    for row in df.itertuples(index=False):
        #print(row)
        cur.execute(insert_query, row)
    
    conn.commit()
    cur.close()
    conn.close()


In [8]:
def prep_db():
    create_table()
    insert_data()
    
prep_db()


In [None]:
'Company', 'SpecificBeanOrigin', 'REFValue', 'ReviewDate',
       'CocoaPercent', 'CompanyLocation', 'Rating', 'BeanType',
       'BroadBeanOrigin'

In [18]:
#  Can you write a SQL query to summarize the BroadBeanOrigin for US manufactured 
# chocolate bars and provide the number of reviews, average rating, and average cocoa percent?
def summarize_bars():
    conn = create_connection('problem17.db')

    query = '''
    SELECT 
        COUNT(Rating) AS review_count,
        AVG(Rating) AS average_review,
        AVG(CocoaPercent) AS average_cocoa_percent
    FROM
        InternationalChocolateRatings
    WHERE
        CompanyLocation = 'U.S.A.'
    GROUP BY
        Company
    '''
    
    df = pd.read_sql(query, conn)
    conn.close()
    
    return df

    

In [19]:
review_df = summarize_bars()
review_df

Unnamed: 0,review_count,average_review,average_cocoa_percent
0,2,3.000000,0.700000
1,10,2.500000,0.620000
2,9,3.111111,0.700000
3,1,3.000000,0.750000
4,1,3.000000,0.750000
...,...,...,...
170,2,2.000000,0.700000
171,6,3.000000,0.683333
172,2,2.500000,0.655000
173,5,2.800000,0.714000
