#### Import Python packages 

In [1]:
# Import Python packages 
import pandas as pd
import cassandra

In [2]:
from etl import preprocess

In [3]:
# Import Queries
from nosql_queries import session_item_create, user_session_create, song_user_create
from nosql_queries import session_item_insert, user_session_insert, song_user_insert
from nosql_queries import session_item_select, user_session_select, song_user_select
from nosql_queries import drop_table_queries

In [4]:
from utils import create_cluster_keyspace, execute_query, insert_from_df, result_as_df

### Run ETL Pipeline for Pre-Processing the Files

In [5]:
# Run the project's preprocessing step (imported from `etl`) and keep its result
# in `df`; the INSERT cells further down pass this object to insert_from_df.
# NOTE(review): presumably a pandas DataFrame (it exposes .head()) — confirm in etl.
df = preprocess()
# Preview the first rows as a quick sanity check of the available columns.
df.head()

/Users/keneudeh/Documents/Projects/udacity-data-engineer-nanodegree/0-Data-Modeling/Projects/sparkify-data-etl-cassandra
Num lines: 6821


Unnamed: 0,artist,firstName,gender,itemInSession,lastName,length,level,location,sessionId,song,userId
0,Harmonia,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",583,Sehr kosmisch,26
1,The Prodigy,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",583,The Big Gundown,26
2,Train,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",583,Marry Me,26
3,Sony Wonder,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",597,Blackbird,61
4,Van Halen,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",602,Best Of Both Worlds (Remastered Album Version),80


### Set up DB

In [6]:
# Obtain the `cluster` and `session` handles that every query cell below uses;
# both are shut down in the final cell of the notebook.
# NOTE(review): presumably this also creates/selects the keyspace, judging by the
# helper's name — confirm in utils.
cluster, session = create_cluster_keyspace()

### Create Tables to answer queries

#### 1. Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338 and itemInSession = 4

In [7]:
# CREATE the table that serves query 1 (sessionId = 338, itemInSession = 4).
# The trailing semicolon suppresses the ResultSet repr: it carries a memory
# address that changes on every run and adds nothing to the narrative.
execute_query(session, session_item_create);

In [8]:
# Columns of `df` handed to insert_from_df for the query-1 table.
session_item_columns = ['sessionId', 'itemInSession', 'artist', 'song', 'length']

# INSERT the selected columns using the query-1 insert statement.
insert_from_df(session, df, session_item_columns, session_item_insert)

In [9]:
# Run the query-1 SELECT to confirm the rows were inserted, then display the
# result as a table with readable column labels.
session_item_rows = execute_query(session, session_item_select)
result_as_df(
    session_item_rows,
    columns=['artist_name', 'song_title', 'song_length'],
)

Unnamed: 0,artist_name,song_title,song_length
0,Faithless,Music Matters (Mark Knight Dub),495.3073


#### 2. Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userId = 10, sessionId = 182

In [10]:
# CREATE the table that serves query 2 (userId = 10, sessionId = 182).
# The trailing semicolon suppresses the ResultSet repr: it carries a memory
# address that changes on every run and adds nothing to the narrative.
execute_query(session, user_session_create);

In [11]:
# Columns of `df` handed to insert_from_df for the query-2 table.
user_session_columns = [
    'userId', 'sessionId', 'itemInSession',
    'artist', 'firstName', 'lastName', 'song',
]

# INSERT the selected columns using the query-2 insert statement.
insert_from_df(session, df, user_session_columns, user_session_insert)

Run a SELECT to verify that the data has been inserted into the table

In [12]:
# Run the query-2 SELECT to confirm the rows were inserted, then display the
# result as a table with readable column labels.
user_session_rows = execute_query(session, user_session_select)
result_as_df(
    user_session_rows,
    columns=['artist_name', 'song_title', 'first_name', 'last_name'],
)


Unnamed: 0,artist_name,song_title,first_name,last_name
0,Down To The Bone,Keep On Keepin' On,Sylvie,Cruz
1,Three Drives,Greece 2000,Sylvie,Cruz
2,Sebastien Tellier,Kilometer,Sylvie,Cruz
3,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...,Sylvie,Cruz


#### 3. Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'

In [13]:
# CREATE the table that serves query 3 (listeners of 'All Hands Against His Own').
# The trailing semicolon suppresses the ResultSet repr: it carries a memory
# address that changes on every run and adds nothing to the narrative.
execute_query(session, song_user_create);

In [14]:
# Columns of `df` handed to insert_from_df for the query-3 table.
song_user_columns = ['song', 'userId', 'firstName', 'lastName']

# INSERT the selected columns using the query-3 insert statement.
insert_from_df(session, df, song_user_columns, song_user_insert)

Run a SELECT to verify that the data has been inserted into the table

In [15]:
# Run the query-3 SELECT to confirm the rows were inserted, then display the
# result as a table with readable column labels.
song_user_rows = execute_query(session, song_user_select)
result_as_df(
    song_user_rows,
    columns=['song_title', 'first_name', 'last_name'],
)

Unnamed: 0,song_title,first_name,last_name
0,All Hands Against His Own,Jacqueline,Lynch
1,All Hands Against His Own,Tegan,Levine
2,All Hands Against His Own,Sara,Johnson


### Drop the tables before closing out the sessions

In [16]:
# Execute each DROP statement from drop_table_queries in turn, on the open session.
for drop_query in drop_table_queries:
    execute_query(session, drop_query)

### Close the session and cluster connection

In [17]:
# Release the database connection: shut down the session first, then the cluster.
# No query cell can run after this point without re-creating both handles.
session.shutdown()
cluster.shutdown()