# Part I. ETL Pipeline for Pre-Processing the Files

## Import Python packages 

In [None]:
import pandas as pd
import cassandra
import re
import os
import glob
import numpy as np
import json
import csv
from src.database import (
    get_cs_cluster,
    get_cs_session,
    close_cs_session,
    shutdown_cs_cluster,
    insert_cs_rows,
    create_cs_keyspace,
    create_cs_table,
    drop_cs_table,
    drop_cs_keyspace,
    set_cs_keyspace,
)

## Creating list of filepaths to process original event csv data files

In [None]:
# Locate the raw event files in ./event_data, relative to the current
# working directory.
data_dir = os.path.join(os.getcwd(), "event_data")
# glob returns matches in arbitrary, OS-dependent order and also matches
# sub-directories. Keep regular files only (the next cell open()s every entry)
# and sort them so each run processes the same files in the same order.
data_files = sorted(
    path for path in glob.glob(os.path.join(data_dir, "*")) if os.path.isfile(path)
)
print(f"Number of data files: {len(data_files)}")

## Processing the files

In [None]:
# Read every event file and gather all of its data rows (header excluded)
# into a single in-memory list of lists of strings.
full_data_rows_list = []
for f in data_files:
    # newline="" lets the csv module do its own line-ending handling, so
    # quoted fields containing line breaks are parsed correctly.
    with open(f, "r", encoding="utf8", newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header. The None default keeps a zero-byte file from
        # raising StopIteration and aborting the whole run.
        next(csvreader, None)
        full_data_rows_list.extend(csvreader)

# __sizeof__ is shallow: it reports the list object itself, not the row
# lists or the strings they hold.
print(
    f"Full data: \t#rows = {len(full_data_rows_list)}, "
    f"\tsize: {full_data_rows_list.__sizeof__() / 1000:.2f} kB (size of list in cache)"
)

# Write the columns needed by the queries to one smaller CSV file.
# Every field is quoted (QUOTE_ALL) so commas inside values stay unambiguous.
csv.register_dialect("myDialect", quoting=csv.QUOTE_ALL, skipinitialspace=True)
datafile_path = "event_datafile_new.csv"
# Positions of the kept columns in a raw event row, in output order; they
# correspond one-to-one with the header names written below.
kept_columns = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)
with open(datafile_path, "w", encoding="utf8", newline="") as f:
    writer = csv.writer(f, dialect="myDialect")
    writer.writerow(
        [
            "artist",
            "firstName",
            "gender",
            "itemInSession",
            "lastName",
            "length",
            "level",
            "location",
            "sessionId",
            "song",
            "userId",
        ]
    )
    for row in full_data_rows_list:
        # csv.reader yields [] for a blank line, so check for that before
        # indexing. Rows whose first column (artist) is empty are skipped too.
        if not row or row[0] == "":
            continue
        writer.writerow([row[i] for i in kept_columns])

# Sanity check: report the line count and on-disk size of the new file.
# The count is of physical lines, so it includes the header row.
with open(datafile_path, "r", encoding="utf8") as f:
    print(
        f"Filtered data: \t#rows -> {sum(1 for line in f)}, "
        f"\tsize: {os.path.getsize(datafile_path) / 1000:.2f} kB (size of file on disk)\n"
    )

# Part II. Complete the Apache Cassandra coding portion of your project. 

Now you are ready to work with the CSV file titled <font color=red>event_datafile_new.csv</font>, located within the Workspace directory.  The event_datafile_new.csv contains the following columns: 
- artist 
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (paid or free song)
- location of the user
- sessionId
- song title
- userId

The image below is a screenshot of what the denormalized data should look like in <font color=red>**event_datafile_new.csv**</font> after the code above is run:<br>

![](images/image_event_datafile_new.jpg)

## Apache Cassandra Cluster

### Creating a Cluster, keyspace and a session

In [None]:
# Connect to Cassandra and prepare the keyspace used by the rest of the notebook.
# NOTE(review): the *_cs_* helpers come from src.database, which is not visible
# here; the remarks below describe intent as suggested by their names — confirm
# against that module.
cluster = get_cs_cluster()
session = get_cs_session(cluster)
keyspace_name = "my_keyspace"
drop_cs_keyspace(session, keyspace_name)  # drop any existing keyspace (nice for reruns)
create_cs_keyspace(session, keyspace_name)
set_cs_keyspace(session, keyspace_name)  # set the keyspace to the one we just created
# Collection of tables to clean up: the "Drop the tables" cell at the end
# iterates this set and passes each element to drop_cs_table.
tables = set()

## Queries
Now we need to create tables to run the following queries. Remember, with Apache Cassandra you model the database tables on the queries you want to run.

Create queries to answer the following three questions about the data:

1. Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338 and itemInSession = 4
2. Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userId = 10, sessionId = 182
3. Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'

### Query 1

#### Creating the table

In [None]:
## TO-DO: Query 1:  Give me the artist, song title and song's length in the music app history that was heard during \
## sessionId = 338, and itemInSession = 4

#### Inserting data

In [None]:
# Query 1 looks rows up by sessionId and itemInSession, so the table is keyed
# on exactly those two columns: session_id is the partition key and
# item_in_session the clustering column.
#
# Column positions in event_datafile_new.csv:
#   0 artist, 1 firstName, 2 gender, 3 itemInSession, 4 lastName, 5 length,
#   6 level, 7 location, 8 sessionId, 9 song, 10 userId
file = "event_datafile_new.csv"
table_name = "song_by_session_item"

# Created here with IF NOT EXISTS so this cell can be run (and re-run) on its own.
# NOTE(review): assumes `session` is a cassandra-driver Session already bound
# to the notebook's keyspace — confirm against src.database.
session.execute(
    f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        session_id int,
        item_in_session int,
        artist text,
        song text,
        length double,
        PRIMARY KEY (session_id, item_in_session)
    )
    """
)
# Register the table for the cleanup cell.
# NOTE(review): presumably drop_cs_table expects the table name — confirm.
tables.add(table_name)

# Built once outside the loop; values are bound through %s placeholders
# rather than being formatted into the statement text.
query = (
    f"INSERT INTO {table_name} (session_id, item_in_session, artist, song, length) "
    "VALUES (%s, %s, %s, %s, %s)"
)

with open(file, encoding="utf8", newline="") as f:
    csvreader = csv.reader(f)
    next(csvreader, None)  # skip header
    for line in csvreader:
        if not line:  # blank line in the file
            continue
        # csv yields strings; cast to the column types declared above.
        # NOTE(review): assumes sessionId/itemInSession/length are always
        # populated for the rows kept by Part I — confirm.
        session.execute(
            query,
            (int(line[8]), int(line[3]), line[0], line[9], float(line[5])),
        )

#### Verification of data

### Query 2

#### Creating the table

In [None]:
## TO-DO: Query 2: Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name)\
## for userid = 10, sessionid = 182

#### Inserting data

In [None]:
# Query 2 filters on userId and sessionId and wants the songs ordered by
# itemInSession, so (user_id, session_id) is the composite partition key and
# item_in_session the clustering column (rows come back sorted by it).
#
# Column positions in event_datafile_new.csv:
#   0 artist, 1 firstName, 2 gender, 3 itemInSession, 4 lastName, 5 length,
#   6 level, 7 location, 8 sessionId, 9 song, 10 userId
file = "event_datafile_new.csv"
table_name = "songs_by_user_session"

# Created here with IF NOT EXISTS so this cell can be run (and re-run) on its own.
# NOTE(review): assumes `session` is a cassandra-driver Session already bound
# to the notebook's keyspace — confirm against src.database.
session.execute(
    f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        user_id int,
        session_id int,
        item_in_session int,
        artist text,
        song text,
        first_name text,
        last_name text,
        PRIMARY KEY ((user_id, session_id), item_in_session)
    )
    """
)
# Register the table for the cleanup cell.
# NOTE(review): presumably drop_cs_table expects the table name — confirm.
tables.add(table_name)

# Built once outside the loop; values are bound through %s placeholders
# rather than being formatted into the statement text.
query = (
    f"INSERT INTO {table_name} "
    "(user_id, session_id, item_in_session, artist, song, first_name, last_name) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s)"
)

with open(file, encoding="utf8", newline="") as f:
    csvreader = csv.reader(f)
    next(csvreader, None)  # skip header
    for line in csvreader:
        if not line:  # blank line in the file
            continue
        # csv yields strings; cast the numeric key columns to int.
        # NOTE(review): assumes userId/sessionId/itemInSession are always
        # populated for the rows kept by Part I — confirm.
        session.execute(
            query,
            (
                int(line[10]),
                int(line[8]),
                int(line[3]),
                line[0],
                line[9],
                line[1],
                line[4],
            ),
        )

#### Verification of data

### Query 3

#### Creating the table

In [None]:
## TO-DO: Query 3: Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'

#### Inserting data

In [None]:
# Query 3 looks users up by song title, so song is the partition key.
# user_id is the clustering column: it keeps one row per (song, user), so a
# user who played the song several times is listed once.
#
# Column positions in event_datafile_new.csv:
#   0 artist, 1 firstName, 2 gender, 3 itemInSession, 4 lastName, 5 length,
#   6 level, 7 location, 8 sessionId, 9 song, 10 userId
file = "event_datafile_new.csv"
table_name = "users_by_song"

# Created here with IF NOT EXISTS so this cell can be run (and re-run) on its own.
# NOTE(review): assumes `session` is a cassandra-driver Session already bound
# to the notebook's keyspace — confirm against src.database.
session.execute(
    f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        song text,
        user_id int,
        first_name text,
        last_name text,
        PRIMARY KEY (song, user_id)
    )
    """
)
# Register the table for the cleanup cell.
# NOTE(review): presumably drop_cs_table expects the table name — confirm.
tables.add(table_name)

# Built once outside the loop; values are bound through %s placeholders
# rather than being formatted into the statement text.
query = (
    f"INSERT INTO {table_name} (song, user_id, first_name, last_name) "
    "VALUES (%s, %s, %s, %s)"
)

with open(file, encoding="utf8", newline="") as f:
    csvreader = csv.reader(f)
    next(csvreader, None)  # skip header
    for line in csvreader:
        if not line:  # blank line in the file
            continue
        # csv yields strings; userId is cast to match the int column.
        # NOTE(review): assumes userId is always populated for the rows kept
        # by Part I — confirm.
        session.execute(query, (line[9], int(line[10]), line[1], line[4]))

#### Verification of data

### COPY AND REPEAT THE ABOVE THREE CELLS FOR EACH OF THE THREE QUESTIONS

### Drop the tables before closing out the sessions

In [None]:
# Clean-up: hand every entry collected in `tables` to drop_cs_table, using the
# still-open session.
for table_entry in tables:
    drop_cs_table(session, table_entry)

### Close the session and cluster connection

In [None]:
# Release the Cassandra resources opened at the start of Part II: first the
# session, then the cluster object the session was obtained from.
close_cs_session(session)
shutdown_cs_cluster(cluster)