# Part I. ETL Pipeline for Pre-Processing the Files

## Import Python packages 

In [1]:
import pandas as pd
import os
import glob
import csv
import time
from src.database import (
    get_cs_cluster,
    get_cs_session,
    close_cs_session,
    shutdown_cs_cluster,
    insert_cs_rows,
    create_cs_keyspace,
    create_cs_table,
    drop_cs_table,
    drop_cs_keyspace,
    set_cs_keyspace,
)

## Creating list of filepaths to process original event csv data files

In [2]:
# Directory that holds the original event csv data files, relative to the
# current working directory.
data_dir = os.getcwd() + "/event_data"
# Keep only regular files (a sub-directory matched by "*" would make the later
# open() fail) and sort them: glob returns entries in arbitrary order, so
# without sorting the row order of the combined file could differ between runs.
data_files = sorted(
    path for path in glob.glob(os.path.join(data_dir, "*")) if os.path.isfile(path)
)
print(f"Number of data files: {len(data_files)}")

Number of data files: 30


## Processing the files

In [3]:
# Read every file in data_files and collect all data rows (headers excluded)
# into a single list of lists of strings.
full_data_rows_list = []
for data_file in data_files:
    with open(data_file, "r", encoding="utf8", newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header; the None default keeps a completely empty file from
        # raising StopIteration and aborting the whole cell.
        next(csvreader, None)
        full_data_rows_list.extend(csvreader)

print(
    f"Full data: \t#rows = {len(full_data_rows_list)}, "
    f"\tsize = {full_data_rows_list.__sizeof__() / 1000:.2f} kB (size of list in cache)"
)

# creating a smaller event data csv file
csv.register_dialect("myDialect", quoting=csv.QUOTE_ALL, skipinitialspace=True)
datafile_path = "event_datafile_new.csv"
datafile_header = [
    "artistName",
    "userFirstName",
    "userGender",
    "itemInSession",
    "userLastName",
    "songLength",
    "level",
    "location",
    "sessionId",
    "songTitle",
    "userId",
]
# Column name -> position of that column in each row of the new file.
datafile_header_map = {name: idx for idx, name in enumerate(datafile_header)}
# Positions in the raw event rows of the columns listed in datafile_header,
# in the same order as the header.
source_column_indices = (0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16)

# Count records while writing them. Counting physical lines of the finished
# file would over-count whenever a quoted field contains a line break.
num_filtered_rows = 0
with open(datafile_path, "w", encoding="utf8", newline="") as f:
    writer = csv.writer(f, dialect="myDialect")
    writer.writerow(datafile_header)
    for row in full_data_rows_list:
        # Skip blank CSV lines (parsed as []) and rows whose first column
        # (written out as artistName) is empty.
        if not row or row[0] == "":
            continue
        writer.writerow([row[idx] for idx in source_column_indices])
        num_filtered_rows += 1

print(
    f"Filtered data: \t#rows = {num_filtered_rows}, "
    f"\tsize = {os.path.getsize(datafile_path) / 1000:.2f} kB (size of file on disk)\n"
)

Full data: 	#rows = 8056, 	size = 67.21 kB (size of list in cache)
Filtered data: 	#rows = 6820, 	size = 854.14 kB (size of file on disk)



# Part II. Complete the Apache Cassandra coding portion of your project.

Now you are ready to work with the CSV file titled <font color=red>event_datafile_new.csv</font>, located within the Workspace directory.  The event_datafile_new.csv contains the following columns: 
- artist 
- firstName of user
- gender of user
- item number in session
- last name of user
- length of the song
- level (paid or free song)
- location of the user
- sessionId
- song title
- userId

The image below is a screenshot of what the denormalized data should look like in <font color=red>**event_datafile_new.csv**</font> after the code above is run:<br>

![](images/image_event_datafile_new.jpg)

## Apache Cassandra Cluster

### Creating a Cluster, keyspace and a session

In [4]:
# Names shared with the cells further down.
keyspace_name = "my_keyspace"
tables = set()  # names of the tables created in this notebook

# Connect to Cassandra.
cluster = get_cs_cluster()
session = get_cs_session(cluster)

# Start from a fresh keyspace so the notebook can be re-run, then make it the
# keyspace used by this session.
drop_cs_keyspace(session, keyspace_name)
create_cs_keyspace(session, keyspace_name)
set_cs_keyspace(session, keyspace_name)

## Queries
Now we need to create tables to run the following queries. Remember, with Apache Cassandra you model the database tables on the queries you want to run.

### Query 1

**Assignment**: Give the artist, song title and song's length in the music app history that was heard during sessionId = 338 and itemInSession = 4

**Query**:
```sql
SELECT artistName, songTitle, songLength 
FROM music_history_by_session
WHERE sessionId = 338 AND itemInSession = 4;
```

**Why this table fits the query**
- Partition on `sessionId` keeps all rows for a session together.
- Clustering on `itemInSession` preserves playback order.
- Adding `userId` as a second clustering column keeps rows from different users distinct if `sessionId` is not globally unique. Note: the dataset description does not say whether `sessionId` is unique across users, so I assumed it is not; without `userId` in the primary key, two users sharing the same `sessionId` and `itemInSession` would overwrite each other's rows.


#### Creating the table

In [5]:
music_hist_session = "music_history_by_session"
tables.add(music_hist_session)
drop_cs_table(session, music_hist_session)  # drop the table if it exists

# Column name -> CQL type, listed in the order the columns are declared.
# NOTE(review): CQL `float` is a 32-bit type, so song lengths are stored with
# reduced precision -- consider `double` if exact values matter.
music_hist_session_columns = {
    "sessionId": "int",
    "itemInSession": "int",
    "userId": "int",
    "artistName": "text",
    "songTitle": "text",
    "songLength": "float",
}
# The parallel name/type lists are derived from the mapping; the insert cell
# uses the names to build each row in the matching order.
music_hist_session_cnames = list(music_hist_session_columns)
music_hist_session_column_types = list(music_hist_session_columns.values())

# First entry is the partition key, the remaining entries are clustering columns.
primary_keys = ["sessionId", "itemInSession", "userId"]
create_cs_table(session, music_hist_session, music_hist_session_columns, primary_keys)

CREATE TABLE IF NOT EXISTS music_history_by_session (sessionId int, itemInSession int, userId int, artistName text, songTitle text, songLength float, PRIMARY KEY (sessionId, itemInSession, userId))


#### Inserting data

In [6]:
time.sleep(2)  # wait for table to be created in (local) Cassandra
# newline="" lets the csv module handle line endings itself, so a line break
# inside a quoted field stays part of that field instead of ending the record.
with open(datafile_path, encoding="utf8", newline="") as f:
    csvreader = csv.reader(f)
    next(csvreader)  # skip header
    # Count from 1 so the progress display ends at N/N rather than N-1/N.
    for i, line in enumerate(csvreader, start=1):
        # Values are listed in the same order as music_hist_session_cnames.
        row = (
            int(line[datafile_header_map["sessionId"]]),
            int(line[datafile_header_map["itemInSession"]]),
            int(line[datafile_header_map["userId"]]),
            line[datafile_header_map["artistName"]],
            line[datafile_header_map["songTitle"]],
            float(line[datafile_header_map["songLength"]]),
        )
        progress_msg = f"Progress: Inserting row {i}/{num_filtered_rows} -> {row}"
        print(progress_msg, end="\r")
        insert_cs_rows(session, music_hist_session, music_hist_session_cnames, [row])
        print(" " * len(progress_msg), end="\r")  # clear line

                                                                                                                                                                                                               

#### Verification of data

In [7]:
query = """
SELECT artistName, songTitle, songLength 
FROM music_history_by_session 
WHERE sessionId = 338 AND itemInSession = 4;
"""
try:
    rows = session.execute(query)
except Exception as e:
    print(e)
    # Stop here: continuing would either hit a NameError on `rows` or silently
    # display a result set left over from a previously executed cell.
    raise

# The result rows expose the selected columns under lower-case attribute names.
data = [(row.artistname, row.songtitle, row.songlength) for row in rows]
df = pd.DataFrame(data, columns=["Artist", "Song", "Length (s)"])
print(df.to_string(index=False))

   Artist                            Song  Length (s)
Faithless Music Matters (Mark Knight Dub)  495.307312


### Query 2

**Assignment**: Give only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182

**Query**:
```sql
SELECT artistName, songTitle, userFirstName, userLastName 
FROM music_history_by_user
WHERE userId = 10 AND sessionId = 182 
ORDER BY itemInSession;
```

**Why this table fits the query**
- Composite partition key `(userId, sessionId)` keeps each user-session together and prevents wide partitions across many sessions for one user.
- Clustering on `itemInSession` preserves order within the session and keeps rows unique.


#### Creating the table

In [8]:
music_hist_user = "music_history_by_user"
tables.add(music_hist_user)
drop_cs_table(session, music_hist_user)  # drop the table if it exists

# Column name -> CQL type, listed in the order the columns are declared.
music_hist_user_columns = {
    "userId": "int",
    "sessionId": "int",
    "itemInSession": "int",
    "artistName": "text",
    "songTitle": "text",
    "userFirstName": "text",
    "userLastName": "text",
}
# The parallel name/type lists are derived from the mapping; the insert cell
# uses the names to build each row in the matching order.
music_hist_user_cnames = list(music_hist_user_columns)
music_hist_user_column_types = list(music_hist_user_columns.values())

# The parenthesised first entry is the composite partition key;
# itemInSession is the clustering column.
primary_keys = ["(userId, sessionId)", "itemInSession"]
create_cs_table(session, music_hist_user, music_hist_user_columns, primary_keys)

CREATE TABLE IF NOT EXISTS music_history_by_user (userId int, sessionId int, itemInSession int, artistName text, songTitle text, userFirstName text, userLastName text, PRIMARY KEY ((userId, sessionId), itemInSession))


#### Inserting data

In [9]:
time.sleep(2)  # wait for table to be created in (local) Cassandra
# newline="" lets the csv module handle line endings itself, so a line break
# inside a quoted field stays part of that field instead of ending the record.
with open(datafile_path, encoding="utf8", newline="") as f:
    csvreader = csv.reader(f)
    next(csvreader)  # skip header
    # Count from 1 so the progress display ends at N/N rather than N-1/N.
    for i, line in enumerate(csvreader, start=1):
        # Values are listed in the same order as music_hist_user_cnames.
        row = (
            int(line[datafile_header_map["userId"]]),
            int(line[datafile_header_map["sessionId"]]),
            int(line[datafile_header_map["itemInSession"]]),
            line[datafile_header_map["artistName"]],
            line[datafile_header_map["songTitle"]],
            line[datafile_header_map["userFirstName"]],
            line[datafile_header_map["userLastName"]],
        )
        progress_msg = f"Progress: Inserting row {i}/{num_filtered_rows} -> {row}"
        print(progress_msg, end="\r")
        insert_cs_rows(session, music_hist_user, music_hist_user_cnames, [row])
        print(" " * len(progress_msg), end="\r")  # clear line

                                                                                                                                                                                                                        

#### Verification of data

In [10]:
query = """
SELECT artistName, songTitle, userFirstName, userLastName 
FROM music_history_by_user
WHERE userId = 10 AND sessionId = 182 
ORDER BY itemInSession;
"""
try:
    rows = session.execute(query)
except Exception as e:
    print(e)
    # Stop here: continuing would either hit a NameError on `rows` or silently
    # display a result set left over from a previously executed cell.
    raise

# The result rows expose the selected columns under lower-case attribute names.
data = [
    (row.artistname, row.songtitle, row.userfirstname, row.userlastname) for row in rows
]
df = pd.DataFrame(data, columns=["Artist", "Song", "First Name", "Last Name"])
print(df.to_string(index=False))

           Artist                                                 Song First Name Last Name
 Down To The Bone                                   Keep On Keepin' On     Sylvie      Cruz
     Three Drives                                          Greece 2000     Sylvie      Cruz
Sebastien Tellier                                            Kilometer     Sylvie      Cruz
    Lonnie Gordon Catch You Baby (Steve Pitron & Max Sanna Radio Edit)     Sylvie      Cruz


### Query 3

**Assignment**: Give every user name (first and last) in the music app history who listened to the song 'All Hands Against His Own'

**Query**:
```sql
SELECT userFirstName, userLastName 
FROM music_history_by_song
WHERE songTitle = 'All Hands Against His Own';
```

**Why this table fits the query**
- Partitioning on `songTitle` aligns with the `WHERE songTitle = ...` filter. However, since many users can listen to the same song, a primary key consisting of `songTitle` alone would make each new listen overwrite the previous row for that song.
- Clustering on `userId`, `sessionId`, and `itemInSession` keeps multiple listens unique across users and sessions while allowing repeated plays of the same song.
- A downside is that the partition for a very popular song could become very large, but the table meets the query requirements.

#### Creating the table

In [11]:
music_hist_song = "music_history_by_song"
tables.add(music_hist_song)
drop_cs_table(session, music_hist_song)  # drop the table if it exists

# Column name -> CQL type, listed in the order the columns are declared.
music_hist_song_columns = {
    "songTitle": "text",
    "userId": "int",
    "sessionId": "int",
    "itemInSession": "int",
    "userFirstName": "text",
    "userLastName": "text",
}
# The parallel name/type lists are derived from the mapping; the insert cell
# uses the names to build each row in the matching order.
music_hist_song_cnames = list(music_hist_song_columns)
music_hist_song_column_types = list(music_hist_song_columns.values())

# First entry is the partition key, the remaining entries are clustering columns.
primary_keys = ["songTitle", "userId", "sessionId", "itemInSession"]
create_cs_table(session, music_hist_song, music_hist_song_columns, primary_keys)

CREATE TABLE IF NOT EXISTS music_history_by_song (songTitle text, userId int, sessionId int, itemInSession int, userFirstName text, userLastName text, PRIMARY KEY (songTitle, userId, sessionId, itemInSession))


#### Inserting data

In [12]:
time.sleep(2)  # wait for tables to be created in (local) Cassandra
# newline="" lets the csv module handle line endings itself, so a line break
# inside a quoted field stays part of that field instead of ending the record.
with open(datafile_path, encoding="utf8", newline="") as f:
    csvreader = csv.reader(f)
    next(csvreader)  # skip header
    # Count from 1 so the progress display ends at N/N rather than N-1/N.
    for i, line in enumerate(csvreader, start=1):
        # Values are listed in the same order as music_hist_song_cnames.
        row = (
            line[datafile_header_map["songTitle"]],
            int(line[datafile_header_map["userId"]]),
            int(line[datafile_header_map["sessionId"]]),
            int(line[datafile_header_map["itemInSession"]]),
            line[datafile_header_map["userFirstName"]],
            line[datafile_header_map["userLastName"]],
        )
        progress_msg = f"Progress: Inserting row {i}/{num_filtered_rows} -> {row}"
        print(progress_msg, end="\r")
        insert_cs_rows(session, music_hist_song, music_hist_song_cnames, [row])
        print(" " * len(progress_msg), end="\r")  # clear line

                                                                                                                                                                    

#### Verification of data

In [13]:
query = """
SELECT userFirstName, userLastName 
FROM music_history_by_song
WHERE songTitle = 'All Hands Against His Own';
"""
try:
    rows = session.execute(query)
except Exception as e:
    print(e)
    # Stop here: continuing would either hit a NameError on `rows` or silently
    # display a result set left over from a previously executed cell.
    raise

# The result rows expose the selected columns under lower-case attribute names.
data = [(row.userfirstname, row.userlastname) for row in rows]
df = pd.DataFrame(data, columns=["First Name", "Last Name"])
print(df.to_string(index=False))

First Name Last Name
Jacqueline     Lynch
     Tegan    Levine
      Sara   Johnson


### Drop the tables before closing out the sessions

In [14]:
# Drop every table whose name was registered in `tables` by the
# table-creation cells above.
for table in tables:
    drop_cs_table(session, table)

### Close the session and cluster connection

In [15]:
# Shut the cluster down even if closing the session raises, so a failure in
# the first step cannot leave the cluster connection open.
try:
    close_cs_session(session)
finally:
    shutdown_cs_cluster(cluster)