## Our Voyager Data Clean Up

In [2]:
# import libraries

import pandas as pd

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

## Load CSV Sheets

In [4]:
# address of the published Google spreadsheet, requested as CSV output
our_messy_metadata_csv = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vTgFKikka0dJL3HoDat6kCnSKfVHjOTectJqqNiKhCrByJ9ciVYhEwDt8WpyjrHgcd62IEUi20-L-eN'
    '/pub?output=csv'
)

# read the CSV straight from that URL into a Pandas dataframe
our_messy_metadata = pd.read_csv(filepath_or_buffer=our_messy_metadata_csv)

### Check Columns

In [5]:
# show the column labels of the loaded dataframe
our_messy_metadata.columns

Index(['Timestamp', 'Song Title', 'Your Name', 'Your Graduating Class',
       'Team/Group Name', 'Personal Rank', 'Genre Tags',
       'Most Likely Context(s)', 'Least Likely Contexts'],
      dtype='object')

In [6]:
# preview the first rows of the dataframe before any cleaning
our_messy_metadata.head()

Unnamed: 0,Timestamp,Song Title,Your Name,Your Graduating Class,Team/Group Name,Personal Rank,Genre Tags,Most Likely Context(s),Least Likely Contexts


In [13]:
# let's drop the time stamp, which is just a product of creating the form.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# running it again is a no-op instead of raising a KeyError.
our_messy_metadata = our_messy_metadata.drop(columns=['Timestamp'], errors='ignore')

## 2. Clean the Easy Parts

- Your Name
- Team/Group
- Rank
- Class Year


In [8]:
def name_lower(name):
    """Lower-case a name and replace its underscores.

    An underscore directly before a comma is removed
    ("smith_, john" -> "smith, john"); every other underscore becomes a
    single space ("john_smith" -> "john smith").
    """
    lowered = name.lower()
    # handle "_," first, otherwise the next step would turn it into " ,"
    without_dangling_underscore = lowered.replace("_,", ",")
    return without_dangling_underscore.replace("_", " ")

# reverse names so we have surname first
def reverse_name_without_comma(name):
    """Rewrite "given-names surname" as "surname, given-names".

    The last whitespace-separated word is taken to be the surname and
    everything before it is kept, in order, as the given names. Only one
    comma is inserted, so "john paul smith" becomes "smith, john paul"
    rather than "smith, paul, john".

    A single word (or an empty string) has nothing to reverse and is
    returned with surrounding whitespace removed.
    """
    words = name.split()
    if len(words) < 2:
        return ' '.join(words)
    *given_names, surname = words
    return surname + ', ' + ' '.join(given_names)


# clean up team names
def team_clean(team):
    """Map a free-text team entry onto one of the three canonical team names.

    Matching is done in two passes:

    1. Case-insensitive keywords ("green"; "orange"/"crush";
       "purple"/"phlurp"), so that "ORANGE CRUSH" or "Orange Crush" are
       recognised whatever their capitalisation.
    2. The original single-letter hints (capital "G", "cr", "p"/"P"), kept
       as a fallback so abbreviations that used to be recognised still are.

    Entries that match nothing, and non-string values (e.g. NaN), are
    returned unchanged.
    """
    if not isinstance(team, str):
        # nothing to match against; hand the value back untouched
        return team

    lowered = team.lower()
    if "green" in lowered:
        return "green machine"
    if "orange" in lowered or "crush" in lowered:
        return "orange crush"
    if "purple" in lowered or "phlurp" in lowered:
        return "purple phlurp"

    # legacy hints, only reached when no full keyword was found
    if "G" in team:
        return "green machine"
    if "cr" in team:
        return "orange crush"
    if "p" in team or "P" in team:
        return "purple phlurp"
    return team

# clean up rankings
def rank_clean(rank, max_rank=6):
    """Extract a ranking between 1 and ``max_rank`` from a messy entry.

    The first run of digits in the entry is read as a whole number, so
    "3rd" and "#3" give 3, while "10" is read as ten (out of range) rather
    than as a "1". Anything without a usable number gives 'unranked'.

    Numeric values are accepted as well as strings, which keeps the
    function safe to apply to a column that already holds cleaned ranks.

    Parameters
    ----------
    rank : the raw entry (string, number, or missing value)
    max_rank : largest valid rank; defaults to 6

    Returns
    -------
    int in 1..max_rank, or the string 'unranked'
    """
    digits = ""
    for char in str(rank):
        if char in "0123456789":
            digits += char
        elif digits:
            # the first number has ended; ignore whatever follows it
            break

    if digits:
        value = int(digits)
        if 1 <= value <= max_rank:
            return value
    return 'unranked'

def year_clean(year):
    """Normalise a graduating-class entry to one of 2025-2028.

    The entry is scanned for the digits "5", "6", "7" and "8", in that
    order, and the first one found decides the year (2025, 2026, 2027 or
    2028). Entries containing none of those digits are returned unchanged.

    The check is made on the text form of the value, so numeric entries
    (e.g. 2026 or 2026.0) are handled instead of raising a TypeError; this
    also makes it safe to apply the function to an already-cleaned column.
    """
    text = str(year)
    digit_to_year = (("5", 2025), ("6", 2026), ("7", 2027), ("8", 2028))
    for digit, graduating_year in digit_to_year:
        if digit in text:
            return graduating_year
    return year



### Apply the Cleaning Functions

In [9]:

# names: lower-case them first, then put the surname first for every name
# that does not already contain a comma
lowered_names = our_messy_metadata['Your Name'].apply(name_lower)
our_messy_metadata['Your Name'] = lowered_names.apply(
    lambda x: x if ',' in x else reverse_name_without_comma(x)
)

# team, rank and class year each need exactly one cleaning function
for column_name, cleaner in [
    ('Team/Group Name', team_clean),
    ('Personal Rank', rank_clean),
    ('Your Graduating Class', year_clean),
]:
    our_messy_metadata[column_name] = our_messy_metadata[column_name].apply(cleaner)

## 3. Clean Genre and Context Columns


In [14]:
# depending on how the terms are separated, we can replace them with commas

def regular_separator(terms):
    """Regularise the separators in a string of terms.

    Every question mark and every semicolon is replaced by a comma
    followed by a space; the rest of the string is left as it is.
    """
    for separator in ("?", ";"):
        terms = terms.replace(separator, ", ")
    return terms

In [15]:
# genres, most likely contexts and least likely contexts all get the same
# treatment: lower-case every entry, then regularize its separators
for tag_column in ['Genre Tags', 'Most Likely Context(s)', 'Least Likely Contexts']:
    lowered_terms = our_messy_metadata[tag_column].str.lower()
    our_messy_metadata[tag_column] = lowered_terms.apply(regular_separator)



In [16]:
# now the genre tags are long strings, separated by commas. We can split these into lists, which will make them easier to work with later!

# split genre tags, most likely contexts and least likely contexts to lists, on ","
def _split_terms(value):
    """Turn a comma-separated string into a list of trimmed, non-empty terms.

    Whitespace around each term is removed so that "rock, pop" gives
    ['rock', 'pop'] rather than ['rock', ' pop'], and empty terms (from a
    doubled or trailing comma) are dropped. A value that is already a list
    is tidied the same way instead of being lost, which keeps this cell
    safe to re-run. Any other value (e.g. NaN) is returned unchanged.
    """
    if isinstance(value, str):
        pieces = value.split(',')
    elif isinstance(value, list):
        pieces = value
    else:
        return value
    return [piece.strip() for piece in pieces if piece.strip()]


for list_column in ['Genre Tags', 'Most Likely Context(s)', 'Least Likely Contexts']:
    our_messy_metadata[list_column] = our_messy_metadata[list_column].apply(_split_terms)


### Possible Genre Mapping to High Level Categories

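Previously we used some basic techniques to reformat our data: setting names to lower case, or splitting up long strings of terms into lists. Another useful step is to map individual terms to higher-level categories. Here is an example of how to do that using a dictionary and the `map` function in Pandas: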
```python
# Define dictionary mapping terms to labels
term_mapping = {
    'apple': 'fruit',
    'banana': 'fruit',
    'cherry': 'fruit',
    'date': 'fruit',
    'elderberry': 'fruit'
}

# Add new column based on term mapping
df['label'] = df['key'].map(term_mapping).fillna('other')
```

## Tidy Data Steps would involve exploding the lists into multiple rows, but we will not do that here.