In [5]:
import pandas as pd
import spacy

In [6]:
# Read the raw Spotify track features from the CSV file (path is relative to the working directory).
raw_df = pd.read_csv(filepath_or_buffer='SpotifyFeatures.csv')

In [7]:
def wrangle(df):
    """
    Clean the raw track table and return the result as a new DataFrame.

    Steps, in order:
      1. keep only the first row for each 'track_id'
      2. drop the 'track_id' identifier column
      3. strip the literal '#' character from the 'key' column
      4. renumber the rows 0..n-1 (gaps are left behind by step 1)

    Parameters
    ----------
    df: pandas.DataFrame
        Raw frame; must contain the columns 'track_id' and 'key'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with a fresh 0-based index and no 'track_id' column.
    """
    return (
        df
        # drop duplicate entries, then the identifier column itself
        .drop_duplicates(subset='track_id')
        .drop(columns='track_id')
        # '#' is a plain character here, not a pattern, so switch regex matching off
        .assign(key=lambda frame: frame['key'].str.replace('#', '', regex=False))
        # drop=True discards the old index outright; going through a temporary
        # 'index' column would clobber a real data column of that name
        .reset_index(drop=True)
    )

In [8]:
# Clean the raw frame with wrangle(); the result is bound to `df`, which later cells extend.
df = wrangle(df=raw_df)

In [None]:
!python3 -m spacy download en_core_web_md

In [11]:
# Load the spaCy pipeline named "en_core_web_md"; tokenizer() and vectorize() below both call this `nlp` object.
nlp = spacy.load("en_core_web_md")

In [14]:
def tokenizer(text):
    """
    Use the pre-trained spaCy model to reduce our text to a string of lemmas

    Notes
    -----
    Remember that the pre-trained spaCy model has a lot of built-in flags for what kind of token each token is,
    so we can use that functionality to create filters for stop words, white spaces, punctuation, and so on!

    See list of flags here: https://spacy.io/api/token#attributes

    Parameters
    ----------
    text: string
        Full text article/document that needs to be tokenized

    Returns
    -------
    string
        Lemmas of the tokens that were kept, joined by single spaces
        (an empty string when no token is kept)
    """
    lemmas = [
        # Now lemmatize!
        token.lemma_
        for token in nlp(text)
        # filter out stop words, punctuation, whitespace, and digit-only tokens
        if not (token.is_stop or token.is_punct or token.is_space or token.is_digit)
    ]

    return ' '.join(lemmas)

In [15]:
# Add a column holding tokenizer()'s output for every track name.
df['lemmas_track_name'] = df['track_name'].map(tokenizer)

In [16]:
def vectorize(text):
    """
    Run text through the spaCy pipeline and return the document's vector

    Parameters
    ----------
    text: string
        Text handed to the module-level `nlp` pipeline

    Returns
    -------
    The `vector` attribute of the document spaCy builds from `text`
    """
    doc = nlp(text)
    return doc.vector

In [17]:
# Add a column holding vectorize()'s output for every track name.
# NOTE(review): vectorize() is documented as taking tokenized text, but the raw
# 'track_name' is passed here rather than 'lemmas_track_name' — confirm which is intended.
df['vec_track_name'] = df['track_name'].map(vectorize)

In [None]:
# Write the frame to an uncompressed CSV. The row index is left out, matching the
# compressed export in the following cell; otherwise it is written as an extra
# leading column that shows up as unnamed data when the file is read back.
df.to_csv('tokenized-vectorized-df', index=False)

In [None]:
# Write the same frame as a zip-compressed CSV, again without the row index.
df.to_csv(
    path_or_buf="tokenized-vectorized-compr-df.csv.zip",
    compression="zip",
    index=False,
)

In [None]:
# Build the modelling frame: everything in `df` except the artist and track name text columns.
df1 = df.drop(['artist_name', 'track_name'], axis='columns')

In [None]:
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Fit a category_encoders OrdinalEncoder on df1 and transform it in one step.
# NOTE(review): df1 is derived from `df`, which at this point also carries the
# 'lemmas_track_name' strings and the per-row arrays in 'vec_track_name' — confirm
# the encoder is meant to see (and can handle) those columns.
enc = OrdinalEncoder()
df_encoded = enc.fit_transform(df1)
# Preview the first rows of the encoded frame.
df_encoded.head()

In [None]:
# Rescale every encoded column with MinMaxScaler and wrap the resulting array in a labelled frame.
scaler = MinMaxScaler()
# Labels and index come from df_encoded — the frame whose values are scaled — rather
# than from df1, so they stay aligned with the data even if the encoder's output
# columns ever differ from its input.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_encoded.values),
    columns=df_encoded.columns,
    index=df_encoded.index,
)
# Preview the first rows of the scaled frame.
df_scaled.head()