# Prepare Functions
Functions to create:
- splitting into train, validate, test
- scale train, validate, test
- create clusters with kmeans

In [1]:
import pandas as pd

In [2]:
# widen the per-cell display limit so long nested values are not cut off
pd.options.display.max_colwidth = 10000

In [3]:
# load the sample track data; the path is relative to the notebook's working directory
df = pd.read_csv('sample_data.csv')

In [4]:
# peek at a single row to see which columns hold nested dict/list values
df.head(1)

Unnamed: 0,album,artists,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,is_playable,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/36QJpDe2go2KgaRleHCDTp'}, 'href': 'https://api.spotify.com/v1/artists/36QJpDe2go2KgaRleHCDTp', 'id': '36QJpDe2go2KgaRleHCDTp', 'name': 'Led Zeppelin', 'type': 'artist', 'uri': 'spotify:artist:36QJpDe2go2KgaRleHCDTp'}], 'external_urls': {'spotify': 'https://open.spotify.com/album/44Ig8dzqOkvkGDzaUof9lK'}, 'href': 'https://api.spotify.com/v1/albums/44Ig8dzqOkvkGDzaUof9lK', 'id': '44Ig8dzqOkvkGDzaUof9lK', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b273c8a11e48c91a982d086afc69', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d00001e02c8a11e48c91a982d086afc69', 'width': 300}, {'height': 64, 'url': 'https://i.scdn.co/image/ab67616d00004851c8a11e48c91a982d086afc69', 'width': 64}], 'name': 'Led Zeppelin IV (Deluxe Edition)', 'release_date': '1971-11-08', 'release_date_precision': 'day', 'total_tracks': 16, 'type': 'album', 'uri': 'spotify:album:44Ig8dzqOkvkGDzaUof9lK'}","[{'external_urls': {'spotify': 'https://open.spotify.com/artist/36QJpDe2go2KgaRleHCDTp'}, 'href': 'https://api.spotify.com/v1/artists/36QJpDe2go2KgaRleHCDTp', 'id': '36QJpDe2go2KgaRleHCDTp', 'name': 'Led Zeppelin', 'type': 'artist', 'uri': 'spotify:artist:36QJpDe2go2KgaRleHCDTp'}]",1,482830,False,{'isrc': 'USAT21300959'},{'spotify': 'https://open.spotify.com/track/5CQ30WqJwcep0pYcV4AMNc'},https://api.spotify.com/v1/tracks/5CQ30WqJwcep0pYcV4AMNc,5CQ30WqJwcep0pYcV4AMNc,False,True,Stairway to Heaven - Remaster,78,https://p.scdn.co/mp3-preview/8226164717312bc411f8635580562d67e191a754?cid=940e516de56949eab620dd24470e2104,4,track,spotify:track:5CQ30WqJwcep0pYcV4AMNc


## Split DF
- using sklearn split functions to split df into 70% train, 20% validate, 10% test

In [5]:
# function to split data into train, validate, and test
def split_df(df):

    '''
    Splits dataframe into train, validate, and test - roughly 70%, 20%, 10% respectively.
    Prints out the row/column shape of each split and each split's share of the
    original rows as a whole-number percentage.

    Parameters:
        df: the dataframe to split (rows are observations)

    Returns train, validate, test.
    '''

    # Import to use split function, can only split two at a time
    from sklearn.model_selection import train_test_split

    # First, split off test by itself
    # Test will be 10% of the data, train + validate together are the other 90% for now
    # Set random_state so we can reproduce the same 'random' data
    train_validate, test = train_test_split(df, test_size = .10, random_state = 666)

    # Second, split train + validate into their separate dataframes
    # 22% of the remaining 90% is ~20% of all rows (validate), leaving ~70% for train
    # Set random_state so we can reproduce the same 'random' data
    train, validate = train_test_split(train_validate, test_size = .22, random_state = 666)

    # These two print functions allow us to ensure the data is properly split
    # Will print the shape of each variable when running the function
    print("train shape: ", train.shape, ", validate shape: ", validate.shape, ", test shape: ", test.shape)

    # Will print the shape of each variable as a percentage of the total data set
    # Total observations is the row count of the original df. Counting non-null
    # values of the first column instead would undercount when that column has
    # missing values and would inflate every percentage.
    total = len(df)

    # calculating percentages of the split df to the original df
    # multiply before rounding so the result has no float artifacts (e.g. 56.99999999999999)
    train_percent = round(100 * train.shape[0] / total, 0)
    validate_percent = round(100 * validate.shape[0] / total, 0)
    test_percent = round(100 * test.shape[0] / total, 0)

    print("\ntrain percent: ", train_percent, ", validate percent: ", validate_percent,
            ", test percent: ", test_percent)

    return train, validate, test

In [6]:
# split the sample data; split_df also prints the shapes and percentages shown below
train, validate, test = split_df(df)

train shape:  (7, 17) , validate shape:  (2, 17) , test shape:  (1, 17)

train percent:  70.0 , validate percent:  20.0 , test percent:  10.0


## Scale the Data
- MinMaxScaler
    - a linear scaling method that transforms our features such that the range is between 0 and 1
    
- Standard
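    - standardization is a linear transformation of our data such that it has the same mean and spread as the standard normal distribution (the shape of the distribution itself is unchanged)
    - it will have a mean of 0 and a standard deviation of 1
    - centers (subtracts the mean) and then scales (divides by the standard deviation)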
    
- RobustScaler
    - another linear transformation more robust to outliers
    
### Function to scale:
- takes train, validate, and test and returns the three scaled versions
- the `predict` parameter names the target column to drop from each df before scaling, in this case popularity
- parameter for scaler specifies to use MinMax, Standard, or Robust Scaler

In [7]:
def scale_data(train, validate, test, predict, scaler):

    '''
    Scales a df based on scaler chosen: 'MinMax', 'Standard', or 'Robust'.
    Needs three dfs, train, validate, and test. Fits the scaler object to train
    only, transforms on all 3. Returns the three dfs scaled.

    Parameters:
        train, validate, test: dfs that all contain the same columns
        predict: column label (or list of labels) of the target to drop before scaling
        scaler: 'MinMax', 'Standard', or 'Robust'; an already-built scaler object
                with .fit/.transform is also accepted and used as is

    Returns X_train_scaled, X_validate_scaled, X_test_scaled, each keeping the
    original index and the original column names minus `predict`.

    Raises ValueError if scaler is a string other than the three names above.
    '''

    scaler_names = ('MinMax', 'Standard', 'Robust')

    # reject a misspelled scaler name up front with a clear message instead of
    # failing later with "'str' object has no attribute 'fit'"
    if isinstance(scaler, str) and scaler not in scaler_names:
        raise ValueError(f"scaler must be one of {scaler_names}, got {scaler!r}")

    import sklearn.preprocessing

    # removing predictive feature
    X_train = train.drop(predict, axis=1)
    X_validate = validate.drop(predict, axis=1)
    X_test = test.drop(predict, axis=1)

    if isinstance(scaler, str):

        # map each supported name to its scaler class
        scaler_classes = {
            'MinMax': sklearn.preprocessing.MinMaxScaler,
            'Standard': sklearn.preprocessing.StandardScaler,
            'Robust': sklearn.preprocessing.RobustScaler,
        }

        # create scaler object for the chosen scaler
        scaler_object = scaler_classes[scaler]()

    else:

        # anything that is not a name is treated as a ready-made scaler object
        scaler_object = scaler

    # Note that we only call .fit with the training data,
    # but we use .transform to apply the scaling to all the data splits.
    scaler_object.fit(X_train)

    # transforming all three dfs with the scaler object
    # transform returns an array, so rebuild each df with its original index and column names
    X_train_scaled = pd.DataFrame(scaler_object.transform(X_train),
                                  index=X_train.index, columns=X_train.columns)
    X_validate_scaled = pd.DataFrame(scaler_object.transform(X_validate),
                                     index=X_validate.index, columns=X_validate.columns)
    X_test_scaled = pd.DataFrame(scaler_object.transform(X_test),
                                 index=X_test.index, columns=X_test.columns)

    return X_train_scaled, X_validate_scaled, X_test_scaled

In [8]:
# describe() summarizes only the numeric columns here, so its column labels list the numeric features
train.describe().columns.to_list()

['disc_number', 'duration_ms', 'popularity', 'track_number']

In [9]:
# keep only the numeric columns (the ones describe() reports) in every split
# so the scaler can be tried out on them
columns = list(train.describe().columns)
train, validate, test = (split[columns] for split in (train, validate, test))

In [10]:
# confirm that only the numeric columns remain in train
train

Unnamed: 0,disc_number,duration_ms,popularity,track_number
2,1,146250,76,1
6,1,166266,69,1
4,1,263333,70,7
0,1,482830,78,4
1,1,333893,77,1
8,1,212160,67,7
9,1,351266,64,6


In [11]:
# MinMax-scale the features, dropping the 'popularity' target first; the scaler is fit on train only
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(train, validate, test, 'popularity','MinMax')

In [12]:
# inspect the scaled train features (original index and column names are kept)
X_train_scaled

Unnamed: 0,disc_number,duration_ms,track_number
2,0.0,0.0,0.0
6,0.0,0.059469,0.0
4,0.0,0.347861,1.0
0,0.0,1.0,0.5
1,0.0,0.557499,0.0
8,0.0,0.195823,1.0
9,0.0,0.609115,0.833333


# Cluster Function
- create clusters using kmeans modeling
- make into function that can be used on any features chosen throughout the project pipeline process
- with small amount of data, function works with train
    - not enough observations to create clusters on validate and test
    - commenting out validate and test for now

In [13]:
def create_clusters(X_train_scaled, X_validate_scaled, X_test_scaled, features, n, cluster_name,
                    return_all=False):

    '''
    Create clusters based on features specified. n is amount of groups within the cluster.
    Best when used on scaled dfs. KMeans is fit on train only and then used to
    predict the cluster group of every row in train, validate, and test.

    Parameters:
        X_train_scaled, X_validate_scaled, X_test_scaled: dfs containing the feature columns
        features: list of column names to cluster on
        n: number of cluster groups
        cluster_name: name of the cluster label column; the dummy columns are named
                      '<cluster_name>_1' through '<cluster_name>_n'
        return_all: when False (default) only the train df is returned, as before;
                    when True returns (train, validate, test)

    Side effect: the cluster label column is written onto all three input dfs in
    place (kept because existing callers may read it from validate and test).

    Returns new df(s) with the cluster label column and its n dummy variables appended.
    '''

    from sklearn.cluster import KMeans

    # create object with clusters chosen by n parameter
    kmeans = KMeans(n_clusters=n, random_state = 666)

    # fit to train only and the features chosen
    kmeans.fit(X_train_scaled[features])

    # naming the cluster groups by cluster name plus numbers 1 through n for each group
    dummy_columns = [f'{cluster_name}_{group}' for group in range(1, n + 1)]

    def _add_cluster_columns(split):
        # add a column to the df of the prediction of cluster group (in place)
        split[cluster_name] = kmeans.predict(split[features])

        # encode against the full set of labels 0..n-1 so every split gets all n
        # dummy columns even when some cluster has no rows in that split
        # (a small validate/test would otherwise produce fewer columns than names)
        labels = pd.Series(pd.Categorical(split[cluster_name], categories=list(range(n))),
                           index=split.index)
        dummies = pd.get_dummies(labels)
        dummies.columns = dummy_columns

        # add cluster dummy variables to the scaled df
        return pd.concat([split, dummies], axis=1)

    train_clustered = _add_cluster_columns(X_train_scaled)
    validate_clustered = _add_cluster_columns(X_validate_scaled)
    test_clustered = _add_cluster_columns(X_test_scaled)

    if return_all:
        return train_clustered, validate_clustered, test_clustered

    return train_clustered

In [14]:
# try the cluster function on the sample df: 4 groups over two scaled features
n = 4
cluster_name = 'disk_number_duration_cluster'
features = ['disc_number', 'duration_ms']

create_clusters(X_train_scaled, X_validate_scaled, X_test_scaled,
                features=features, n=n, cluster_name=cluster_name)

Unnamed: 0,disc_number,duration_ms,track_number,disk_number_duration_cluster,disk_number_duration_cluster_1,disk_number_duration_cluster_2,disk_number_duration_cluster_3,disk_number_duration_cluster_4
2,0.0,0.0,0.0,3,0,0,0,1
6,0.0,0.059469,0.0,3,0,0,0,1
4,0.0,0.347861,1.0,0,1,0,0,0
0,0.0,1.0,0.5,2,0,0,1,0
1,0.0,0.557499,0.0,1,0,1,0,0
8,0.0,0.195823,1.0,0,1,0,0,0
9,0.0,0.609115,0.833333,1,0,1,0,0
