## Learn about the data in a csv file before importing into MySQL

We'll review the data in our csv file so we know which tables to create in our MySQL database.


In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
spotify_df = pd.read_csv('spotify-data/genresv2.csv', low_memory=False)

spotify_df.loc[100]

Unnamed: 0                             100
danceability                         0.832
energy                               0.802
key                                     11
loudness                            -4.039
mode                                     0
speechiness                           0.22
acousticness                         0.343
instrumentalness                   0.00361
liveness                             0.139
valence                              0.178
tempo                              220.012
type                        audio_features
id                  3lTAwJ7GAsm6mFkRejOXIm
duration_ms                         166798
time_signature                           4
genre                            Dark Trap
song_name               Plague Doctor Mask
Unnamed: 0.1                           NaN
title                                  NaN
Name: 100, dtype: object

Now that we've loaded our data, we'll see what types of data we have in our dataset

In [39]:
spotify_df.describe()

spotify_df.dtypes




Unnamed: 0            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
id                   object
duration_ms           int64
time_signature        int64
genre                object
song_name            object
Unnamed: 0.1        float64
title                object
dtype: object

In [40]:
spotify_df.dtypes

Unnamed: 0            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
id                   object
duration_ms           int64
time_signature        int64
genre                object
song_name            object
Unnamed: 0.1        float64
title                object
dtype: object

In [41]:
spotify_df.mode

<bound method DataFrame.mode of        Unnamed: 0  danceability  energy  key  loudness  mode  speechiness  \
0               0         0.831   0.814    2    -7.364     1       0.4200   
1               1         0.719   0.493    8    -7.230     1       0.0794   
2               2         0.850   0.893    5    -4.783     1       0.0623   
3               3         0.476   0.781    0    -4.710     1       0.1030   
4               4         0.798   0.624    2    -7.668     1       0.2930   
...           ...           ...     ...  ...       ...   ...          ...   
42300       42300         0.528   0.693    4    -5.148     1       0.0304   
42301       42301         0.517   0.768    0    -7.922     0       0.0479   
42302       42302         0.361   0.821    8    -3.102     1       0.0505   
42303       42303         0.477   0.921    6    -4.777     0       0.0392   
42304       42304         0.529   0.945    9    -5.862     1       0.0615   

       acousticness  instrumentalness  live

In [42]:
spotify_df.describe() 



Unnamed: 0.2,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Unnamed: 0.1
count,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,42305.0,20780.0
mean,21152.0,0.639364,0.762516,5.37024,-6.465442,0.549462,0.136561,0.09616,0.283048,0.214079,0.357101,147.474056,250865.846685,3.97258,10483.970645
std,12212.545906,0.156617,0.183823,3.666145,2.941165,0.497553,0.126168,0.170827,0.370791,0.175576,0.2332,23.844623,102957.713571,0.268342,6052.359519
min,0.0,0.0651,0.000243,0.0,-33.357,0.0,0.0227,1e-06,0.0,0.0107,0.0187,57.967,25600.0,1.0,0.0
25%,10576.0,0.524,0.632,1.0,-8.161,0.0,0.0491,0.00173,0.0,0.0996,0.161,129.931,179840.0,4.0,5255.75
50%,21152.0,0.646,0.803,6.0,-6.234,1.0,0.0755,0.0164,0.00594,0.135,0.322,144.973,224760.0,4.0,10479.5
75%,31728.0,0.766,0.923,9.0,-4.513,1.0,0.193,0.107,0.722,0.294,0.522,161.464,301133.0,4.0,15709.25
max,42304.0,0.988,1.0,11.0,3.148,1.0,0.946,0.988,0.989,0.988,0.988,220.29,913052.0,5.0,20999.0


Since we're not sure what will happen if we try to import "Object" types into MySQL, we'll create tables that store those values as strings. 

CREATE TABLE Persons (
    PersonID int,
    LastName varchar(255),
    FirstName varchar(255),
    Address varchar(255),
    City varchar(255)
);

## Getting data ready to create our "genres" table

We're first going to create a dictionary for this and check to make sure when we create this content we create the right data type

In [43]:
sql_table_vars= dict(spotify_df.dtypes)



final_keys = list(sql_table_vars.keys())

draft_values = []


for value in sql_table_vars.values():
    draft_values.append(str(value))

## change the value of objet to str since sql does not have this data type (or i don't know how to insert into it)

final_values = []
for item in draft_values:
    if item == 'object':
        final_values.append('char')
    elif item == 'float64' or item == 'int64':
        final_values.append('float')
    else:
        final_values.append(item)


final_dict= dict(zip(final_keys, final_values))

for k in final_dict:
    
    print (k, final_dict[k])
    

Unnamed: 0 float
danceability float
energy float
key float
loudness float
mode float
speechiness float
acousticness float
instrumentalness float
liveness float
valence float
tempo float
type char
id char
duration_ms float
time_signature float
genre char
song_name char
Unnamed: 0.1 float
title char


In [44]:
print ("CREATE TABLE genres(")
for item in final_dict:
    print (item, final_dict[item]+",")
print (");")


## figure out the issues iwth key, mode, type (rename them)

CREATE TABLE genres(
Unnamed: 0 float,
danceability float,
energy float,
key float,
loudness float,
mode float,
speechiness float,
acousticness float,
instrumentalness float,
liveness float,
valence float,
tempo float,
type char,
id char,
duration_ms float,
time_signature float,
genre char,
song_name char,
Unnamed: 0.1 float,
title char,
);


## Now we need to rename the variables that have reserved SQL words as the field names so that we can create the SQL table

key  -> 2, song_key 
mode -> 4, # what is in this data?
type -> 11 #what is this data? 

We'll need the index numbers to relabel the keys 

In [45]:
index_num = -1

for i in final_keys:
    index_num=index_num+1
    print (i, index_num)


Unnamed: 0 0
danceability 1
energy 2
key 3
loudness 4
mode 5
speechiness 6
acousticness 7
instrumentalness 8
liveness 9
valence 10
tempo 11
type 12
id 13
duration_ms 14
time_signature 15
genre 16
song_name 17
Unnamed: 0.1 18
title 19


### Now that we know the indexes for the variables with names that need to be changed, we'll find out what they are so we create names that make sense

In [46]:
result = spotify_df['key'].describe()
result

count    42305.000000
mean         5.370240
std          3.666145
min          0.000000
25%          1.000000
50%          6.000000
75%          9.000000
max         11.000000
Name: key, dtype: float64

It's unclear what these values mean- we'll label this as "to remove" for now and, once our csv file is imported into our table, drop the column

In [47]:
result = spotify_df['mode'].describe()
result

count    42305.000000
mean         0.549462
std          0.497553
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: mode, dtype: float64

It's unclear what these values mean- we'll label this as "to remove" for now and, once our csv file is imported into our table, drop the column

In [48]:
result = spotify_df['duration_ms'].describe()
result


count     42305.000000
mean     250865.846685
std      102957.713571
min       25600.000000
25%      179840.000000
50%      224760.000000
75%      301133.000000
max      913052.000000
Name: duration_ms, dtype: float64

This one is also not very interesting- we'll drop it once the data is imported in MySQL

In [49]:
for i in range(len(final_keys)):
  
    # replace key with key_to_remove
    if final_keys[i] == 'key_to-delete':
        final_keys[i] = 'key_to_delete'
  
    # replace pant with ishan
    if final_keys[i] == 'mode_to-delete':
        final_keys[i] = 'mode_to_delete'


        # replace pant with ishan
    if final_keys[i] == 'type_to-delete':
        final_keys[i] = 'type_to_delete' 
        
    
final_keys

['Unnamed: 0',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'duration_ms',
 'time_signature',
 'genre',
 'song_name',
 'Unnamed: 0.1',
 'title']

### Create new content for create statement

In [50]:
final_dict= dict(zip(final_keys, final_values))

print (final_keys)
print ("CREATE TABLE genres(")
for item in final_dict:
    print (item, final_dict[item]+",")
print (");")



['Unnamed: 0', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'duration_ms', 'time_signature', 'genre', 'song_name', 'Unnamed: 0.1', 'title']
CREATE TABLE genres(
Unnamed: 0 float,
danceability float,
energy float,
key float,
loudness float,
mode float,
speechiness float,
acousticness float,
instrumentalness float,
liveness float,
valence float,
tempo float,
type char,
id char,
duration_ms float,
time_signature float,
genre char,
song_name char,
Unnamed: 0.1 float,
title char,
);


In [51]:
spotify_df_clean = spotify_df.replace(',','', regex=True)


In [52]:
spotify_df_new = pd.read_csv('spotify-data/genres-final.csv', low_memory=False)
spotify_df_new = spotify_df_new.replace(',','', regex=True)
spotify_df_new.to_csv('/Users/rebeccahayes/Projects/github/prog-notes/sql/spotify/spotify-data/genres-final.csv')






In [53]:
spotify_df_new = spotify_df_new.replace(np.nan, '', regex=True) # All data frame

spotify_df_new.to_csv('/Users/rebeccahayes/Projects/github/prog-notes/sql/spotify/spotify-data/genres-final.csv')

spotify_df_new['type'].describe()





count              42305
unique                 1
top       audio_features
freq               42305
Name: type, dtype: object

In [55]:
spotify_df.loc[4560]


Unnamed: 0                            4560
danceability                         0.487
energy                               0.373
key                                      5
loudness                           -10.327
mode                                     0
speechiness                          0.046
acousticness                         0.205
instrumentalness                   0.00232
liveness                            0.0772
valence                              0.313
tempo                              193.544
type                        audio_features
id                  5qAWdile6xrEs9R06YQL6l
duration_ms                         146667
time_signature                           5
genre                            Dark Trap
song_name               Theonlytimeiseeyou
Unnamed: 0.1                           NaN
title                                  NaN
Name: 4560, dtype: object