### Preprocessing Data

#### Creating dummy variables

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [3]:
# Read music_clean.csv (path is relative to the working directory) into a DataFrame.
music_df = pd.read_csv('music_clean.csv')

In [20]:

# Show the frame's (rows, columns) shape followed by its first five rows.
#music_df.drop('Unnamed: 0',axis=1, inplace=True)
print(music_df.shape, music_df.head(), sep='\n')

(1000, 12)
   popularity  acousticness  danceability  duration_ms  energy  \
0        60.0      0.896000         0.726     214547.0   0.177   
1        63.0      0.003840         0.635     190448.0   0.908   
2        59.0      0.000075         0.352     456320.0   0.956   
3        54.0      0.945000         0.488     352280.0   0.326   
4        55.0      0.245000         0.667     273693.0   0.647   

   instrumentalness  liveness  loudness  speechiness    tempo  valence  genre  
0          0.000002    0.1160   -14.824       0.0353   92.934    0.618      1  
1          0.083400    0.2390    -4.795       0.0563  110.012    0.637      1  
2          0.020300    0.1250    -3.634       0.1490  122.897    0.228      1  
3          0.015700    0.1190   -12.020       0.0328  106.063    0.323      1  
4          0.000297    0.0633    -7.787       0.0487  143.995    0.300      1  


##### df_dummies = pd.get_dummies(df['column'], drop_first = True)
##### df_dummies = pd.concat([df, df_dummies], axis = 1)
##### print(f"The shape of dataframe dummies: {df_dummies.shape}")
##### print(df_dummies)

##### For a df with only one categorical variable:
##### df_dummies = pd.get_dummies(df, drop_first = True)

In [7]:
from sklearn.linear_model import Ridge

In [15]:
# Target is popularity; every other column is used as a feature.
y = music_df['popularity']
X = music_df.drop(columns='popularity')

# Shuffled 6-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=123)
ridge = Ridge(alpha=0.2)

# Score with negated MSE instead of the default R^2, then flip the sign and
# take the square root to obtain one RMSE value per fold.
cv = cross_val_score(ridge, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse = np.sqrt(-cv)

# Compare the average error against the spread of the target itself.
print(
    f"Average RMSE: {rmse.mean()}",
    '\n',
    f"Standard deviation for target array: {np.std(y)}",
)

Average RMSE: 10.29418369242257 
 Standard deviation for target array: 14.02156909907019


##### Handling missing data

In [24]:
# df.isna().sum().sort_values()
# df.dropna(subset=[list_of_columns_with_missing_values])

# Only drop rows this way when the missing values account for <= 5% of the total sample.


1    500
0    500
Name: genre, dtype: int64

##### Imputing missing data 
##### Imputation replaces all missing data in a column with an educated guess (usually the mean for numeric variables, and the most frequent value, or mode, for categorical variables).


(Imputation has to be done after splitting the data to avoid data leakage, i.e. information from the test set being revealed to the training set before fitting.)



##### Imputation for categorical data using most frequent strategy

In [22]:
# X_cat = df[[list_containing_column_names_of_categorial_features]].values
# y = df.target.values
# X_cat_train, X_cat_test, y_train, y_test = train_test_split(X_cat, y, test_size = 0.2, random_state = 42)
# imp_cat = SimpleImputer(strategy = 'most_frequent')
# X_cat_train = imp_cat.fit_transform(X_cat_train)
# X_cat_test = imp_cat.transform(X_cat_test)


##### Imputation for numeric variables using the default mean (median can also be used)



In [21]:
# X_num = df[[list_containing_column_names_of_numeric_features]].values
# X_num_train, X_num_test, y_train, y_test = train_test_split(X_num, y, test_size = 0.2, random_state = 42)
# imp_num = SimpleImputer()
# X_num_train = imp_num.fit_transform(X_num_train)
# X_num_test = imp_num.transform(X_num_test)

Then, to create a unified X_train containing all features with imputed values for missing data


In [23]:
#  X_train = np.append(X_num_train, X_cat_train, axis = 1)
# X_test = np.append(X_num_test, X_cat_test, axis = 1)

##### Imputing with a Pipeline (transform the data and fit a model in one step)
##### In a pipeline, every step except the last must be a transformer.

In [26]:
#steps = [('imputer',SimpleImputer(strategy='mean')),
#         ('model', LogisticRegression())]
#pipeline = Pipeline(steps)
#X_train, X_test, y_train, y_test = train_test_split(X,y,...)
#pipeline.fit(X_train, y_train)
#predictions = pipeline.predict(X_test)

#### Centering and Scaling
Scaling our data by standardization: for every value in each column, subtract the column mean and divide by the column standard deviation, $z = (x - \mu) / \sigma$.

In [27]:
# Per-column summary statistics. The recorded output shows columns on very
# different scales (duration_ms reaches the millions while several features
# stay within 0-1), which is the motivation for scaling.
music_df.describe()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,51.66,0.258649,0.542602,217220.4,0.636464,0.137289,0.199993,-8.253305,0.077879,120.3684,0.480057,0.5
std,14.028585,0.307494,0.160322,117558.2,0.237789,0.285558,0.160435,5.158523,0.089451,28.94213,0.237854,0.50025
min,0.0,3e-06,0.0624,-1.0,0.00251,0.0,0.0254,-38.718,0.0234,56.855,0.0298,0.0
25%,43.75,0.013275,0.444,180656.2,0.48575,0.0,0.1,-9.7755,0.0331,95.90975,0.3065,0.0
50%,54.0,0.116,0.5485,216300.0,0.6765,8.9e-05,0.131,-6.855,0.0436,119.952961,0.4735,0.5
75%,62.0,0.4265,0.657,260502.5,0.8225,0.042825,0.27325,-4.97775,0.07495,140.033,0.654,1.0
max,82.0,0.996,0.95,1617333.0,0.995,0.975,0.991,-0.883,0.71,207.852,0.968,1.0


In [None]:
from sklearn.linear_model import Lasso  # was missing: Lasso(...) raised NameError
from sklearn.preprocessing import StandardScaler

# Standardize each feature first, then fit the Lasso model on the scaled
# values; the pipeline applies both steps in order on fit/predict.
steps = [("scaler", StandardScaler()),
         ("lasso", Lasso(alpha=0.5))]

pipeline = Pipeline(steps)

X = music_df.drop('genre', axis=1)

# The original cell ended with a bare `pipeline.fit`, which only references
# the method and never trains anything.
# NOTE(review): the target is presumed to be 'genre' because that is the
# column removed from X -- confirm this is the intended target. The Series is
# passed inline so the `y` defined by the earlier Ridge cell is not rebound.
pipeline.fit(X, music_df['genre'])