# Logistic Regression 

## Import Jobs and get Dataframe

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

#For Big Query
from google.cloud import bigquery
from google.oauth2 import service_account

#For ML Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [4]:
# Connect to BQ

# Location of the service account JSON key file.
# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable to the path of your
# local copy of the key so the notebook runs on any machine without editing this cell.
# If the variable is not set, the original hardcoded location is used as a fallback.
# Make sure that the slashes in the path are '/' and not '\'.
credentials_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:/Users/miria/Desktop/music-recommendation-system-24-3d0d21fb1f8b.json')

credentials = service_account.Credentials.from_service_account_file(credentials_path)

# Client bound to the project that hosts the ML tables.
project_id = 'music-recommendation-system-24'
client = bigquery.Client(credentials=credentials, project=project_id)

In [5]:
# Query BQ: pull every column of the de-duplicated song list view.
sql = """
   SELECT 
      *
   FROM `music-recommendation-system-24.ml_tables_eu.song_list_obama_wo_duplicates_view`"""

query_job = client.query(sql)

# Block until the job has finished and the result rows are available.
results = query_job.result()

# One plain dict per result row ...
rows = list(map(dict, results))

# ... collected into a single DataFrame in one step.
df_bq = pd.DataFrame(rows)

In [11]:
# Inspect the dtype of every column in the loaded table.
df_bq.dtypes


acousticness                float64
danceability                float64
duration_min                float64
energy                      float64
genres                       object
instrumentalness            float64
key_name                     object
liveness                    float64
loudness                    float64
mode                         object
speechiness                 float64
track_album_name             object
track_album_release_year     object
track_artist                 object
track_id                     object
track_name                   object
track_popularity              int64
tempo                         int64
valence                     float64
in_obama_playlist             int64
dtype: object

## Build Model

In [None]:
# Split the table into the feature matrix X and the target y.
#
# Columns deliberately left out of X:
#   - genres (too much difference between the original tables)
#   - track_album_name
#   - track_album_release_year (too much difference between the original tables)
#   - track_id and track_name (no value for the model)
#   - in_obama_playlist (this is the target y)
feature_columns = [
    'acousticness',
    'danceability',
    'duration_min',
    'energy',
    'instrumentalness',
    'key_name',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'track_artist',
    'track_popularity',
    'tempo',
    'valence',
]
target_columns = ['in_obama_playlist']

X = df_bq[feature_columns]
# A list selector keeps y as a one-column DataFrame (not a Series).
y = df_bq[target_columns]

In [18]:
# Hold out 33% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [21]:
# Partition the train and test features by dtype: int64/float64 columns are
# treated as numeric, every other column as non-numeric.
numeric_dtypes = ['int64', 'float64']

X_train_numeric, X_test_numeric = (
    frame.select_dtypes(include=numeric_dtypes) for frame in (X_train, X_test)
)

X_train_non_numeric, X_test_non_numeric = (
    frame.select_dtypes(exclude=numeric_dtypes) for frame in (X_train, X_test)
)

In [26]:
# Standardise the numeric columns and wrap the results in DataFrames.
# The scaler is fitted on the training rows only and then applied unchanged
# to the test rows.
train_scaled_values = scaler.fit_transform(X_train_numeric)
test_scaled_values = scaler.transform(X_test_numeric)

# The scaler returns plain arrays, so the column labels are re-attached here.
# The resulting frames carry a fresh 0..n-1 index.
X_train_numeric_scaled = pd.DataFrame(
    train_scaled_values,
    columns=X_train_numeric.columns,
)
X_test_numeric_scaled = pd.DataFrame(
    test_scaled_values,
    columns=X_test_numeric.columns,
)


In [None]:
# One-hot encode the non-numeric columns and wrap the results in DataFrames.
# The encoder is fitted on the training rows only; it was created with
# handle_unknown='ignore', so categories first seen in the test rows do not raise.
train_encoded_values = enc.fit_transform(X_train_non_numeric)
test_encoded_values = enc.transform(X_test_non_numeric)

# Generated column names (e.g. key_name_A), shared by both frames.
ohe_column_names = enc.get_feature_names_out()

X_train_non_numeric_ohe = pd.DataFrame(train_encoded_values, columns=ohe_column_names)
X_test_non_numeric_ohe = pd.DataFrame(test_encoded_values, columns=ohe_column_names)

In [29]:
# Preview the one-hot encoded test features.
X_test_non_numeric_ohe

Unnamed: 0,key_name_A,key_name_A#,key_name_B,key_name_C,key_name_C#,key_name_D,key_name_D#,key_name_E,key_name_F,key_name_F#,...,track_artist_Руслан Черный,track_artist_Сережа Местный,track_artist_オメガトライブ,track_artist_リアムMAZE1981,track_artist_広瀬大地,track_artist_片寄涼太,track_artist_真之介,track_artist_空音,track_artist_竹内アンナ,track_artist_落日飛車 Sunset Rollercoaster
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9431,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Put the scaled numeric columns and the one-hot encoded columns side by side
# to form the final train and test feature matrices.
train_parts = [X_train_numeric_scaled, X_train_non_numeric_ohe]
test_parts = [X_test_numeric_scaled, X_test_non_numeric_ohe]

X_train_normalised = pd.concat(train_parts, axis='columns')
X_test_normalised = pd.concat(test_parts, axis='columns')