In [38]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd

music_df = pd.read_csv('music.csv', index_col = 0)
music_dummies = pd.get_dummies(music_df['genre'], drop_first=True)

#music_dummies.head()
music_dummies = pd.concat([music_df, music_dummies], axis = 1)
music_dummies = music_dummies.drop('genre', axis=1)
#music_dummies.head()
print(music_dummies.columns)

#from sklearn.model_selection import cross_val_score, KFold
#from sklearn.linear_model import LinearRegression

X = music_dummies.drop('popularity', axis=1).values
y = music_dummies['popularity'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,
                                                   random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
linreg = LinearRegression()
linreg_cv = cross_val_score(linreg, X_train, y_train, cv=kf,
                           scoring='neg_mean_squared_error')

print(np.sqrt(-linreg_cv))


Index(['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'Anime', 'Blues', 'Classical', 'Country', 'Electronic',
       'Hip-Hop', 'Jazz', 'Rap', 'Rock'],
      dtype='object')
[8.15810501 8.63114581 7.52281687 8.62016985 7.91296943]


In [39]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

X_cat = music_df["genre"].values.reshape(-1, 1)
X_num = music_df.drop(["genre", "popularity"], axis=1).values
y = music_df["popularity"].values
X_train_cat, X_test_cat, y_train, y_test = train_test_split(X_cat, y, test_size=0.2,
                                               random_state=12)
X_train_num, X_test_num, y_train, y_test = train_test_split(X_num, y, test_size=0.2,
                                               random_state=12)

imp_cat = SimpleImputer(strategy="most_frequent")
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

le = LabelEncoder()
X_train_cat = le.fit_transform(X_train_cat)
X_test_cat = le.transform(X_test_cat)
X_train_cat = X_train_cat.reshape(-1, 1)
X_test_cat = X_test_cat.reshape(-1, 1)

imp_num = SimpleImputer()
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

X_train = np.append(X_train_num, X_train_cat, axis=1)
X_test = np.append(X_test_num, X_test_cat, axis=1)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(knn.score(X_test, y_test))


0.015


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [40]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
music_df = pd.read_csv('music_unclean.csv', index_col = 0)
music_df = music_df.dropna(subset=['genre','popularity','loudness','liveness','tempo'])
music_df['genre'] = np.where(music_df['genre'] == 'Rock', 1, 0)
X = music_df.drop('genre', axis = 1).values
y = music_df['genre'].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

#imp = SimpleImputer()
#logres = LogisticRegression()
#steps = [('imputationâ€™, imp), ('Log_reg', logres)]

steps = [('imputation', SimpleImputer()),
         ('Log_reg', LogisticRegression(max_iter = 100000))]

pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(pipeline.score(X_test,y_test))


[[105  30]
 [ 11 122]]
0.8470149253731343


In [41]:
music_df = pd.read_csv('music_unclean.csv', index_col = 0)
#print(music_df.isna().sum().sort_values())

music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])

print(music_df.isna().sum().sort_values())
print("Shape of the `music_df`: {}".format(music_df.shape))

# Convert genre to a binary feature
music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)

X = music_df.drop('genre', axis = 1).values
y = music_df['genre'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
# Instantiate an imputer
imputer = SimpleImputer()
X_train = imputer.fit_transform(X_train) ###################
X_test = imputer.transform(X_test)          ###################

# Instantiate a knn model
knn = KNeighborsClassifier(n_neighbors=3)

# Build steps for the pipeline
steps = [("imputer", imputer), 
         ("knn", knn)]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the confusion matrix

print(confusion_matrix(y_test, y_pred))
print(pipeline.score(X_test,y_test))

#print(X_train.shape)
columns = ['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'genre']
check = pd.DataFrame(X_train, columns = columns)
print(check.isna().sum().sort_values())


popularity            0
loudness              0
liveness              0
tempo                 0
genre                 0
duration_ms          29
instrumentalness     29
speechiness          53
danceability        127
valence             127
energy              178
acousticness        178
dtype: int64
Shape of the `music_df`: (892, 12)
[[78 57]
 [55 78]]
0.582089552238806
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
genre               0
dtype: int64


In [42]:
# Repeat task from cell 4 WITHOUT using pipeline
# AND find best K with test_size = 0.2 and 0.3

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load and prepare data (same as cell 4)
music_df = pd.read_csv('music_unclean.csv', index_col = 0)
music_df = music_df.dropna(subset=["genre", "popularity", "loudness", "liveness", "tempo"])
music_df["genre"] = np.where(music_df["genre"] == "Rock", 1, 0)

X = music_df.drop('genre', axis = 1).values
y = music_df['genre'].values

# Test different K values and test sizes
print("Finding best K for different test sizes...\n")

test_sizes = [0.2, 0.3]
k_values = range(1, 20)  # Test K from 1 to 19

for test_size in test_sizes:
    print(f"Test size: {test_size}")
    
    # Split's data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Impute missing values (without pipeline)
    imputer = SimpleImputer()
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    
    # Test different K values
    best_k = None
    best_score = 0
    best_confusion = None
    
    for k in k_values:
        # Instantiate and fit KNN (without pipeline)
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train_imputed, y_train)
        
        # Make predictions
        y_pred = knn.predict(X_test_imputed)
        
        # Calculate score
        score = knn.score(X_test_imputed, y_test)
        
        # Track best K
        if score > best_score:
            best_score = score
            best_k = k
            best_confusion = confusion_matrix(y_test, y_pred)
    
    print(f"Best K value: {best_k}")
    print(f"Best accuracy: {best_score:.4f}")
    print(f"Confusion matrix for best K:\n{best_confusion}")
    print()

# Use test_size = 0.3 as in original cell 4
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 1: Impute missing values
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("Missing values in X_train before imputation:", np.isnan(X_train).sum().sum())
print("Missing values in X_train after imputation:", np.isnan(X_train_imputed).sum().sum())

# Step 2: Train KNN model
knn = KNeighborsClassifier(n_neighbors=3)  # Same as cell 4
knn.fit(X_train_imputed, y_train)

# Step 3: Make predictions
y_pred = knn.predict(X_test_imputed)

# Step 4: Print results
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"\nModel Accuracy: {knn.score(X_test_imputed, y_test):.4f}")

# Verify no missing values remain
check = pd.DataFrame(X_train_imputed, columns=[
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
    'valence', 'genre'
])
print(f"\nMissing values in each column after imputation:")
print(check.isna().sum().sort_values())

Finding best K for different test sizes...

Test size: 0.2
Best K value: 1
Best accuracy: 0.6145
Confusion matrix for best K:
[[52 42]
 [27 58]]

Test size: 0.3
Best K value: 1
Best accuracy: 0.5896
Confusion matrix for best K:
[[72 63]
 [47 86]]

Missing values in X_train before imputation: 501
Missing values in X_train after imputation: 0

Confusion Matrix:
[[78 57]
 [55 78]]

Model Accuracy: 0.5821

Missing values in each column after imputation:
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
genre               0
dtype: int64
