In [33]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Preprocessing

In [10]:
# Reset the container, then pull every raw text line of the dataset into memory.
data_read = []
with open('yearpredictionmsd/YearPredictionMSD.txt', 'r') as file:
    lines = file.readlines()

# Tokenise each record: trim surrounding whitespace, then split on commas.
for record in lines:
    data_read.append(record.strip().split(","))

In [15]:
# Compare the parsed data with the published dataset specs: report the number
# of rows and the number of feature columns (everything after column 0).
instance_count = len(data_read)
first_row_features = data_read[0][1:]
print("Total number of instance ", instance_count)
print("Total number of features ", len(first_row_features))

Total number of instance  515345
Total number of features  90


### Checking for any missing attributes -> each instance should have 91 columns: index 0 is the year, indices 1-90 are the 90 features

In [20]:
# Collect the row index of every record that does not have exactly 91 columns.
anomalies = [
    row_index
    for row_index, row in enumerate(data_read)
    if len(row) != 91
]
print("No Anomalies detected: ", not anomalies)

No Anomalies detected:  True


### Split the data into training and testing sets: per the data set instructions, the first 463,715 instances are used for training and the rest for testing. Then separate the year from the features, converting types.

In [34]:
# Row index at which the training partition ends and the testing partition begins.
split_at = 463715
training, testing = np.array(data_read[:split_at]), np.array(data_read[split_at:])

In [29]:
# Report the size of each partition next to the total number of parsed rows.
split_report = f"length of training {len(training)}, length of testing {len(testing)}, total {len(data_read)}"
print(split_report)

length of training 463715, length of testing 51630, total 515345


In [54]:
def data_cleaning(data):
    """Split raw records into integer year labels and floating-point features.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_columns)
        Two-dimensional table of values (typically numeric strings).
        Column 0 holds the year; every remaining column is a feature.

    Returns
    -------
    years : numpy.ndarray of shape (n_samples,)
        Column 0 converted to integers.
    features : numpy.ndarray of shape (n_samples, n_columns - 1)
        The remaining columns converted to ``numpy.double``.

    Raises
    ------
    ValueError
        If a year cannot be parsed as an integer or a feature cannot be
        parsed as a float.
    """
    # Accept nested lists as well as arrays; an ndarray passes through uncopied.
    data = np.asarray(data)
    # Convert whole columns in one vectorised step rather than looping over
    # hundreds of thousands of rows in Python.
    years = data[:, 0].astype(int)
    features = data[:, 1:].astype(np.double)
    return years, features


In [55]:
# Convert each partition into its (year labels, feature matrix) pair.
(training_year, training_features), (testing_year, testing_features) = (
    data_cleaning(training),
    data_cleaning(testing),
)

### For the training data, first normalize the attributes. There are 12 timbre average features and 78 timbre covariance features. Normalize each group separately, then combine them.

In [56]:
# Standardise the training features. The leading columns (timbre averages)
# and the remaining columns (timbre covariances) each get their own scaler.
# Both fitted scalers are kept as named variables so the statistics learned
# from the training data remain available after this cell.
N_TIMBRE_AVG = 12  # number of leading timbre-average columns

training_avg = training_features[:, :N_TIMBRE_AVG]
training_cov = training_features[:, N_TIMBRE_AVG:]

training_scale_avg = StandardScaler()
training_scale_cov = StandardScaler()

training_avg_norm = training_scale_avg.fit_transform(training_avg)
training_cov_norm = training_scale_cov.fit_transform(training_cov)
training_features_norm = np.hstack((training_avg_norm, training_cov_norm))

# Recombining the two groups must neither drop nor add rows or columns.
assert training_features_norm.shape == training_features.shape
print(training_features_norm.shape)

(463715, 90)
