In [None]:
# a) Load data
from google.colab import drive
import numpy as np

# Mount Google Drive so the dataset stored there is reachable from this runtime.
drive.mount('/content/drive')

# Parse the comma-separated file into a 2-D array (genfromtxt's default dtype
# is float; fields it cannot parse become NaN).
iris_csv_path = '/content/drive/My Drive/Colab Notebooks/data/iris.csv'
with open(iris_csv_path, 'r') as f:
  tmp = np.genfromtxt(f, delimiter=',')

# Every column but the last goes to X; the last column goes to y.
X, y = tmp[:, :-1], tmp[:, -1]

In [5]:
# Make printed NumPy arrays easier to read: show 3 digits after the decimal
# point and suppress scientific notation for small values.
np.set_printoptions(
  suppress=True,
  precision=3,
)

In [None]:
# b) number of samples, features dimension, the number of classes

# Distinct values found in y, together with how many samples carry each one.
label, labelCount = np.unique(y, return_counts=True)

# X is indexed as (rows, columns): rows are counted as samples, columns as features.
n_samples, n_features = X.shape
print('num of samples: ', n_samples)
print('num of feature dimensions: ', n_features)
print('num of classes:', len(label))

# One line per class; %i truncates the float label/count to an integer for display.
for cls, count in zip(label, labelCount):
  print('class %i has %i samples'%(cls, count))


In [None]:
# c) check nan, data imputation
from sklearn.impute import KNNImputer

# Count the missing entries once; the count decides whether imputation runs.
nan_total = np.sum(np.isnan(X))

if not nan_total:
  # Nothing to fill in: X1 is simply another name for X (no copy is made).
  print('no NaN')
  X1 = X
else:
  print('Total of NaN before imputation:', nan_total)
  # Fill each missing value from the 2 nearest samples, weighted equally.
  imputer = KNNImputer(n_neighbors=2, weights="uniform")
  X1 = imputer.fit_transform(X)
  # Sanity check: this should report 0 if every gap was filled.
  print('Total of NaN after imputation:', np.sum(np.isnan(X1)))

In [None]:
# d) partition 80/20
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. `stratify=y` keeps the class
# proportions the same in both partitions. The shuffle is seeded with a fixed
# `random_state` (the same value the k-fold cell uses) so that the partition,
# and every number computed from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
  X1, y, train_size=0.8, shuffle=True, stratify=y, random_state=1)

print('training data size: ', X_train.shape[0])
print('testing data size: ', X_test.shape[0])

In [None]:
# e) standardize
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training partition only, then rescale that partition
# with the statistics it just learned.
scalerS = StandardScaler()
X_train_standardized = scalerS.fit_transform(X_train)
train_mean = X_train_standardized.mean(axis=0)
train_std = X_train_standardized.std(axis=0)
print('mean training data in each dimension, after standardization:', train_mean)
print('std training data in each dimension, after standardization:', train_std, '\n')

# Reuse the already-fitted scaler on the test partition (transform only, no
# refit), so the test data is rescaled with the training statistics.
X_test_standardized = scalerS.transform(X_test)
test_mean = X_test_standardized.mean(axis=0)
test_std = X_test_standardized.std(axis=0)
print('mean testing data in each dimension, after standardization:', test_mean)
print('std testing data in each dimension, after standardization:', test_std)

In [None]:
# f) k fold
from sklearn.model_selection import KFold

# 3-fold split of the standardized training set; the shuffle is seeded so the
# folds are the same on every run. For each fold, report how many samples of
# each class landed in the fitting part and in the held-out validation part.
kf = KFold(n_splits=3, shuffle=True, random_state=1)
fold_indices = kf.split(X_train_standardized)
for fold, (fit_idx, val_idx) in enumerate(fold_indices):
  # np.unique(..., return_counts=True) returns (values, counts); keep the counts.
  fit_counts = np.unique(y_train[fit_idx], return_counts=True)[1]
  val_counts = np.unique(y_train[val_idx], return_counts=True)[1]
  print('partition: ', fold)
  print('training samples\' class distribution: ', fit_counts)
  print('validation samples\' class distribution: ', val_counts, '\n')

In [None]:
# Done with the files on Drive: flush any pending writes and unmount it.
drive.flush_and_unmount()