## Rafael Espinosa Mena
USC ID: 3587389751 <br>
GitHub Username: rafael6423 <br>
DSCI 552 HW5 <br>
Jun 26, 2022

In [1]:
import numpy as np
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, silhouette_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from scipy.spatial.distance import hamming
from tabulate import tabulate
import time

#### 1.a) Load Data and Separate it into Training and Test Set

In [2]:
# read data, dropping RecordID as it is not useful
data = pd.read_csv('../input/data-hw7/Frogs_MFCCs.csv').drop(['RecordID'], axis=1)

# separate data into predictors and target variables
x_data = data.iloc[:,:-3]
y_data = data.iloc[:,-3:]

# chose 70% of the data randomly as the training set
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0)

# start recording time out of curiosity
start = time.time()

#### 1.b.ii) Gaussian Kernel, One vs All SVM with 10-Fold CV

In [3]:
def Gaussian_SVMClassifier(label):
    y_train_label = pd.DataFrame(y_train[label])
    y_test_label = pd.DataFrame(y_test[label])
    print("Gaussian SVM for", label)

    # make model and find parameters through cv
    model = OneVsRestClassifier(SVC(kernel='rbf', random_state=0, tol=0.01))
    parameters = {"estimator__C": np.logspace(-1,3,15), "estimator__gamma": np.linspace(0.1,5,20)}
    model_cv = GridSearchCV(model, param_grid=parameters, cv=10, n_jobs=-1)
    model_cv.fit(x_train, y_train_label)
    
    print("Best C (SVM Penalty) Found:", model_cv.best_params_['estimator__C'])
    print("Best Gamma (Width of Kernel) Found:", model_cv.best_params_['estimator__gamma'])
    print("CV Score:", model_cv.best_score_)

    # create model with found values for C and gamma and report results
    best_model = SVC(kernel='rbf', random_state=0, tol=0.01, C=model_cv.best_params_['estimator__C'],
                     gamma=model_cv.best_params_['estimator__gamma'])
    best_model.fit(x_train, y_train_label.values.ravel())
    print("Test Exact Match Score is", best_model.score(x_test, y_test_label))

    y_pred = best_model.predict(x_test)
    print("Hamming Loss is", hamming_loss(y_test_label, y_pred))

In [4]:
# Family SVM
Gaussian_SVMClassifier('Family')

Gaussian SVM for Family
Best C (SVM Penalty) Found: 37.27593720314938
Best Gamma (Width of Kernel) Found: 2.936842105263158
CV Score: 0.993247656915649
Test Exact Match Score is 0.9962945808244558
Hamming Loss is 0.0037054191755442334


In [5]:
# Genus SVM
Gaussian_SVMClassifier('Genus')

Gaussian SVM for Genus
Best C (SVM Penalty) Found: 37.27593720314938
Best Gamma (Width of Kernel) Found: 2.1631578947368424
CV Score: 0.9908639433241818
Test Exact Match Score is 0.9921259842519685
Hamming Loss is 0.007874015748031496


In [6]:
# Species SVM
Gaussian_SVMClassifier('Species')

Gaussian SVM for Species
Best C (SVM Penalty) Found: 10.0
Best Gamma (Width of Kernel) Found: 2.678947368421053
CV Score: 0.9904675123860015
Test Exact Match Score is 0.9916628068550255
Hamming Loss is 0.008337193144974525


#### 1.b.iii) L1-Penalized SVMs Using One Vs All and 10-Fold CV

In [7]:
# standarize features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

# function to create and train model using cv
def L1Penalized_SVM(label):
    y_train_label = pd.DataFrame(y_train[label])
    y_test_label = pd.DataFrame(y_test[label])
    print("L1-Penalized Linear SVM for", label)

    # make model and find parameters through cv
    model = OneVsRestClassifier(LinearSVC(penalty = 'l1', random_state=0, tol=0.01, max_iter = 200000, dual=False))
    parameters = {"estimator__C": np.logspace(-1,3,15)}
    model_cv = GridSearchCV(model, param_grid=parameters, cv=10, n_jobs=-1)
    model_cv.fit(x_train, y_train_label)

    print("Best C (SVM Penalty) Found:", model_cv.best_params_['estimator__C'])
    print("CV Score:", model_cv.best_score_)

    # create model with found values for C and gamma and report results
    best_model = SVC(kernel='rbf', random_state=0, tol=0.01, C=model_cv.best_params_['estimator__C'])
    best_model.fit(x_train, y_train_label.values.ravel())
    print("Test Exact Match Score is", best_model.score(x_test, y_test_label))

    y_pred = best_model.predict(x_test)
    print("Hamming Loss is", hamming_loss(y_test_label, y_pred))

In [8]:
# Family L1-Penalized Linear SVM
L1Penalized_SVM('Family')

L1-Penalized Linear SVM for Family
Best C (SVM Penalty) Found: 5.17947467923121
CV Score: 0.9370546561898452
Test Exact Match Score is 0.9944418712366836
Hamming Loss is 0.00555812876331635


In [9]:
# Genus L1-Penalized Linear SVM
L1Penalized_SVM('Genus')

L1-Penalized Linear SVM for Genus
Best C (SVM Penalty) Found: 5.17947467923121
CV Score: 0.9529363501530501
Test Exact Match Score is 0.9925891616489115
Hamming Loss is 0.007410838351088467


In [10]:
# Species L1-Penalized Linear SVM
L1Penalized_SVM('Species')

L1-Penalized Linear SVM for Species
Best C (SVM Penalty) Found: 2.6826957952797246
CV Score: 0.9592898955473508
Test Exact Match Score is 0.9907364520611394
Hamming Loss is 0.009263547938860583


#### SMOTE L1-Penalized Linear SVMs

In [11]:
def SMOTE_L1PenalizedSVM(label):
    # apply smote on dataset
    smote = SMOTE(random_state=0, n_jobs=-1)
    y_train_label = pd.DataFrame(y_train['Family'])
    x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train_label)
    y_test_label = pd.DataFrame(y_test['Family'])
    print("SMOTE L1-Penalized Linear SVM for", label)

    # make model and find parameters through cv
    model = OneVsRestClassifier(LinearSVC(penalty = 'l1', random_state=0, tol=0.01, max_iter = 200000, dual=False))
    parameters = {"estimator__C": np.logspace(-1,3,15)}
    model_cv = GridSearchCV(model, param_grid=parameters, cv=10, n_jobs=-1)
    model_cv.fit(x_train_smote, y_train_smote)

    print("Best C (SVM Penalty) Found:", model_cv.best_params_['estimator__C'])
    print("CV Score:", model_cv.best_score_)

    # create model with found values for C and gamma and report results
    best_model = SVC(kernel='rbf', random_state=0, tol=0.01, C=model_cv.best_params_['estimator__C'])
    best_model.fit(x_train_smote, y_train_smote.values.ravel())
    print("Test Exact Match Score is", best_model.score(x_test, y_test_label))

    y_pred = best_model.predict(x_test)
    print("Hamming Loss is", hamming_loss(y_test_label, y_pred))

In [12]:
# SMOTE Family L1-Penalized Linear SVM
SMOTE_L1PenalizedSVM('Family')

SMOTE L1-Penalized Linear SVM for Family
Best C (SVM Penalty) Found: 37.27593720314938
CV Score: 0.9514232672459165
Test Exact Match Score is 0.9949050486336267
Hamming Loss is 0.005094951366373321


In [13]:
# SMOTE Genus L1-Penalized Linear SVM
SMOTE_L1PenalizedSVM('Genus')

SMOTE L1-Penalized Linear SVM for Genus
Best C (SVM Penalty) Found: 37.27593720314938
CV Score: 0.9514232672459165
Test Exact Match Score is 0.9949050486336267
Hamming Loss is 0.005094951366373321


In [14]:
# SMOTE Species L1-Penalized Linear SVM
SMOTE_L1PenalizedSVM("Species")

SMOTE L1-Penalized Linear SVM for Species
Best C (SVM Penalty) Found: 37.27593720314938
CV Score: 0.9514232672459165
Test Exact Match Score is 0.9949050486336267
Hamming Loss is 0.005094951366373321


#### 2.a) K-Means Clustering With Automatic K Selection Through Silhoutte Method

In [15]:
# function to determine the optimal k
def find_optimal_k(i):
    best_score = -1
    for k in range(2,51):
        km = KMeans(n_clusters=k, random_state=i, tol=0.01)
        km.fit_predict(x_data)
        score = silhouette_score(x_data, km.labels_, metric='euclidean')
        # select the k with best silhouette score
        if score > best_score:
            best_score = score
            best_k = k
            km_best = km
    return km_best, best_k

#### 2.b) Determine Majority Family, Genus or Species in Each Cluster and Assign to Observation

In [16]:
def assign_cluster_to_label(label, km_model, best_k):
    label_list = y_data[label].value_counts().reset_index()['index'].values
    y = pd.DataFrame(y_data[label])
    zero_mat = np.zeros((best_k,len(label_list)),dtype=int)
    df = pd.DataFrame(zero_mat, columns=label_list)
    # Separate observations into true group by reading labels
    for cluster_num in range(0,best_k):
        for i in range(0,len(km_model.labels_)):
            if km_model.labels_[i] == cluster_num:
                df.loc[cluster_num, y.iloc[i,0]] += 1
    cluster_class = pd.DataFrame(np.zeros((best_k,1),dtype=int), columns=['cluster_type'])
    y['predicted'] = 0
    # determine which group is the majority in each cluster
    for cluster_num in range(0,best_k):
        max_col = df.iloc[cluster_num,:].idxmax()
        for i in range(0, len(km_model.labels_)):
            # create predicted labels for each observation based on cluster majority
            if km_model.labels_[i] == cluster_num:
                y.loc[i, 'predicted'] = max_col
    return y

#### 2.c) Calculate Hamming Metrics

In [17]:
# get 3 hamming metrics
def hamming_metrics(label, pred):
    hamming_distance = hamming(pred, y_data[label].values) * len(pred)
    hamming_lo = hamming_loss(pred, y_data[label].values)
    hamming_score = 1 - hamming_lo
    return hamming_distance, hamming_score, hamming_lo

#### Run The Monte Carlo Procedure 50 Times and Report Average Hamming Metrics

In [18]:
# create empty lists to store results
hd_list = []
hs_list = []
hl_list = []
x_data_0 = x_data
y_data_0 = y_data 

# iterate through a list of 50 different random states
np.random.seed(0)
monte_carlo_iterations = np.random.randint(0,2**32 - 1,50)
for iteration in monte_carlo_iterations:
    x_data = x_data_0
    km_best, best_k = find_optimal_k(iteration)
    for label in ['Family', 'Genus', 'Species']:
        y_data = y_data_0
        y_df = assign_cluster_to_label(label, km_best, best_k)
        # calculate and store results for every single iteration
        hd, hs, hl = hamming_metrics(label, y_df['predicted'].values)
        hd_list.append(hd)
        hs_list.append(hs)
        hl_list.append(hl)
        
# report the average and std of every metric
data = [['Hamming Distance', np.array(hd_list).mean(), np.array(hd_list).std()],
     ['Hamming Score', np.array(hs_list).mean(), np.array(hs_list).std()], 
     ['Hamming Loss', np.array(hl_list).mean(), np.array(hl_list).std()]]
col_names = ["Measurement", "Average", "Std"]
print("Monte-Carlo Simulation Results:\n")
print(tabulate(data, headers=col_names))

Monte-Carlo Simulation Results:

Measurement           Average          Std
----------------  -----------  -----------
Hamming Distance  1629.69      214.507
Hamming Score        0.773496    0.0298133
Hamming Loss         0.226504    0.0298133


In [19]:
# record time taken out of curiosity
end = time.time()
print("Time Taken (seconds):", end-start)

Time Taken (seconds): 10408.925993919373
