In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
def test_type_NN(type_1, type_2, type_3, hidden_layers, TEST_SIZE=0.2, resample=None):
    df = pd.read_csv("pokemon_text_cleaned.csv")
    subset = pd.DataFrame({'DexNumber':df['DexNumber'], 'Name':df['Name'], 'FlavorText':df['FlavorText']})

    RAND_STATE = 13
    MAX_ITERATIONS = 700

    if (type_1 == type_2) | (type_1 == type_3) | (type_2 == type_3):
        return "Error: Input types must be different!"

    classes = [type_1, type_2, type_3]

    for i in classes:
        subset[i] = df[i]

    test_array = np.zeros_like(np.asarray(subset[classes[0]]))
    for i,name in enumerate(classes):
        test_array = test_array + np.asarray(subset[name])
    subset = subset[pd.Series(test_array) == 1]

    target_array = np.zeros_like(np.asarray(subset[classes[0]]))
    for i,name in enumerate(classes):
        target_array = target_array + (i)*np.asarray(subset[name])
    
    features = np.asarray(subset['FlavorText'])
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(features)

    X_train, X_test, y_train, y_test = train_test_split(embeddings, target_array, random_state=RAND_STATE, test_size=TEST_SIZE, stratify=target_array)

    if resample == 'Over':
        print('Over')
        resampler = RandomOverSampler(random_state=RAND_STATE)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif resample == 'Under':
        print('Under')
        resampler = RandomUnderSampler(random_state=RAND_STATE)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    else:
        print('None')

    output = []
    for layer in hidden_layers:
        print(layer)
        classifier = MLPClassifier(random_state=RAND_STATE, hidden_layer_sizes=layer, max_iter=MAX_ITERATIONS).fit(X_train, y_train)
        y_predict = classifier.predict(X_test)
        acc = accuracy_score(y_test,y_predict)
        bal_acc = balanced_accuracy_score(y_test,y_predict)
        f1_s = f1_score(y_test,y_predict, average='weighted')
        output.append((layer, acc, bal_acc, f1_s))

    return output

In [3]:
def test_type_NB(type_1, type_2, type_3, TEST_SIZE=0.2, resample=None):
    df = pd.read_csv("pokemon_text_cleaned.csv")
    subset = pd.DataFrame({'DexNumber':df['DexNumber'], 'Name':df['Name'], 'FlavorText':df['FlavorText']})

    RAND_STATE = 13

    if (type_1 == type_2) | (type_1 == type_3) | (type_2 == type_3):
        return "Error: Input types must be different!"

    classes = [type_1, type_2, type_3]

    for i in classes:
        subset[i] = df[i]

    test_array = np.zeros_like(np.asarray(subset[classes[0]]))
    for i,name in enumerate(classes):
        test_array = test_array + np.asarray(subset[name])
    subset = subset[pd.Series(test_array) == 1]

    target_array = np.zeros_like(np.asarray(subset[classes[0]]))
    for i,name in enumerate(classes):
        target_array = target_array + (i)*np.asarray(subset[name])
    
    features = np.asarray(subset['FlavorText'])
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(features)

    X_train, X_test, y_train, y_test = train_test_split(embeddings, target_array, random_state=RAND_STATE, test_size=TEST_SIZE, stratify=target_array)

    if resample == 'Over':
        print('Over')
        resampler = RandomOverSampler(random_state=RAND_STATE)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif resample == 'Under':
        print('Under')
        resampler = RandomUnderSampler(random_state=RAND_STATE)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    else:
        print('None')
    
    classifier = GaussianNB().fit(X_train, y_train)
    y_predict = classifier.predict(X_test)
    acc = accuracy_score(y_test,y_predict)
    bal_acc = balanced_accuracy_score(y_test,y_predict)
    f1_s = f1_score(y_test,y_predict, average='weighted')
    return (acc, bal_acc, f1_s)

In [4]:
def test_type_LogReg(type_1, type_2, type_3, c_vals, TEST_SIZE=0.2, resample=None):
    df = pd.read_csv("pokemon_text_cleaned.csv")
    subset = pd.DataFrame({'DexNumber':df['DexNumber'], 'Name':df['Name'], 'FlavorText':df['FlavorText']})

    RAND_STATE = 13
    MAX_ITERATIONS = 700

    if (type_1 == type_2) | (type_1 == type_3) | (type_2 == type_3):
        return "Error: Input types must be different!"

    classes = [type_1, type_2, type_3]

    for i in classes:
        subset[i] = df[i]

    test_array = np.zeros_like(np.asarray(subset[classes[0]]))
    for i,name in enumerate(classes):
        test_array = test_array + np.asarray(subset[name])
    subset = subset[pd.Series(test_array) == 1]

    target_array = np.zeros_like(np.asarray(subset[classes[0]]))
    for i,name in enumerate(classes):
        target_array = target_array + (i)*np.asarray(subset[name])
    
    features = np.asarray(subset['FlavorText'])
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(features)

    X_train, X_test, y_train, y_test = train_test_split(embeddings, target_array, random_state=RAND_STATE, test_size=TEST_SIZE, stratify=target_array)

    if resample == 'Over':
        print('Oversample')
        resampler = RandomOverSampler(random_state=RAND_STATE)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif resample == 'Under':
        print('Undersample')
        resampler = RandomUnderSampler(random_state=RAND_STATE)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    else:
        print('None')
        

    output = []
    for C_VAL in c_vals:
        print(C_VAL)
        classifier = LogisticRegression(max_iter=MAX_ITERATIONS, random_state=RAND_STATE, solver='sag', C=C_VAL).fit(X_train, y_train)
        y_predict = classifier.predict(X_test)
        acc = accuracy_score(y_test,y_predict)
        bal_acc = balanced_accuracy_score(y_test,y_predict)
        f1_s = f1_score(y_test,y_predict, average='weighted')
        output.append((C_VAL, acc, bal_acc, f1_s))

    return output

In [5]:
type_1 = 'Grass'
type_2 = 'Water'
type_3 = 'Fire'
NUM_FOLDS = 5
resample = 'Over'

C_vals = [1,2,4,8,16,32]
hidden_layers = [[300],[400],[500],[600],[700]]
# for i in range(3,8):
#     for j in range(3,8):
#         hidden_layers.append([i*100,j*100])

cv1 = test_type_LogReg(type_1, type_2, type_3,C_vals,resample=resample)
cv2 = test_type_NB(type_1, type_2, type_3,resample=resample)
cv3 = test_type_NN(type_1, type_2, type_3,hidden_layers,resample=resample)

Oversample
1
2
4
8
16
32
Over
Over
[300]
[400]
[500]
[600]
[700]


In [6]:
print('Logistic Regression')
print(' ')
for out in cv1:
    print('C = ' + str(out[0]))
    print('F1')
    print(f"{out[3]:.4f}")
    print('Balanced Accuracy')
    print(f"{out[2]:.4f}")
    print(' ')

Logistic Regression
 
C = 1
F1
0.8457
Balanced Accuracy
0.8401
 
C = 2
F1
0.8523
Balanced Accuracy
0.8476
 
C = 4
F1
0.8471
Balanced Accuracy
0.8424
 
C = 8
F1
0.8573
Balanced Accuracy
0.8533
 
C = 16
F1
0.8503
Balanced Accuracy
0.8454
 
C = 32
F1
0.8463
Balanced Accuracy
0.8372
 


In [7]:
print('Naive Bayes')
print(' ')
print('F1')
print(f"{out[2]:.4f}")
print('Balanced Accuracy')
print(f"{out[1]:.4f}")

Naive Bayes
 
F1
0.8372
Balanced Accuracy
0.8471


In [8]:
print('Neural Network')
print(' ')
for out in cv3:
    print('Hidden Layers = ' + str(out[0]))
    print('F1')
    print(f"{out[3]:.4f}")
    print('Balanced Accuracy')
    print(f"{out[2]:.4f}")
    print(' ')

Neural Network
 
Hidden Layers = [300]
F1
0.8898
Balanced Accuracy
0.8825
 
Hidden Layers = [400]
F1
0.8863
Balanced Accuracy
0.8771
 
Hidden Layers = [500]
F1
0.8881
Balanced Accuracy
0.8807
 
Hidden Layers = [600]
F1
0.8932
Balanced Accuracy
0.8834
 
Hidden Layers = [700]
F1
0.8846
Balanced Accuracy
0.8753
 
