In [1]:
# load in encoder and model
from joblib import load
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def load_data():
    ''' Load in the pretrained model, label encoders & the sample test case.

    Reads four joblib files from the current working directory:
    "label_encoder.joblib.dat", "encoder_classes.joblib.dat",
    "dialect_classifier.joblib.dat" and "test_case.joblib.dat".

    Returns the tuple (d, d_classes, dialect_classifier, test_case):
      d                  -- mapping of key -> label encoder, with each
                            encoder's classes restored from d_classes
      d_classes          -- mapping of the same keys -> array of class labels
      dialect_classifier -- the pre-trained model
      test_case          -- the sample response, "class_target" entry removed

    Raises KeyError if d_classes lacks a key present in d, or if the test
    case has no "class_target" entry.

    NOTE(review): joblib files are pickle-based, so loading them can execute
    arbitrary code -- only point this at files from a trusted source.
    '''
    d = load("label_encoder.joblib.dat")
    d_classes = load("encoder_classes.joblib.dat")
    dialect_classifier = load("dialect_classifier.joblib.dat")
    test_case = load("test_case.joblib.dat")

    # remove target class from test data
    del test_case["class_target"]

    # update the classes for each of our label encoders; scikit-learn's
    # LabelEncoder reads its vocabulary from `classes_` (assigning to
    # `_classes` would silently have no effect on transform/inverse_transform)
    for key, encoder in d.items():
        encoder.classes_ = d_classes[key]

    return d, d_classes, dialect_classifier, test_case

def encode_data(input_data, encoders=None):
    ''' Encode our input data with pre-trained label encoders.

    input_data -- pandas Series; each index label selects the encoder to use
                  and each value is the raw answer to encode
    encoders   -- optional mapping of index label -> encoder exposing
                  transform(); defaults to the notebook-level `d` returned
                  by load_data()

    Returns a new Series with the same index and name holding one encoded
    value per entry. input_data is not modified, so this can be re-run on
    the same Series and give the same result.

    Raises KeyError if an index label has no matching encoder.
    '''
    if encoders is None:
        # fall back to the notebook-level encoders loaded by load_data()
        encoders = d

    # transform() takes a sequence & returns an array, so wrap each answer in
    # a one-element list and unwrap the single encoded value again
    encoded_values = [
        encoders[question].transform([answer])[0]
        for question, answer in input_data.items()
    ]

    return pd.Series(encoded_values, index=input_data.index, name=input_data.name)

def predict_cities(test_case_encoded):
    ''' Map one encoded response to the names of its three top-scoring cities.

    test_case_encoded -- pandas Series of encoded answers; its index labels
                         are used as the model's feature names

    Relies on the notebook-level `dialect_classifier` (model) and
    `d["class_target"]` (encoder used to turn class indices back into names).

    Returns the three decoded class names in ascending order of score, i.e.
    the highest-scoring city comes last.
    '''
    # pack the single response into a one-row DMatrix with named columns
    feature_row = test_case_encoded.values.reshape((1, -1))
    response_matrix = xgb.DMatrix(feature_row)
    response_matrix.feature_names = test_case_encoded.index.tolist()

    # score every class with the pre-trained model (one row of scores)
    class_scores = dialect_classifier.predict(response_matrix)

    # argsort is ascending, so the last three columns hold the best classes
    ranked_classes = np.argsort(class_scores, axis=1)
    best_three = ranked_classes[0, -3:].tolist()

    return d["class_target"].inverse_transform(best_three)

In [2]:
#del test_case["class_target"]

In [3]:
"""# update the classes for each of our label encoders
for key,item in d.items():
    d[key]._classes=d_classes[key]
# encode our test data   
test_case_encoded=test_case 
for i,row in test_case.items():
    test_case_encoded[i]=d[i].transform([test_case[i]])
test_case_encoded=test_case_encoded.apply(lambda x:x[0])    
"""   


'# update the classes for each of our label encoders\nfor key,item in d.items():\n    d[key]._classes=d_classes[key]\n# encode our test data   \ntest_case_encoded=test_case \nfor i,row in test_case.items():\n    test_case_encoded[i]=d[i].transform([test_case[i]])\ntest_case_encoded=test_case_encoded.apply(lambda x:x[0])    \n'

In [4]:
"""#convert input data to DMatrix fromat
test_case_encoded_d=xgb.DMatrix(test_case_encoded.values.reshape((1,-1)))
test_case_encoded_d.feature_names=test_case_encoded.index.tolist()
# classify using our pre-trained model
predictions=dialect_classifier.predict(test_case_encoded_d)
"""


'#convert input data to DMatrix fromat\ntest_case_encoded_d=xgb.DMatrix(test_case_encoded.values.reshape((1,-1)))\ntest_case_encoded_d.feature_names=test_case_encoded.index.tolist()\n# classify using our pre-trained model\npredictions=dialect_classifier.predict(test_case_encoded_d)\n'

In [5]:
"""# return top 3 classes
top_3 = np.argsort(predictions, axis=1)[:,-3:]
d['class_target'].inverse_transform(top_3[0].tolist())
"""

"# return top 3 classes\ntop_3 = np.argsort(predictions, axis=1)[:,-3:]\nd['class_target'].inverse_transform(top_3[0].tolist())\n"

In [6]:
# bind load_data()'s results to notebook-level names: encode_data() and
# predict_cities() look up `d` and `dialect_classifier` as globals
d, d_classes, dialect_classifier, test_case=load_data()
# label-encode the sample response, then display its three predicted cities
test_case_encoded = encode_data(test_case)
predict_cities(test_case_encoded)
# expected output = 'waltham Massachusetts', 'boston Massachusetts','norwalk Connecticut'

array(['waltham Massachusetts', 'boston Massachusetts',
       'norwalk Connecticut'], dtype=object)

In [7]:
# display the array of class labels stored for each encoder key
d_classes

{'Q01': array(['yinz', 'you', 'you all', 'you guys', 'you lot', 'you ’uns',
        'yous', 'youse'], dtype=object),
 'Q02': array(['doodle bug', 'millipede', 'pill bug', 'potato bug', 'roll-up bug',
        'roly poly', 'sow bug', 'twiddle bug', 'wood louse'], dtype=object),
 'Q03': array(['freeway', 'highway', 'parkway', 'turnpike'], dtype=object),
 'Q04': array(['I have no word for this',
        'I use lightning bug and firefly interchangeably', 'firefly',
        'lightning bug', 'others', 'peenie wallie'], dtype=object),
 'Q05': array(['cougar', 'mountain cat', 'mountain lion', 'panther', 'puma'],
       dtype=object),
 'Q06': array(['gym shoes', 'shoes', 'sneakersn'], dtype=object),
 'Q07': array(['garage sale', 'rummage sale', 'tag sale', 'thrift sale',
        'yard sale'], dtype=object),
 'Q08': array(['Mary and merry are pronounced the same, but marry is different',
        'all three are pronounced differently',
        'all three are pronounced the same',
        'merry an