In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from faker import Faker
fake = Faker()

def gaussian_distribute(sample, mean, var, clip_min, clip_max, around):
    x = np.random.normal(mean, var, sample)
    x = np.clip(x, clip_min, clip_max)
    x = np.around(x, decimals=around)
    return x

def rand_distribute(sample, scale, around):
    x = np.random.rand(sample)
    x = x*scale
    x = np.around(x, decimals=around)
    return x
def encode_label(x, le=None, enc=None):
    if le is None:
        le = preprocessing.LabelEncoder()
        X_ = le.fit_transform(x)
    else:
        X_ = le.transform(x)
        
    X_ = X_.reshape(-1,1)
    
    if enc is None:
        enc = OneHotEncoder()
        X_ = enc.fit_transform(X_)
    else:
        X_ = enc.transform(X_)   
    
    return X_.toarray(), le, enc

def convert_raw_data(data):
    GPA = np.array([data['GAP']])
    semester = np.array([data['semester']])
    ctxh = np.array([data['ctxh']])
    
    GPA = GPA.reshape(-1,1)
    semester = semester.reshape(-1,1)
    ctxh = ctxh.reshape(-1,1)

    gender, _, _ = encode_label([data['gender']], le_g, enc_g)
    is_ok, _, _ = encode_label([data['is_ok']], le_i, enc_i)
    mutilchoie_5, _, _ = encode_label([data['mutilchoie_5']], le_5, enc_5)
    mutilchoie_3, _, _ = encode_label([data['mutilchoie_3']], le_3, enc_3)
    mutilchoie_2_1, _, _ = encode_label([data['mutilchoie_2_1']], le_21, enc_21)
    mutilchoie_2_2, _, _ = encode_label([data['mutilchoie_2_2']], le_22, enc_22)
    mutilchoie_2_3, _, _ = encode_label([data['mutilchoie_2_3']], le_23, enc_23)

    X_data = np.hstack((GPA,gender))
    X_data = np.hstack((X_data,semester))
    X_data = np.hstack((X_data,gender))
    X_data = np.hstack((X_data,is_ok))
    X_data = np.hstack((X_data,mutilchoie_5))
    X_data = np.hstack((X_data,mutilchoie_3))
    X_data = np.hstack((X_data,mutilchoie_2_1))
    X_data = np.hstack((X_data,mutilchoie_2_2))
    X_data = np.hstack((X_data,mutilchoie_2_3))
    return X_data

import pickle
# save the classifier
with open('./model/model_svm.pkl', 'rb') as fid:
    clf = pickle.load(fid)

with open('./pre_data/label_company.pkl', 'rb') as fid:
    name_company = pickle.load(fid)
    
with open('./pre_data/le_g.pkl', 'rb') as fid:
    le_g = pickle.load(fid)
    
with open('./pre_data/enc_g.pkl', 'rb') as fid:
    enc_g = pickle.load(fid)
    
with open('./pre_data/le_i.pkl', 'rb') as fid:
    le_i = pickle.load(fid)
    
with open('./pre_data/enc_i.pkl', 'rb') as fid:
    enc_i = pickle.load(fid)
    
with open('./pre_data/le_5.pkl', 'rb') as fid:
    le_5 = pickle.load(fid)
    
with open('./pre_data/enc_5.pkl', 'rb') as fid:
    enc_5 = pickle.load(fid)
    
with open('./pre_data/le_3.pkl', 'rb') as fid:
    le_3 = pickle.load(fid)
    
with open('./pre_data/enc_3.pkl', 'rb') as fid:
    enc_3 = pickle.load(fid)
    
with open('./pre_data/le_21.pkl', 'rb') as fid:
    le_21 = pickle.load(fid)
    
with open('./pre_data/enc_21.pkl', 'rb') as fid:
    enc_21 = pickle.load(fid)
    
with open('./pre_data/le_22.pkl', 'rb') as fid:
    le_22 = pickle.load(fid)
    
with open('./pre_data/enc_22.pkl', 'rb') as fid:
    enc_22 = pickle.load(fid)
    
with open('./pre_data/le_23.pkl', 'rb') as fid:
    le_23 = pickle.load(fid)
    
with open('./pre_data/enc_23.pkl', 'rb') as fid:
    enc_23 = pickle.load(fid)

data_raw = {'GAP': 6.5, 'is_ok': 'no','gender': 'male', 'semester': 3, 'ctxh': 7, 'mutilchoie_5': 'very_good',
            'mutilchoie_3': 2, 'mutilchoie_2_1' : 'yes', 'mutilchoie_2_2': 'yes',
            'mutilchoie_2_3': 'no'}
data_test = convert_raw_data(data_raw)
clf.predict_proba(data_test)
index_com = data_test[0].argsort()[-5:][::-1]
name_company[index_com]

array([u'Mark Blake', u'Kendra Morris', u'Daniel Trujillo',
       u'Aaron Thompson', u'Mr. Nicholas Rodriguez DDS'], 
      dtype='<U26')

In [11]:
data_raw = {'GAP': 0.5, 'is_ok': 'yes','gender': 'female', 'semester': 3, 'ctxh': 25, 'mutilchoie_5': 'very_good',
            'mutilchoie_3': 2, 'mutilchoie_2_1' : 'yes', 'mutilchoie_2_2': 'no',
            'mutilchoie_2_3': 'yes'}
data_test = convert_raw_data(data_raw)
clf.predict_proba(data_test)
index_com = data_test[0].argsort()[-5:][::-1]
name_company[index_com]

array([u'Kendra Morris', u'Dennis Jenkins', u'Tracy Golden Jr.',
       u'Elizabeth Smith', u'Brian Miller'], 
      dtype='<U26')