In [1]:
# Pick up package from parent folder
import os,sys,inspect
# Resolve the directory this notebook/script lives in.
# `__file__` only exists when the code runs as a plain script.  Inside a
# notebook we use the kernel's working directory instead: asking `inspect` for
# the current frame's file is unreliable there, because kernels may compile
# each cell under a temporary file name that lives outside the project.
try:
    currentdir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Keep the parent folder first on sys.path, but do not stack a duplicate entry
# every time this cell is re-run.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import sklearn
from sklearn import metrics
import time

import gensim.downloader as model_api

import ordinal
from ordinal import OrderedProbitRanker
from ordinal import logit

import AppReviews # generate review data from app store
import docembedding

In [9]:
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Toy problem: generate blobs, shuffle them, then standardise the features.
X, y = shuffle(*make_blobs(n_samples=300, random_state=42), random_state=7)
X = StandardScaler().fit_transform(X)

# Fit the ranker with L-BFGS-B and use_grad=False.
opr = OrderedProbitRanker(method='L-BFGS-B', use_grad=False)
opr.fit(X, y)

# One boolean row per class: ymasks[k, i] is True when y[i] == opr.classes_[k].
ymasks = np.array([y == label for label in opr.classes_])
# Parameter vector laid out as [cutpoints..., coefficients...].
betas = np.concatenate((opr.cuts_, opr.coef_))

# Evaluate both loss routines at the fitted parameters, then show the
# parameters themselves and the in-sample score.
loss_and_grad = opr._ordered_probit_loss_and_grad(betas, ymasks, X)
print(loss_and_grad)
log_like = opr._orderedProbitLogLike(betas, ymasks, X)
print(log_like)
print(betas)
print(opr.score(X, y))

(3.284942988976365e-06, array([ 4.11417497e-06, -3.22979281e-06, -7.48958864e-06, -2.69568096e-06]))
3.284942988976365e-06
[-18.27416671  15.51798264   3.08541305 -26.53379904]
1.0


In [4]:
# Manual re-computation of the ordered probit log-likelihood from
# betas / ymasks / X, for comparison with the values printed by the model.
n_cuts, n_samples = ymasks.shape[0] - 1, ymasks.shape[1]

# Linear predictor from the coefficient part of betas.
# TODO: xb can "explode" out of normal cdf bounds
#       eg. values above 8 and below -8 have cdf of 0 and 1
#       regardless of cutoff point
xb = X @ betas[n_cuts:]

# Sorting keeps the cutpoints ordered.
# TODO: This can be done by reparametrizing the cutpoints...
cuts = np.sort(betas[:n_cuts])

# Row k holds cdf(cuts[k] - xb), computed for all cutpoints in one broadcast.
cut_cdf = norm.cdf(cuts[:, np.newaxis] - xb[np.newaxis, :])
# Same layout as before: the last cutpoint's cdf row is repeated once.
cdf_areas = np.vstack([cut_cdf, cut_cdf[-1:]])

# Class probabilities are successive differences of [0, cdf rows..., 1]:
#   first  = cdf[cut_0] - 0
#   middle = cdf[i] - cdf[i-1]
#   last   = 1 - cdf[last_cut]
padded = np.vstack([np.zeros((1, n_samples)), cut_cdf, np.ones((1, n_samples))])
pdf_areas = np.diff(padded, axis=0)

# Select each sample's probability for its observed class, then sum the logs.
res = np.log((ymasks * pdf_areas).sum(axis=0)).sum()
res

-3.284942988976365e-06

In [5]:
from sklearn.datasets import load_boston, load_diabetes, load_linnerud
from sklearn.preprocessing import KBinsDiscretizer

N_CLASSES = 6

# NOTE(review): `load_boston` was removed in scikit-learn 1.2 — this cell needs
# an older release (or a different dataset) to run.
boston = load_boston()
X, y = boston['data'], boston['target']

# Bin the continuous target into N_CLASSES ordinal labels (strategy='kmeans').
kbd = KBinsDiscretizer(n_bins=N_CLASSES, encode='ordinal', strategy='kmeans')
y = kbd.fit_transform(y.reshape(-1, 1)).ravel().astype(np.int32)

# Fit the ranker with Nelder-Mead and use_grad=False.
opr = OrderedProbitRanker(method='nelder-mead', use_grad=False)
opr.fit(X, y)

# One boolean row per class: ymasks[k, i] is True when y[i] == opr.classes_[k].
ymasks = np.array([y == label for label in opr.classes_])
# Parameter vector laid out as [cutpoints..., coefficients...].
betas = np.concatenate((opr.cuts_, opr.coef_))

# Evaluate both loss routines at the fitted parameters, then show the
# parameters themselves and the in-sample score.
loss_and_grad = opr._ordered_probit_loss_and_grad(betas, ymasks, X)
print(loss_and_grad)
log_like = opr._orderedProbitLogLike(betas, ymasks, X)
print(log_like)
print(betas)
print(opr.score(X, y))

  options={"disp":True, 'maxiter':50000, "maxfun":150000})


Optimization terminated successfully.
         Current function value: 484.697248
         Iterations: 7204
         Function evaluations: 9189
(484.69724782927943, array([ 2.72691938e-01, -2.16137996e-01, -2.65049319e-01, -9.26084054e-02,
       -6.56931209e+01, -3.73561284e+03, -5.47352571e+03, -7.48491604e+03,
        1.11983610e+01, -3.64042603e+02, -4.51658021e+03, -7.20604206e+04,
       -3.44954919e+03, -7.62543900e+03, -2.88667046e+05, -1.11560617e+04,
       -2.13973401e+05, -1.47182366e+04]))
484.69724782927943
[-3.62323115e+00 -1.80464505e+00 -2.60133094e-02  8.97345799e-01
  1.78909737e+00 -4.07535387e-02  1.51953928e-02 -3.96980741e-02
  2.38329471e-01 -9.00211940e-02  6.05754052e-01 -9.84447896e-03
 -4.32566398e-01  5.54392463e-02 -3.62701496e-03 -2.97064989e-02
  2.44484279e-03 -1.58934117e-01]
0.6086956521739131


In [6]:
# Manual re-computation of the ordered probit log-likelihood from
# betas / ymasks / X, for comparison with the values printed by the model.
n_cuts, n_samples = ymasks.shape[0] - 1, ymasks.shape[1]

# Linear predictor from the coefficient part of betas.
# TODO: xb can "explode" out of normal cdf bounds
#       eg. values above 8 and below -8 have cdf of 0 and 1
#       regardless of cutoff point
xb = X @ betas[n_cuts:]

# Sorting keeps the cutpoints ordered.
# TODO: This can be done by reparametrizing the cutpoints...
cuts = np.sort(betas[:n_cuts])

# Row k holds cdf(cuts[k] - xb), computed for all cutpoints in one broadcast.
cut_cdf = norm.cdf(cuts[:, np.newaxis] - xb[np.newaxis, :])
# Same layout as before: the last cutpoint's cdf row is repeated once.
cdf_areas = np.vstack([cut_cdf, cut_cdf[-1:]])

# Class probabilities are successive differences of [0, cdf rows..., 1]:
#   first  = cdf[cut_0] - 0
#   middle = cdf[i] - cdf[i-1]
#   last   = 1 - cdf[last_cut]
padded = np.vstack([np.zeros((1, n_samples)), cut_cdf, np.ones((1, n_samples))])
pdf_areas = np.diff(padded, axis=0)

# Select each sample's probability for its observed class, then sum the logs.
res = np.log((ymasks * pdf_areas).sum(axis=0)).sum()
res

-484.69724782927943

In [7]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here and
# the cells below are only run by hand — confirm, or remove before sharing.
raise ValueError()

ValueError: 

In [None]:
# Load the table stored in cache.csv.
# NOTE(review): presumably the preprocessed feature frame saved from the
# embedding step — confirm where this file is written.
dft = pd.read_csv("cache.csv")
# Use the `columns=` keyword: the positional axis argument of DataFrame.drop
# was deprecated in pandas 1.3 and removed in pandas 2.0.
X = dft.drop(columns='rating')
y = dft.rating

opr = OrderedProbitRanker(method='L-BFGS-B', use_grad=False)
opr.fit(X, y)

# One boolean row per class: ymasks[k, i] is True when y[i] == opr.classes_[k].
ymasks = np.array([np.array(y == c_) for c_ in opr.classes_])
# Parameter vector laid out as [cutpoints..., coefficients...].
betas = np.concatenate([opr.cuts_, opr.coef_])

# Evaluate both loss routines at the fitted parameters, then the in-sample score.
print(opr._ordered_probit_loss_and_grad(betas, ymasks, X))
print(opr._orderedProbitLogLike(betas, ymasks, X))
print(opr.score(X, y))

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here and
# the cells below are only run by hand — confirm, or remove before sharing.
raise ValueError()

In [None]:
# Read app_reviews.csv and coerce the rating column to integers.
df = pd.read_csv("app_reviews.csv")
df['rating'] = df['rating'].astype(int)
df.head(5)

# Training preprocessing

In [None]:
# Pretrained word vectors, fetched through gensim's downloader API.
model = model_api.load("glove-wiki-gigaword-300")

# Dummy-encode the app name (first level dropped, NaN gets its own column).
dft = df.join(pd.get_dummies(df.app_name, drop_first=True, dummy_na=True))

# Add embedding components for each free-text column.
for col in ['title', 'review']:
    # Tokenise the column currently being processed (not a fixed one), so that
    # the title and the review each get their own embedding.
    tokens = docembedding.stringprocessing.tokenize(dft[col], lower=True, split=True)
    weights = docembedding.embedding.getWordWeights(tokens, "tf-idf")
    embeds = docembedding.embedding.sentenceEmbedding(tokens, model, weights)
    embeds = pd.DataFrame(embeds)
    # Prefix each component index with its source column, e.g. "title0".
    embeds.columns = [col + str(colnum) for colnum in embeds.columns]
    dft = dft.join(embeds)
# Drop the raw text/metadata columns.  `columns=` replaces the positional axis
# argument, which DataFrame.drop no longer accepts in pandas 2.0.
dft = dft.drop(columns=['version', 'vote_count', 'review', 'title', 'app_name'])
dft.head(3)

In [None]:
# Features are every column except the target.  `columns=` is required on
# pandas >= 2.0, where DataFrame.drop no longer takes a positional axis.
X = dft.drop(columns='rating')
y = dft.rating

In [None]:
def fitscore(model, X, y):
    """Fit ``model`` on ``(X, y)`` and report in-sample metrics.

    The model is trained and then asked to predict the same ``X`` it was
    fitted on, so every number printed here is a training-set figure.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X : feature matrix handed to both ``fit`` and ``predict``
    y : true labels

    Returns
    -------
    None
        Results are printed (elapsed seconds for fit + predict, accuracy,
        MSE, MAE) and a 5-bin histogram of the predictions is drawn.
    """
    start = time.time()
    model.fit(X, y)
    pred_val = model.predict(X)
    # The reported time covers both fitting and predicting.
    print("time: ", time.time() - start)
    # scikit-learn metrics are declared as (y_true, y_pred); pass them in that
    # order so the call stays correct if a non-symmetric metric is added.
    print("score: ", metrics.accuracy_score(y, pred_val))
    print("mse: ", metrics.mean_squared_error(y, pred_val))
    print("mae: ", metrics.mean_absolute_error(y, pred_val))
    pd.Series(pred_val).hist(bins=5)

In [None]:
# Histogram of the target ratings, using the same number of bins (5) as the
# prediction histograms drawn by fitscore.
y.hist(bins=5)

In [None]:
# Import the estimator explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.linear_model` submodule has been loaded as an attribute.
from sklearn.linear_model import LogisticRegression

# Plain logistic-regression baseline.  C is the inverse regularisation
# strength, so C=9999 leaves the fit almost unpenalised.
lr = LogisticRegression(C=9999)
fitscore(lr, X, y)

In [None]:
# NOTE(review): `mord` is not imported in any cell visible above — add
# `import mord` to the import cell, otherwise this raises NameError on a
# fresh kernel.
# mord.LogisticAT model, fitted and scored on the same data by fitscore.
lat = mord.LogisticAT(alpha=1, verbose=1, max_iter=10000000)
fitscore(lat, X, y)

In [None]:
# NOTE(review): `mord` is not imported in any cell visible above — add
# `import mord` to the import cell, otherwise this raises NameError on a
# fresh kernel.
# mord.LogisticIT model, fitted and scored on the same data by fitscore.
lit = mord.LogisticIT(alpha=1, verbose=1, max_iter=10000000)
fitscore(lit, X, y)

In [None]:
# NOTE(review): `mord` is not imported in any cell visible above — add
# `import mord` to the import cell, otherwise this raises NameError on a
# fresh kernel.
# mord.LogisticSE model, fitted and scored on the same data by fitscore.
lse = mord.LogisticSE(alpha=1, verbose=1, max_iter=10000000)
fitscore(lse, X, y)

In [None]:
# The project's OrderedProbitRanker (L-BFGS-B, use_grad=False), run through
# the same fit-and-score helper as the models above for comparison.
opr = OrderedProbitRanker(method='L-BFGS-B', use_grad=False)
fitscore(opr, X, y)

# TESTING

In [None]:
# Time a single fit of the ranker, then report the in-sample score and the
# fitted cutpoints.
opr = OrderedProbitRanker(method='L-BFGS-B', use_grad=False)

start = time.time()
opr.fit(X, y)
elapsed = time.time() - start

print("time: ", elapsed)
print("score: ", opr.score(X, y))
print("cuts: ", opr.cuts_)

In [None]:
n_cuts, n_samples = len(opr.cuts_), len(X)
betas = opr.coef_

# Linear predictor, and one boolean membership row per class.
xb = X @ betas
ymasks = np.array([np.asarray(y == label) for label in opr.classes_])

# cdf evaluated at every fitted cutpoint; the final row is repeated once.
cut_cdf = [norm.cdf(cut - xb) for cut in opr.cuts_]
cdf_areas = np.array(cut_cdf + cut_cdf[-1:])

# Per-class probability mass: row 0 is cdf[0], the last row is 1 - cdf[-1],
# and rows 1..n_cuts-1 are differences of neighbouring cdf rows.
pdf_areas = np.zeros_like(ymasks, dtype='float')
pdf_areas[0] = cdf_areas[0]
pdf_areas[-1] = 1 - cdf_areas[-1]
pdf_areas[1:n_cuts] = cdf_areas[1:n_cuts] - cdf_areas[:n_cuts - 1]

# Probability the model assigns to each sample's observed class.
res = (ymasks * pdf_areas).sum(axis=0)

In [None]:
# Wrap the intermediate arrays in pandas objects (one row per sample) so they
# can be inspected side by side.
out = pd.Series(res)
ymk, pdf, cdf = (pd.DataFrame(arr.T) for arr in (ymasks, pdf_areas, cdf_areas))
# Samples whose observed class received exactly zero probability.
bad = out.eq(0)