In [75]:
import json
import random
from pathlib import Path

import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
Path.ls = lambda x: list(x.iterdir())
# from urllib.parse import quote_plus

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
# Text Classification with TF-NB-SVM

In [143]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit

In [144]:
file_path = Path("../data/tagged/Uber_5K_Tags_Verified.csv")
assert file_path.exists()
raw_df = pd.read_csv(file_path)
raw_df.head()

Unnamed: 0,review,labels,scores,tags
0,I’ve used Uber a long time and have been able ...,"['Pickup', 'Advance Ride Booking', 'Price', 'C...","[0.8712594509124756, 0.6643384695053101, 0.533...",['Pickup']
1,I heard so many good stories about people usin...,"['Customer Support', 'Price', 'Advance Ride Bo...","[0.6645472645759583, 0.506709098815918, 0.3678...",['Customer Support']
2,If you need a cheap ride and you need to get s...,"['Advance Ride Booking', 'Cancellation Fee', '...","[0.7515338063240051, 0.6101875901222229, 0.565...","['Advance Ride Booking', 'Cancellation Fee']"
3,"Uber is Uber, it’s as good as lyft but pricier...","['Customer Support', 'Price', 'Cancellation Fe...","[0.9570878744125366, 0.8025601506233215, 0.491...","['Customer Support', 'Price']"
4,"When deciding wether to use Uber, Lyft or take...","['Pickup', 'Advance Ride Booking', 'Card Issue...","[0.9557592272758484, 0.6880117654800415, 0.563...","['Pickup', 'Advance Ride Booking']"


In [145]:
REVIEW, TAGS = "review", "tags"
df = raw_df[[REVIEW, TAGS]]
del raw_df

In [146]:
import ast
from typing import List

def get_first_tag(
    tag_input: str,
    valid_tags: List[str] = [
        "Pickup",
        "Customer Support",
        "Cancellation Fee",
        "Pool",
        "Price",
        "Card Issues",
    ],
):
    try:
        tag_input = tag_input.strip()
    except AttributeError as ae:
        return None
    if len(tag_input) < 1:
        return None
    try:
        tag = ast.literal_eval(tag_input)[0]
        if tag not in valid_tags:
            return None
        return tag
    except SyntaxError as e:
        return None


df[TAGS] = df[TAGS].apply(get_first_tag)

In [147]:
sum(df[TAGS].isna())

593

In [148]:
df.fillna("Unknown", inplace=True)

In [149]:
df.tags.unique()

array(['Pickup', 'Customer Support', 'Unknown', 'Cancellation Fee',
       'Pool', 'Price', 'Card Issues'], dtype=object)

In [165]:
def binarize_labels(df: pd.DataFrame, target_label: str):
    def match_label(sample_label) -> int:

        if sample_label == target_label:
            return 1
        return 0

    return df[TAGS].apply(match_label)

In [166]:
df['y'] = binarize_labels(df, target_label="Pickup")

In [167]:
df

Unnamed: 0,review,tags,y
0,I’ve used Uber a long time and have been able ...,Pickup,1
1,I heard so many good stories about people usin...,Customer Support,0
2,If you need a cheap ride and you need to get s...,Unknown,0
3,"Uber is Uber, it’s as good as lyft but pricier...",Customer Support,0
4,"When deciding wether to use Uber, Lyft or take...",Pickup,1
...,...,...,...
4995,Jose was awesome! I absolutely love my new sc...,Unknown,0
4996,"I use Uber a lot, but a few days ago my boyfri...",Pickup,1
4997,My wife and I were leaving LAS Thursday night ...,Cancellation Fee,0
4998,I was outside my location for 10 minutes and w...,Price,0


In [164]:
from nbsvm import NBSVMClassifier

In [128]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

In [130]:
for train_index, test_index in sss.split(df[REVIEW], df[TAG]):
    pass
#     print(f"TRAIN:{train_index}, TEST: {test_index}")
train, test = df.iloc[train_index], df.iloc[test_index]

In [118]:
"""
I hack together a separated tokenizer after removing punctuation isntead of spaCy tokenizer. 
This is because spaCy used linguistic rules, which in turn, depend on punctuation to make it work correctly. 
Alternativel, I could've tokenized using spaCy and dropped the punctuation tokens from the result
"""
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [131]:
n = train.shape[0]
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents="unicode",
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
)

In [134]:
%%time
trn_term_doc = vec.fit_transform(train[REVIEW])

Wall time: 1.91 s


In [135]:
%%time
test_term_doc = vec.transform(test[REVIEW])

Wall time: 416 ms


In [136]:
def pr(y_i, y):
    """
    Basic Naive Bayes Equation
    """
    p = x[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

In [137]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [138]:
preds = np.zeros((len(test), len(df[TAGS].unique())))

In [139]:
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])