In [41]:
import json
import random
from pathlib import Path

import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
Path.ls = lambda x: list(x.iterdir())
# from urllib.parse import quote_plus

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Text Classification with TF-IDF + NB-LR (Naive Bayes–weighted Logistic Regression)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit

In [158]:
# Get Data
REVIEW, TAGS = "review", "tags"

def get_reviews(file_path: Path) -> pd.DataFrame:
    """Load the tagged-review CSV and keep only the review and tags columns.

    Args:
        file_path: Location of the CSV file. It must contain the columns
            named by ``REVIEW`` and ``TAGS``.

    Returns:
        An independent two-column DataFrame (``REVIEW``, ``TAGS``).

    Raises:
        AssertionError: If ``file_path`` does not exist.
        KeyError: If either expected column is missing from the CSV.
    """
    assert file_path.exists(), f"{file_path} does not exist"
    raw_df = pd.read_csv(file_path)
    # Copy the selection so that later column assignments on the result
    # (e.g. ``df[TAGS] = ...``) modify this frame itself and do not trigger
    # pandas' SettingWithCopyWarning.
    return raw_df[[REVIEW, TAGS]].copy()

# Relative path: resolved against the notebook's working directory.
file_path = Path("../data/tagged/Uber_5K_Tags_Verified.csv")
# get_reviews keeps only the REVIEW and TAGS columns of the CSV.
df = get_reviews(file_path)

In [159]:
import ast
from typing import List, Optional

label_cols: List[str] = [
    "Pickup",
    "Customer Support",
    "Cancellation Fee",
    "Pool",
    "Price",
    "Card Issues",
]

# Alphabetical order gives every consumer of label_cols the same stable ordering.
label_cols = sorted(label_cols)

def get_tags_list(tag_input: str) -> Optional[List[str]]:
    """Parse one raw tags cell into a list of known labels.

    Args:
        tag_input: Raw cell value, expected to be the string form of a Python
            list such as ``"['Pickup', 'Price']"``. Non-string values (e.g.
            NaN) are accepted and treated as missing.

    Returns:
        * The tags that appear in ``label_cols`` (possibly an empty list).
        * ``["Unknown"]`` when the cell is missing, blank, or not valid
          Python syntax.
        * ``None`` when the cell parses but is not a usable literal (e.g.
          unquoted names, or a non-iterable value such as a number). The
          offending input is printed; ``None`` lets callers count such rows
          with ``isna()``.
    """
    try:
        tag_input = tag_input.strip()
    except AttributeError:
        # Not a string at all, so there is nothing to parse.
        return ["Unknown"]
    if not tag_input:
        return ["Unknown"]
    try:
        parsed = ast.literal_eval(tag_input)
        if isinstance(parsed, str):
            # A lone quoted tag ("'Pickup'") would otherwise be iterated
            # character by character and silently lose the tag.
            parsed = [parsed]
        return [tag for tag in parsed if tag in label_cols]
    except SyntaxError:
        return ["Unknown"]
    except (ValueError, TypeError) as e:
        # ValueError: malformed literal; TypeError: parsed value is not iterable.
        print(f"{tag_input} caused an error:{e}")
        return None


# Parse each raw tag string into a list of known labels.
# NOTE: this overwrites df[TAGS] in place. Re-running the cell passes lists
# (which have no .strip()) to get_tags_list, so every row becomes ["Unknown"];
# reload df before re-running.
df[TAGS] = df[TAGS].apply(get_tags_list)

In [160]:
# Rows for which get_tags_list returned None are counted as missing by isna().
print(f"There are {sum(df[TAGS].isna())} rows with invalid tags. This usually happens due to human errors, e.g. typos")

There are 0 rows with invalid tags. This usually happens due to human errors, e.g. typos


# Set Up the DataFrame for Multi-Label Prediction

In [161]:
def binarize_labels(df: pd.DataFrame, target_label: str):
    """
    Build a 0/1 indicator Series for a single label.

    Each entry is 1 when ``target_label`` occurs in that row's ``TAGS`` value
    and 0 otherwise.
    """
    return df[TAGS].apply(
        lambda row_tags: 1 if target_label in row_tags else 0
    )

In [162]:
# Add one 0/1 indicator column per label, named after the label itself.
for col_name in label_cols:
    df[col_name] = binarize_labels(df, target_label=col_name)

In [163]:
# Preview the first rows, now including the per-label indicator columns.
df.head()

Unnamed: 0,review,tags,Cancellation Fee,Card Issues,Customer Support,Pickup,Pool,Price
0,I’ve used Uber a long time and have been able ...,[Pickup],0,0,0,1,0,0
1,I heard so many good stories about people usin...,[Customer Support],0,0,1,0,0,0
2,If you need a cheap ride and you need to get s...,[Cancellation Fee],1,0,0,0,0,0
3,"Uber is Uber, it’s as good as lyft but pricier...","[Customer Support, Price]",0,0,1,0,0,1
4,"When deciding wether to use Uber, Lyft or take...",[Pickup],0,0,0,1,0,0


In [164]:
# Number of positive rows for each label.
for col_name in label_cols:
    n_positive = df[col_name].sum()
    print(col_name, n_positive)

Cancellation Fee 1149
Card Issues 901
Customer Support 1565
Pickup 1785
Pool 464
Price 2428


## Create Train-Test Split

In [165]:
# A single shuffled split holding out 30% of the rows; the fixed random_state
# makes the split reproducible across runs.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=37)

In [166]:
# The splitter is configured with n_splits=1, so it yields exactly one
# (train, test) pair of positional indices; take it directly instead of
# looping and relying on the leaked loop variables.
# Stratification uses only the "Pool" indicator, so only that label's class
# balance is preserved between the two splits.
train_index, test_index = next(sss.split(df[REVIEW], df["Pool"]))
train, test = df.iloc[train_index], df.iloc[test_index]
len(train), len(test)

(3500, 1500)

# TF-IDF matrix for Reviews

In [167]:
"""
We write our own regex tokenizer instead of using the spaCy tokenizer.
Why? Because spaCy uses linguistic rules which, in turn, depend on punctuation to work correctly.
Alternatively, I could've tokenized using spaCy and dropped the punctuation tokens from the result.
"""
import re, string
# string.punctuation is escaped before it is placed inside the character
# class. Unescaped, its `\]` pair is read as an escaped `]`, which silently
# drops the backslash from the set of characters that get split off.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split ``s`` into whitespace-separated tokens, isolating punctuation.

    Every character matched by ``re_tok`` is first padded with spaces, so each
    punctuation mark becomes its own token.
    """
    return re_tok.sub(r' \1 ', s).split()

In [168]:
n = train.shape[0]  # number of training reviews
vec = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    tokenizer=tokenize,
    # The default token_pattern is ignored once a custom tokenizer is given;
    # None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" warning.
    token_pattern=None,
    min_df=3,  # drop terms seen in fewer than 3 documents
    max_df=0.9,  # drop terms seen in more than 90% of documents
    strip_accents="unicode",
    # Real booleans rather than 1: recent scikit-learn releases validate
    # parameter types and reject integers for these flags.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [169]:
%%time
# Fit the vectorizer on the training reviews only, then vectorize them.
trn_term_doc = vec.fit_transform(train[REVIEW])

Wall time: 2.59 s


In [170]:
%%time
# Transform only: the test reviews reuse what was fitted on the training split.
test_term_doc = vec.transform(test[REVIEW])

Wall time: 905 ms


In [171]:
# Train and Predict NB-LR 

In [172]:
def pr(y_i, y, features=None):
    """
    Basic Naive Bayes Equation

    Smoothed per-feature average over the rows whose label equals ``y_i``.

    Args:
        y_i: Label value selecting the rows to aggregate.
        y: Array of labels, one per row of the feature matrix.
        features: Feature matrix to aggregate. When omitted, the notebook-level
            global ``x`` is used, which keeps the original two-argument call
            working; passing it explicitly removes the hidden dependency on a
            variable that is defined in a later cell.

    Returns:
        ``(column sums of the selected rows + 1) / (number of selected rows + 1)``.
    """
    if features is None:
        # Backward-compatible fallback: ``x`` must exist before the call.
        features = x
    selected = y == y_i
    p = features[selected].sum(0)
    return (p + 1) / (selected.sum() + 1)

In [173]:
# Feature matrices read by pr() and get_mdl().
x = trn_term_doc
test_x = test_term_doc

def get_mdl(y):
    """Fit a logistic regression on ratio-scaled features for one label.

    ``y`` is a pandas Series of 0/1 targets aligned with the rows of ``x``.
    Returns ``(fitted_model, r)`` where ``r`` is the log of ``pr(1, y) / pr(0, y)``;
    the same ``r`` must be applied to any matrix passed to the model.
    """
    labels = y.values
    log_ratio = np.log(pr(1, labels) / pr(0, labels))
    weighted_x = x.multiply(log_ratio)
    clf = LogisticRegression(C=4, dual=False)
    return clf.fit(weighted_x, labels), log_ratio

In [174]:
# One row per test review and one column per label; filled in by the training loop.
preds = np.zeros((len(test), len(label_cols)))
preds.shape

(1500, 6)

In [175]:
# Train and Predict

In [176]:
%%time
# Fit one model per label and store its positive-class probability in column i.
for i, col in enumerate(label_cols):
    print('fit', col)
    m, r = get_mdl(train[col])
    # Scale the test features with the same ratio used when fitting.
    test_x_nb = test_x.multiply(r)
    preds[:, i] = m.predict_proba(test_x_nb)[:, 1]

fit Cancellation Fee
fit Card Issues
fit Customer Support
fit Pickup
fit Pool
fit Price
Wall time: 7.14 s


In [177]:
# NOTE: df_pred gets a fresh 0..n-1 index, whereas `test` keeps the row labels
# it had in `df`. Compare the two positionally (e.g. via arrays), not by index.
df_pred = pd.DataFrame(preds, columns = label_cols)

In [178]:
# Preview the predicted positive-class probabilities for the first test rows.
df_pred.head()

Unnamed: 0,Cancellation Fee,Card Issues,Customer Support,Pickup,Pool,Price
0,0.071093,0.115938,0.414364,0.209997,0.39363,0.235066
1,0.057139,0.138802,0.424029,0.190097,0.016096,0.323736
2,0.358042,0.067424,0.223449,0.258978,0.010402,0.327304
3,0.992528,0.083072,0.108161,0.169787,0.013446,0.585607
4,0.04733,0.046365,0.334215,0.247478,0.007205,0.392426


# Evaluation

In [179]:
def get_binary_pred(x: float, t: float = 0.5) -> int:
    """Threshold a score: 1 when ``x`` is strictly greater than ``t``, else 0."""
    return 1 if x > t else 0


from sklearn.metrics import precision_recall_fscore_support as prfs

# Per-label precision / recall / F1 on the test split, using the default
# threshold of get_binary_pred to turn probabilities into 0/1 predictions.
for col in label_cols:
    y_true = test[col].values
    y_pred = df_pred[col].apply(get_binary_pred).values
    p, r, f, s = prfs(y_true=y_true, y_pred=y_pred, average='binary')
    print(f"*{col}*:\tPrecision: {p:.2f},\tRecall: {r:.2f},\tF1: {f:.2f}")

*Cancellation Fee*:	Precision: 0.85,	Recall: 0.60,	F1: 0.70
*Card Issues*:	Precision: 0.89,	Recall: 0.46,	F1: 0.61
*Customer Support*:	Precision: 0.79,	Recall: 0.46,	F1: 0.58
*Pickup*:	Precision: 0.84,	Recall: 0.60,	F1: 0.70
*Pool*:	Precision: 0.89,	Recall: 0.65,	F1: 0.76
*Price*:	Precision: 0.82,	Recall: 0.63,	F1: 0.71
