# Complaints Time to respond model

Author:
* Luis Henrique M O Imagiire

Status:
* DONE

Goal:
* We will try to predict the time to respond model using word embbeding features from our fasttext model trained on complaints description.
We hope to have a good model to inform companies if they are lagging behind our excelling in comparison to their competitors.

In [1]:
%cd ..

/home/luis/ds4a/notebooks


In [2]:
from src.loading import load_dataset
from src.cleaning import build_df_from_RA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
import string
import re
from gensim.models.fasttext import FastText as FT_gensim

def normalize_text(text):
    """
    Strip accents and lower text string
    :param text: (str) text to be cleaned
    :return: (str) cleaned text
    """
    text = strip_accents(text)
    text = text.lower().strip()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def remove_numbers(text):
    return re.sub(r'\b[0-9]+\b', '', text)

def tokenize(data, sep=None):
    if sep is not None:
        return data.split(sep)
    return data.split()

# Load data
df = build_df_from_RA(load_dataset("dataset"))
# df = df.loc[(df.days_to_resolution < 50)]  # De-biasing response days variable
df["normalized_text"] = df.apply(lambda row: f"{remove_numbers(normalize_text(row['title']))} {remove_numbers(normalize_text(row['description']))}", axis=1)
print(f"We have a total of {df.shape[0]} reviews!")

# Load model
model = FT_gensim.load("fasttext.model")


We have a total of 51655 reviews!


In [5]:
df.days_to_resolution.value_counts()

0.0     1043
1.0      930
2.0      837
3.0      714
6.0      704
5.0      686
4.0      660
7.0      584
8.0      492
9.0      467
10.0     402
12.0     374
13.0     370
11.0     362
14.0     316
15.0     277
16.0     251
17.0     227
18.0     226
19.0     221
20.0     206
21.0     167
22.0     145
27.0     130
25.0     128
23.0     126
28.0     112
24.0     107
26.0      96
29.0      87
34.0      84
33.0      75
31.0      75
36.0      71
32.0      67
30.0      65
35.0      64
41.0      63
39.0      56
37.0      55
42.0      53
40.0      48
38.0      45
43.0      40
48.0      40
46.0      35
44.0      34
47.0      33
45.0      32
49.0      31
Name: days_to_resolution, dtype: int64

In [3]:
def get_review_vec(review_seq):
    try:
        mean_vec = np.array([model.wv[w] for w in review_seq]).mean(axis=0)
        if not isinstance(mean_vec, type(np.nan)):
            return mean_vec
        else:
            return None
    except:
        return None 

features = []
idxes = []
for idx, vec in df.iterrows():
    if vec["normalized_text"]:
        vec = get_review_vec(tokenize(vec["normalized_text"]))
        if vec is not None:
            idxes.append(idx)
            features.append(vec)


  This is separate from the ipykernel package so we can avoid doing imports until
  ret = ret.dtype.type(ret / rcount)


In [5]:
X = pd.DataFrame(np.array(features),columns=[f"feat_{i}" for i in range(100)], index=idxes)
X.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_90,feat_91,feat_92,feat_93,feat_94,feat_95,feat_96,feat_97,feat_98,feat_99
0,0.910237,0.948026,-0.580685,-1.087124,0.408073,0.035028,-0.255468,0.727001,0.51116,0.465338,...,0.22511,-0.556976,-0.199568,-0.029514,-0.384443,0.634457,-1.42605,-1.708523,0.028207,0.248345
1,-0.006765,0.400626,-0.188688,-2.611415,0.161582,1.001012,-1.393652,0.024986,-0.286601,0.050815,...,0.643617,-0.681656,-2.090754,0.766479,-0.360157,-0.899276,-0.045452,-1.029047,-0.453788,0.370695
2,-0.295514,-0.242192,-1.230961,-2.276299,-0.154886,0.099664,-1.411958,-0.046187,-0.566358,-0.810014,...,0.156349,-0.403661,-1.800049,0.152794,0.364936,-0.128633,-0.46929,-1.516815,0.014385,0.350531
3,0.538521,-0.414169,-0.821009,-2.062811,-1.239713,0.311726,-0.904301,0.296257,0.148219,-1.170139,...,0.397926,-1.703135,-2.093588,0.46781,0.270992,-1.369555,-0.121953,-1.803416,0.848096,1.088491
4,0.502684,0.10189,-0.161741,-2.076119,-0.018616,0.181313,-1.878871,-0.759554,-0.606767,-1.163182,...,0.418289,-1.644738,-2.11283,1.213724,0.711859,-0.254544,-1.367691,-1.562384,-0.460997,0.439617


In [6]:
X["days_to_first_contact"] = df.days_to_first_contact[X.index]
X.days_to_first_contact = X.days_to_first_contact.apply(lambda x: int(x) if not isinstance(x, type(np.nan)) else -999)
X.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_91,feat_92,feat_93,feat_94,feat_95,feat_96,feat_97,feat_98,feat_99,days_to_first_contact
0,0.910237,0.948026,-0.580685,-1.087124,0.408073,0.035028,-0.255468,0.727001,0.51116,0.465338,...,-0.556976,-0.199568,-0.029514,-0.384443,0.634457,-1.42605,-1.708523,0.028207,0.248345,-999
1,-0.006765,0.400626,-0.188688,-2.611415,0.161582,1.001012,-1.393652,0.024986,-0.286601,0.050815,...,-0.681656,-2.090754,0.766479,-0.360157,-0.899276,-0.045452,-1.029047,-0.453788,0.370695,-999
2,-0.295514,-0.242192,-1.230961,-2.276299,-0.154886,0.099664,-1.411958,-0.046187,-0.566358,-0.810014,...,-0.403661,-1.800049,0.152794,0.364936,-0.128633,-0.46929,-1.516815,0.014385,0.350531,-999
3,0.538521,-0.414169,-0.821009,-2.062811,-1.239713,0.311726,-0.904301,0.296257,0.148219,-1.170139,...,-1.703135,-2.093588,0.46781,0.270992,-1.369555,-0.121953,-1.803416,0.848096,1.088491,-999
4,0.502684,0.10189,-0.161741,-2.076119,-0.018616,0.181313,-1.878871,-0.759554,-0.606767,-1.163182,...,-1.644738,-2.11283,1.213724,0.711859,-0.254544,-1.367691,-1.562384,-0.460997,0.439617,-999


In [28]:
# If want to include tag variables
X = X.merge(df[[col for col in df.columns if any([i in col for i in ["issues", "other"]])]], left_index=True, right_index=True)
X["days_to_first_contact"] = df.days_to_first_contact[X.index]
for col in [i for i in X.columns if "feat_" not in i]:
    X[col] = X[col].apply(lambda x: int(x) if not isinstance(x, type(np.nan)) else -999)
X.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,others,product_issues_Quality,product_issues_Damaged,product_issues_Electrical problems,product_issues_Missing pieces,business_issues_Payment,business_issues_Maintenance,business_issues_Customer Services,business_issues_Delivery,business_issues_Online Services
1,-0.006765,0.400626,-0.188688,-2.611415,0.161582,1.001012,-1.393652,0.024986,-0.286601,0.050815,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,-0.295514,-0.242192,-1.230961,-2.276299,-0.154886,0.099664,-1.411958,-0.046187,-0.566358,-0.810014,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
6,0.88628,-0.675235,0.047963,-0.962227,0.60174,0.085034,-1.32945,-1.103234,-1.15609,-0.237346,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
9,0.684459,1.257818,0.084625,-1.728545,-0.643135,-0.032798,-0.895743,1.159709,0.172128,1.267814,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
10,-0.080067,0.675423,-0.609974,-1.799212,-0.504865,-0.308095,-0.681252,0.600193,-0.556919,-0.784279,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [7]:
Y = df.days_to_resolution[idxes]
Y.head()

0    451.0
1      5.0
2      3.0
3      NaN
4     61.0
Name: days_to_resolution, dtype: float64

In [14]:
Y = Y.dropna()
X = X.loc[Y.index]

In [19]:
from sklearn.model_selection import train_test_split

X_test = X.iloc[-100::]
Y_test = Y.iloc[-100::]

X_t = X.iloc[0:-100]
Y_t = Y.iloc[0:-100]

# Sanity check
assert X_test.shape[0] + X_t.shape[0] == X.shape[0]
assert Y_test.shape[0] + Y_t.shape[0] == Y.shape[0]

seed = 2020
X_train, X_validation, y_train, y_validation = train_test_split(X_t, Y_t, train_size=0.80, random_state=seed)

In [20]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score


model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    iterations=50,
#     logging_level='Silent'
)

model.fit(
    X_train, y_train,
    cat_features=[100],
    eval_set=(X_validation, y_validation),
    logging_level='Verbose'
);



CatBoostError: catboost/private/libs/target/target_converter.cpp:228: Unknown class label: "649"

In [43]:

from sklearn.metrics import accuracy_score

predictions = model.predict(X_test)
# predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(Y_test[:10])
accuracy_score(Y_test, predictions)


[[7.]
 [2.]
 [1.]
 [0.]
 [0.]
 [1.]
 [4.]
 [2.]
 [4.]
 [0.]]
50908    7.0
50925    7.0
50929    5.0
50932    6.0
50936    1.0
50937    5.0
50939    2.0
50948    6.0
50950    7.0
50964    4.0
Name: days_to_resolution, dtype: float64


0.05

In [37]:
1/50

0.02

## Remarks

Model is pretty bad. Maybe there is no much signal in the description.

Lets try to reduce the target space into meaningful categories and see if our model improves.

In [21]:
def categorize_resolutions(day):
    if day == 0:
        return "jit"
    if day < 7:
        return "fast"
    if day < 15:
        return "slow"
    else:
        return "super-slow"

df.days_to_resolution.apply(categorize_resolutions)

0        super-slow
1              fast
2              fast
3        super-slow
4        super-slow
            ...    
51650    super-slow
51651    super-slow
51652    super-slow
51653          fast
51654    super-slow
Name: days_to_resolution, Length: 51655, dtype: object

In [70]:
X = pd.DataFrame(np.array(features),columns=[f"feat_{i}" for i in range(100)], index=idxes)
X["days_to_first_contact"] = df.days_to_first_contact[X.index]
X.days_to_first_contact = X.days_to_first_contact.apply(lambda x: int(x) if not isinstance(x, type(np.nan)) else -999)
X.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_91,feat_92,feat_93,feat_94,feat_95,feat_96,feat_97,feat_98,feat_99,days_to_first_contact
1,-0.006765,0.400626,-0.188688,-2.611415,0.161582,1.001012,-1.393652,0.024986,-0.286601,0.050815,...,-0.681656,-2.090754,0.766479,-0.360157,-0.899276,-0.045452,-1.029047,-0.453788,0.370695,-999
2,-0.295514,-0.242192,-1.230961,-2.276299,-0.154886,0.099664,-1.411958,-0.046187,-0.566358,-0.810014,...,-0.403661,-1.800049,0.152794,0.364936,-0.128633,-0.46929,-1.516815,0.014385,0.350531,-999
6,0.88628,-0.675235,0.047963,-0.962227,0.60174,0.085034,-1.32945,-1.103234,-1.15609,-0.237346,...,-0.571886,-0.603783,0.522473,0.392724,1.229973,1.597743,-0.460302,0.256988,-1.32049,-999
9,0.684459,1.257818,0.084625,-1.728545,-0.643135,-0.032798,-0.895743,1.159709,0.172128,1.267814,...,-1.434719,-1.129213,0.458269,-1.102501,-1.044434,0.668733,-1.806756,0.185641,0.708035,-999
10,-0.080067,0.675423,-0.609974,-1.799212,-0.504865,-0.308095,-0.681252,0.600193,-0.556919,-0.784279,...,-0.576822,-2.366271,0.642208,-0.215703,-1.267039,-0.225375,-1.820242,0.530137,0.286935,-999


In [22]:
Y = Y.apply(categorize_resolutions)
# Y = df.days_to_resolution.apply(categorize_resolutions)[idxes]
Y.head()

0    super-slow
1          fast
2          fast
4    super-slow
6           jit
Name: days_to_resolution, dtype: object

In [23]:
from sklearn.model_selection import train_test_split

X_test = X.iloc[-100::]
Y_test = Y.iloc[-100::]

X_t = X.iloc[0:-100]
Y_t = Y.iloc[0:-100]

# Sanity check
assert X_test.shape[0] + X_t.shape[0] == X.shape[0]
assert Y_test.shape[0] + Y_t.shape[0] == Y.shape[0]

seed = 2020
X_train, X_validation, y_train, y_validation = train_test_split(X_t, Y_t, train_size=0.80, random_state=seed)

In [25]:


model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    iterations=500,
#     logging_level='Silent'
)

model.fit(
    X_train, y_train,
#     cat_features=[100],
    eval_set=(X_validation, y_validation),
    logging_level='Verbose'
);

Learning rate set to 0.148145
0:	learn: 1.3413370	test: 1.3412171	best: 1.3412171 (0)	total: 27.5ms	remaining: 13.7s
1:	learn: 1.3088352	test: 1.3094747	best: 1.3094747 (1)	total: 49.1ms	remaining: 12.2s
2:	learn: 1.2837673	test: 1.2844749	best: 1.2844749 (2)	total: 75ms	remaining: 12.4s
3:	learn: 1.2643397	test: 1.2657662	best: 1.2657662 (3)	total: 101ms	remaining: 12.5s
4:	learn: 1.2487758	test: 1.2508212	best: 1.2508212 (4)	total: 122ms	remaining: 12.1s
5:	learn: 1.2366540	test: 1.2392385	best: 1.2392385 (5)	total: 142ms	remaining: 11.7s
6:	learn: 1.2263489	test: 1.2300856	best: 1.2300856 (6)	total: 162ms	remaining: 11.4s
7:	learn: 1.2177338	test: 1.2229710	best: 1.2229710 (7)	total: 182ms	remaining: 11.2s
8:	learn: 1.2112032	test: 1.2166774	best: 1.2166774 (8)	total: 207ms	remaining: 11.3s
9:	learn: 1.2054419	test: 1.2117319	best: 1.2117319 (9)	total: 238ms	remaining: 11.6s
10:	learn: 1.2009619	test: 1.2079468	best: 1.2079468 (10)	total: 261ms	remaining: 11.6s
11:	learn: 1.1964440	

In [26]:

from sklearn.metrics import accuracy_score

predictions = model.predict(X_test)
# predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(Y_test[:10])
accuracy_score(Y_test, predictions)


[['super-slow']
 ['super-slow']
 ['fast']
 ['fast']
 ['super-slow']
 ['super-slow']
 ['super-slow']
 ['super-slow']
 ['super-slow']
 ['super-slow']]
50925    slow
50929    fast
50932    fast
50936    fast
50937    fast
50939    fast
50948    fast
50950    slow
50964    fast
50968    slow
Name: days_to_resolution, dtype: object


0.45

In [27]:
print(f"Random Baseline: {1/4}")

Random Baseline: 0.25


In [79]:
df["products_Home Appliances"].unique()

array([nan])