In [1]:
import numpy as np
import pandas as pd
import re
import tqdm

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from run_lr.run_lr import run_lr_reg
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold

# Raise pandas' display limits so wide frames and long review texts are not truncated.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

# Print numpy floats with three digits after the decimal point.
np.set_printoptions(precision=3)

In [2]:
# Load the pickled review DataFrame and report its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle('../data/input/300K_yelp_text_df.pickle')
print(df.shape)

(300000, 9)


In [3]:
# Number of leading rows kept for this experiment.
N_FIRST = int(1e4)

# Keep only the first N_FIRST rows (positional slice) of the loaded frame.
df = df.iloc[:N_FIRST]

In [4]:
def clean_garbage(text: str) -> str:
    """Lower-case ``text`` and collapse every run of ASCII punctuation into one space.

    Args:
        text: Raw input string.

    Returns:
        The lower-cased string in which each maximal run of punctuation
        characters has been replaced by a single ``' '``. Existing whitespace
        is left untouched, so consecutive spaces may remain.
    """
    lowered = text.lower()
    punctuation_run = r'[!\"#$%&\'()*\+,-./:;<=>?@\[\\\]^_`{|}~]+'
    return re.sub(punctuation_run, ' ', lowered)


def stemm_tokenizer(text: str, stemmer, tknzr) -> str:
    """Clean, tokenize and stem ``text``, returning the stems as one string.

    Args:
        text: Raw input string.
        stemmer: Object exposing ``stem(word)``; applied to every kept token.
        tknzr: Object exposing ``tokenize(text)``; splits the cleaned text.

    Returns:
        The stems of all purely alphabetic tokens, joined by single spaces.
        Tokens containing digits or other non-letter characters are dropped.
    """
    tokens = tknzr.tokenize(clean_garbage(text))
    stems = (stemmer.stem(token) for token in tokens if token.isalpha())
    return ' '.join(stems)

In [5]:
# Build the stemmer and tokenizer once and reuse them for every review.
stemmer = SnowballStemmer('english')
tokenizer = TweetTokenizer()

# Register the progress_* methods on pandas objects. `tqdm.tqdm.pandas()` is the
# supported entry point; the older `tqdm.tqdm_pandas(tqdm.tqdm)` helper is deprecated.
tqdm.tqdm.pandas()

# Stem every review text while showing a progress bar; `.values` assigns by position.
df['stemmed_text'] = df['text'].progress_map(lambda z: stemm_tokenizer(z, stemmer, tokenizer)).values


  from pandas import Panel
100%|████████████████████████████████████| 10000/10000 [00:11<00:00, 881.28it/s]


In [6]:
# Settings for the single train/validation split performed below.
kfold_params = dict(
    split_rand_state=42,  # seed handed to train_test_split
    split_ratio=0.75,     # share of rows kept for training
)

In [7]:
# Hold out the complement of `split_ratio` as the validation set, with a fixed seed.
train_df, val_df = train_test_split(
    df,
    test_size=1 - kfold_params['split_ratio'],
    random_state=kfold_params['split_rand_state'],
)

In [42]:
# TF-IDF over unigrams and bigrams of the stemmed text.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))

# The vectorizer is fitted on the training texts only (left element is evaluated first);
# the validation texts are then transformed with that already-fitted vectorizer.
X_train, X_val = (
    tfidf_vect.fit_transform(train_df['stemmed_text'].values),
    tfidf_vect.transform(val_df['stemmed_text'].values),
)

### Model

In [50]:
# Logging settings handed to run_lr_reg: output directory and a free-text run description.
log_params = dict(
    result_path='../data/result/lr/',
    description='simple lr, no tunning, just to check',
)

# Model hyper-parameters; left empty for this first run.
lr_params = dict()

# Name of the DataFrame column used as the regression target.
TARGET = 'useful'

In [51]:
# 5-fold splitter handed to run_lr_reg through its `kfold` argument.
kfold = KFold(n_splits=5)

In [52]:
# Run the project's regression helper on the TF-IDF features.
# `lr_params` is passed by name (instead of a literal `{}`) so that edits to the
# configuration dict defined above actually reach the model.
# NOTE(review): the structure of `res` is defined by run_lr_reg; the following cell
# reads `res[1]` as the validation predictions -- confirm against run_lr.run_lr.
res = run_lr_reg(
    train=X_train,
    val=X_val,
    train_labels=train_df[TARGET],
    val_labels=val_df[TARGET],
    lr_params=lr_params,
    log_params=log_params,
    kfold=kfold,
    metric=mean_absolute_error,
    extra_params={'objective': 'reg'},
)

../data/result/lr/10-30-17


In [53]:
# Average res[1] over its first axis and clip the result to [0, max training target].
# NOTE(review): presumably res[1] holds one row of validation predictions per fold -- confirm against run_lr_reg.
preds_val = np.clip(res[1].mean(0), 0, train_df[TARGET].max())

In [54]:
# Validation MAE of the model. Arguments follow scikit-learn's documented
# (y_true, y_pred) order; the value is the same either way because MAE is symmetric.
print('MAE model --> {:.3}'.format(mean_absolute_error(val_df[TARGET], preds_val)))

MAE model --> 1.19


In [57]:
# Constant-prediction baseline: predict the mean target for every validation row.
# NOTE(review): the constant is computed from the validation labels themselves, which
# favours the baseline; consider deriving it from train_df[TARGET] instead.
naive_preds = np.full(val_df.shape[0], val_df[TARGET].mean())
# Arguments follow scikit-learn's documented (y_true, y_pred) order.
mae_naiv = mean_absolute_error(val_df[TARGET], naive_preds)
print('MAE naiv --> {:.3}'.format(mae_naiv))

MAE naiv --> 1.14
