In [1]:
import numpy as np
from tqdm import tqdm

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

Datasets info: https://huggingface.co/datasets/app_reviews

In [3]:
from datasets import load_dataset

dataset = load_dataset("app_reviews")

Using custom data configuration default
Reusing dataset app_reviews (/home/rodri/.cache/huggingface/datasets/app_reviews/default/0.0.0/af305ac963fd8dff5976dd341e97edf3a2933c3509a58885caace361c5cd3fe3)


In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')


def lemmatizer(text):
    doc = nlp(text)
    return " ".join([t.lemma_ for t in doc])

In [5]:
texts = dataset['train']['review']
targets = dataset['train']['star']

In [13]:
for i in range(5):
    print(texts[i])
    print(f"{targets[i]} stars")
    print("=======================")

Great app! The new version now works on my Bravia Android TV which is great as it's right by my rooftop aerial cable. The scan feature would be useful...any ETA on when this will be available? Also the option to import a list of bookmarks e.g. from a simple properties file would be useful.
4 stars
Great It's not fully optimised and has some issues with crashing but still a nice app  especially considering the price and it's open source.
4 stars
Works on a Nexus 6p I'm still messing around with my hackrf but it works with my Nexus 6p  Trond usb-c to usb host adapter. Thanks!
5 stars
The bandwidth seemed to be limited to maximum 2 MHz or so. I tried to increase the bandwidth but not possible. I purchased this is because one of the pictures in the advertisement showed the 2.4GHz band with around 10MHz or more bandwidth. Is it not possible to increase the bandwidth? If not  it is just the same performance as other free APPs.
3 stars
Works well with my Hackrf Hopefully new updates will arri

In [14]:
n_samples_train = 10_000
n_samples_test = 80_000

In [15]:
from sklearn.model_selection import train_test_split
texts_train, texts_test, y_train, y_test = train_test_split(texts, targets, train_size=n_samples_train, test_size=n_samples_test)


In [16]:
texts_train = list(map(lemmatizer, texts_train))
texts_test = list(map(lemmatizer, texts_test))

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 2), min_df=5,
                    stop_words="english")
X_train = cv.fit_transform(texts_train)

In [18]:
X_test = cv.transform(texts_test)

In [19]:
X_train.shape

(10000, 2083)

# Linear Regressor

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [48]:
lr = LinearRegression().fit(X_train, y_train)

In [49]:
print("Training score: ", r2_score(y_train, lr.predict(X_train)))
print("Test score: ", r2_score(y_test, lr.predict(X_test)))

Training score:  0.5120128087336474
Test score:  0.01747353316725042


In [62]:
n_important_words = 20
print("MOST IMPORTANT {} words".format(n_important_words))
sorted(list(list(zip(cv.get_feature_names(), lr.coef_))), key=lambda x: abs(x[1]), reverse=True)[:n_important_words]

MOST IMPORTANT 20 words


[('qr code', 5.199459722343599),
 ('qr', -4.2339212657850025),
 ('good open', 2.661291651021169),
 ('source app', 2.582404889826378),
 ('uninstalling', -2.4693181095005112),
 ('6p', -2.3904337997190295),
 ('worthless', -2.3143730193324696),
 ('ton', -2.2693345571146204),
 ('crap', -2.268185284893702),
 ('help fix', 2.211517521074139),
 ('uninstalled', -2.1532001186970193),
 ('android lollipop', 2.137874962827495),
 ('implement', 2.1182821178445566),
 ('new device', 2.098916130452183),
 ('thank advance', 2.0433105756039356),
 ('hello', 1.9717081438561912),
 ('anybody', -1.9628783624958042),
 ('fix soon', -1.875960093142918),
 ('landscape', -1.8633577216782073),
 ('authentication', -1.8593369144974021)]

# OMP

In [50]:
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_train_norm = X_train.toarray()
X_test_norm = X_test.toarray()
higher_norm = max([np.linalg.norm(X_train_norm[:, i]) for i in range(X_train.shape[1])])
       
for i in range(X_train.shape[1]):
    X_train_norm[:, i] = X_train_norm[:, i] * higher_norm / np.linalg.norm(X_train_norm[:, i])
    X_test_norm[:, i] = X_test_norm[:, i] * higher_norm / np.linalg.norm(X_test_norm[:, i])
    


In [51]:
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=50)
omp = omp.fit(X_train_norm, y_train)

In [52]:
print("Training score: ", r2_score(y_train, omp.predict(X_train.toarray())))
print("Test score: ", r2_score(y_test, omp.predict(X_test.toarray())))

Training score:  0.1375916152930886
Test score:  0.12245071098037597


In [53]:
from sklearn.model_selection import GridSearchCV
gcv = GridSearchCV(omp, param_grid={"n_nonzero_coefs": range(50, 200, 20)}, scoring="r2").fit(X_train_norm, y_train)

In [54]:
gcv.best_params_, gcv.best_score_

({'n_nonzero_coefs': 70}, 0.24360338002606324)

In [55]:
print("Training score: ", r2_score(y_train, gcv.predict(X_train.toarray())))
print("Test score: ", r2_score(y_test, gcv.predict(X_test.toarray())))

Training score:  0.14006064763073867
Test score:  0.12238541722925445


In [57]:
not_null_indices = [i for i,x in enumerate(gcv.best_estimator_.coef_) if x != 0]

    
sorted(list(filter(lambda x: x[1] != 0, list(zip(cv.get_feature_names(), omp.coef_)))), key=lambda x: abs(x[1]), reverse=True) 

[('good', 0.37385199931889396),
 ('hate', -0.2737301509432886),
 ('great', 0.2684764264587139),
 ('love', 0.2547241841589462),
 ('waste', -0.2180056923294354),
 ('work', -0.21730611938741895),
 ('update', -0.20757053883780585),
 ('ca', -0.20004980479795875),
 ('useless', -0.19532596923502082),
 ('nice', 0.19514717006953788),
 ('suck', -0.18531270549662388),
 ('bad', -0.1785654430123736),
 ('wo', -0.17363339686900806),
 ('crap', -0.15718869176397868),
 ('slow', -0.15367791595114538),
 ('fix', -0.15275144593278828),
 ('excellent', 0.14898977439908143),
 ('thank', 0.148406022757928),
 ('use', -0.14257288336508833),
 ('bad app', -0.1422452194913341),
 ('crash', -0.1415752209482942),
 ('annoying', -0.13948016769088462),
 ('poor', -0.13692826655806467),
 ('error', -0.13551178371683287),
 ('easy', 0.12958742427365827),
 ('stop', -0.12828252235582652),
 ('buddha', -0.12176985463743467),
 ('space', -0.11477242346391597),
 ('perfect', 0.11193463095756227),
 ('useful', 0.1102721943897527),
 ('wtf