In [1]:
import glob
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from scipy.sparse import save_npz
from joblib import dump
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
data_files_list = glob.glob("../local_data/raw/*.json")

In [3]:
data_frame_list = []
for path in tqdm(data_files_list):
    df = pd.read_json(path, orient='records')
    data_frame_list.append(df)

df = pd.concat(data_frame_list, axis=0)
df.head()

100%|██████████| 250/250 [00:57<00:00,  4.38it/s]


Unnamed: 0,title,text,score
0,,Tik tok is the best app ever,5
1,,I love it,5
2,,Nice,5
3,,GREAT,5
4,,Good,3


In [4]:
df['text'] = df['text'].fillna('')

In [5]:
df_small = df.iloc[:1000000]
df_small.shape

(1000000, 3)

In [86]:
vectorizer = CountVectorizer(
                       strip_accents = 'unicode',
                       lowercase = True,
                       stop_words = 'english',
                       token_pattern=r'(?u)(\b[a-z]{2,}\b|[\u263a-\U0001f645])',
                       ngram_range=(1, 1),
                       min_df = 1000,
                       binary=True)

In [87]:
experement_id = 11

In [88]:
text_vectorized = vectorizer.fit_transform(df['text'])

In [89]:
text_vectorized.shape

(25000000, 8284)

In [90]:
save_npz(f'../local_data/data_vectorized_{experement_id}.npz', text_vectorized)

In [91]:
dump(vectorizer, f'./models/vectorizer_{experement_id}.sav')

['./models/vectorizer_11.sav']

In [92]:
df['labels'] = (df['score'] >= 3).astype(np.int32).values

In [20]:
df.to_csv('../local_data/data_labels.csv', columns=['labels'])
print(1)

1


In [None]:
#Small model

In [54]:
text_vectorized = vectorizer.fit_transform(df_small['text'])

In [55]:
text_vectorized.shape

(1000000, 935)

In [56]:
save_npz(f'../local_data/data_vectorized_{experement_id}.npz', text_vectorized)

In [57]:
dump(vectorizer, f'./models/vectorizer_{experement_id}.sav')

['./models/vectorizer_10.sav']

In [62]:
df_small['labels'] = (df_small['score'] >= 3).astype(np.int32).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
df_small.to_csv('../local_data/data_labels_small.csv', columns=['labels'])
print(1)

1


In [85]:
#Display features

In [67]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(text_vectorized, df_small['labels'])

MultinomialNB()

In [84]:
z = zip(model.feature_log_prob_[1,:], vectorizer.get_feature_names_out())
s = sorted(z,  key=lambda it: it[0])
list(s)

[(-9.870677940487795, 'garbage'),
 (-9.645005942820227, 'advertising'),
 (-9.610262994376354, 'ruined'),
 (-9.590938721549952, '👎'),
 (-9.539645427162402, 'scam'),
 (-9.526101202054644, 'greedy'),
 (-9.495193664591568, 'bother'),
 (-9.473687459370604, 'wasted'),
 (-9.448476040024108, 'awful'),
 (-9.395938706496226, 'joke'),
 (-9.380313388593144, 'cancel'),
 (-9.320144867126693, 'fake'),
 (-9.263392050534243, 'deleting'),
 (-9.256519171246481, '😠'),
 (-9.249693206176081, 'grab'),
 (-9.249693206176081, 'unplayable'),
 (-9.20317319054119, 'dumb'),
 (-9.18070033468913, 'crashed'),
 (-9.128138004598274, 'disappointing'),
 (-9.098462236452157, 'password'),
 (-9.098462236452157, 'zero'),
 (-9.078201051312336, 'trash'),
 (-9.061155184039347, 'respond'),
 (-9.058342242662732, 'asked'),
 (-9.027911125460193, 'uninstalling'),
 (-9.019769967876492, 'absolute'),
 (-9.017070910907329, 'removed'),
 (-8.998378777895175, 'shame'),
 (-8.993101720794332, 'taken'),
 (-8.985237984334116, 'charge'),
 (-8.98