In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
COLAB_DATASET_PATH = "/content/drive/MyDrive/NLP/CA2/preprocessed.pkl"
LOCAL_DATASET_PATH = './datasets/preprocessed.pkl'
dataset = pd.read_pickle(COLAB_DATASET_PATH)

In [4]:
dataset = dataset.sample(frac=1, random_state=4)
train_data = dataset[:int(len(dataset)*0.9)]
test_data = dataset[int(len(dataset)*0.9):]
print("Dataset size: ", len(dataset))
print("Test Data size: ", len(test_data))
print("Train Data size: ", len(train_data))

Dataset size:  14000
Test Data size:  1400
Train Data size:  12600


In [5]:

vocabulary = set()
tf = dict()


def make_vocab(sentence):

    for word in sentence:
         vocabulary.add(word)


train_data["comment"].apply(make_vocab)


8756     None
5474     None
11242    None
7820     None
7909     None
         ... 
10142    None
8828     None
11554    None
11609    None
904      None
Name: comment, Length: 12600, dtype: object

In [6]:
train_data["comment"].apply(make_vocab)

8756     None
5474     None
11242    None
7820     None
7909     None
         ... 
10142    None
8828     None
11554    None
11609    None
904      None
Name: comment, Length: 12600, dtype: object

Check the difference between the set and dict below

In [7]:
word_index = dict()
word_idf = dict()
def map_word_index(vocabulary):
    for index,word in enumerate(vocabulary):
        word_index[word] = index
        

In [8]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa mmio_stale_data retbleed
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
addres

In [9]:
from tqdm import tqdm
def compute_idf(word,dataset_req):
    count = 0
    for comment in dataset_req["comment"]:
        if word in comment:
            count += 1
    return np.log(len(dataset_req["comment"])/count)

for index, word in enumerate(tqdm(vocabulary)):
    # print(index)
    word_idf[word] = compute_idf(word,train_data)
    percentage = 100*(index+1)/len(vocabulary)
    if percentage % 1 == 0:
        print(f"Processing iteration {index+1}/{len(vocabulary)} ({percentage:.0f}%)")




100%|██████████| 13783/13783 [02:49<00:00, 81.14it/s]

Processing iteration 13783/13783 (100%)





In [10]:

def compute_tf(word,comment):
    freq = 0 
    for part in comment:        
        if word == part:
            freq += 1
    return freq/len(comment)
    

def fit_tfidf(comment,index,dataset_len):
    vec = np.zeros(len(vocabulary))
    for word in comment:
        if word in vocabulary:
            tf = compute_tf(word,comment)
            vec[word_index[word]] = tf*word_idf[word]
    
    
    percentage = 100*(index+1)/dataset_len
    if percentage % 2 == 0:
        print(f"Processing iteration {index+1}/{dataset_len} ({percentage:.0f}%)")

    return vec

In [11]:
train_data

Unnamed: 0,index,comment,label,label_id
8756,57756,"[بسیار, زود, زودتر, زمان, انتظار, ممنون, ., پی...",SAD,1
5474,27700,"[پیشنهاد, میدم, غذاهاشون, تست]",HAPPY,0
11242,53635,"[یه, برگر, ۱, ساعت, نیم, کشید, بجای, غذا, آمبو...",SAD,1
7820,36810,"[غذا, از, یکساعت, کشید, برسه, پیک, که, رستوران...",SAD,1
7909,55278,"[نونش, واقعا, نا, مناسبه, این, قسمت, دورچین, و...",SAD,1
...,...,...,...,...
10142,21171,"[رستوران, منطقه‌ست, پاستای, خوشمزه, قابل, قبول...",SAD,1
8828,39351,"[واقعا, افتضاح, ., هیچکس, پیشنهاد, نمیکنم, .]",SAD,1
11554,21938,"[اسم, شرینی, کیک, تنوری, هست, ،, قاعدتا, اساس,...",SAD,1
11609,16224,"[واقعا, بد, و, اصلا, شبیه, برند, باماهاس, ۲, ا...",SAD,1


In [12]:
map_word_index(vocabulary)
# train_data = train_data.reset_index().sort_index()
train_data["vectorized"] = train_data.apply(lambda row: fit_tfidf(row["comment"],row.name,len(train_data)),axis=1)


Processing iteration 6300/12600 (50%)
Processing iteration 11592/12600 (92%)
Processing iteration 1008/12600 (8%)
Processing iteration 13608/12600 (108%)
Processing iteration 4536/12600 (36%)
Processing iteration 7560/12600 (60%)
Processing iteration 6048/12600 (48%)
Processing iteration 4788/12600 (38%)
Processing iteration 5544/12600 (44%)
Processing iteration 2268/12600 (18%)
Processing iteration 1764/12600 (14%)
Processing iteration 3780/12600 (30%)
Processing iteration 5796/12600 (46%)
Processing iteration 504/12600 (4%)
Processing iteration 8064/12600 (64%)
Processing iteration 3528/12600 (28%)
Processing iteration 1512/12600 (12%)
Processing iteration 4284/12600 (34%)
Processing iteration 10584/12600 (84%)
Processing iteration 13356/12600 (106%)
Processing iteration 6552/12600 (52%)
Processing iteration 7056/12600 (56%)
Processing iteration 5292/12600 (42%)
Processing iteration 12852/12600 (102%)
Processing iteration 1260/12600 (10%)
Processing iteration 5040/12600 (40%)
Process

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["vectorized"] = train_data.apply(lambda row: fit_tfidf(row["comment"],row.name,len(train_data)),axis=1)


In [13]:
test_data = test_data.reset_index().sort_index()
test_data["vectorized"] = test_data.apply(lambda row: fit_tfidf(row["comment"],row.name,len(test_data)),axis=1)


Processing iteration 28/1400 (2%)
Processing iteration 56/1400 (4%)
Processing iteration 84/1400 (6%)
Processing iteration 112/1400 (8%)
Processing iteration 140/1400 (10%)
Processing iteration 168/1400 (12%)
Processing iteration 196/1400 (14%)
Processing iteration 224/1400 (16%)
Processing iteration 252/1400 (18%)
Processing iteration 280/1400 (20%)
Processing iteration 308/1400 (22%)
Processing iteration 336/1400 (24%)
Processing iteration 364/1400 (26%)
Processing iteration 392/1400 (28%)
Processing iteration 420/1400 (30%)
Processing iteration 448/1400 (32%)
Processing iteration 476/1400 (34%)
Processing iteration 504/1400 (36%)
Processing iteration 532/1400 (38%)
Processing iteration 560/1400 (40%)
Processing iteration 588/1400 (42%)
Processing iteration 616/1400 (44%)
Processing iteration 644/1400 (46%)
Processing iteration 672/1400 (48%)
Processing iteration 700/1400 (50%)
Processing iteration 728/1400 (52%)
Processing iteration 756/1400 (54%)
Processing iteration 784/1400 (56%)

In [14]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(train_data["vectorized"].to_list(),train_data["label_id"].to_list())

In [15]:
y_pred = clf.predict(test_data["vectorized"].to_list())

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_pred,test_data["label_id"].to_list()))


              precision    recall  f1-score   support

           0       0.75      0.86      0.80       605
           1       0.88      0.78      0.83       795

    accuracy                           0.82      1400
   macro avg       0.82      0.82      0.82      1400
weighted avg       0.83      0.82      0.82      1400

