<a href="https://colab.research.google.com/github/VavRe/nlp-ut/blob/main/CA2/PPMI_Sentiment_Snapp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.7/316.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394491 sha256=23cf00e4beeda363186dd

In [4]:
import pickle
import pandas as pd
import numpy as np
import string
import hazm
from tqdm import tqdm

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
COLAB_DATASET_PATH = "/content/drive/MyDrive/NLP/CA2/preprocessed.pkl"
LOCAL_DATASET_PATH = './datasets/preprocessed.pkl'
dataset = pd.read_pickle(COLAB_DATASET_PATH)

In [7]:
dataset = dataset.sample(frac=1, random_state=4)
train_data = dataset[:int(len(dataset)*0.9)]
test_data = dataset[int(len(dataset)*0.9):]
print("Dataset size: ", len(dataset))
print("Test Data size: ", len(test_data))
print("Train Data size: ", len(train_data))

Dataset size:  14000
Test Data size:  1400
Train Data size:  12600


In [8]:

vocabulary = set()
tf = dict()


def make_vocab(sentence):

    for word in sentence:
         vocabulary.add(word)


train_data["comment"].apply(make_vocab)


8756     None
5474     None
11242    None
7820     None
7909     None
         ... 
10142    None
8828     None
11554    None
11609    None
904      None
Name: comment, Length: 12600, dtype: object

In [9]:
word_index = dict()
def map_word_index(vocabulary):
    for index,word in enumerate(vocabulary):
        word_index[word] = index
        
map_word_index(vocabulary)

In [10]:
co_occurance_matrix = np.zeros([len(vocabulary),len(vocabulary)])
stopwords = hazm.stopwords_list()
for comment in tqdm(train_data["comment"], desc="Comment Loop"):
    for word in comment:
        for other_words in comment:
            if other_words != word and other_words not in string.punctuation and other_words not in stopwords:
                co_occurance_matrix[word_index[word]][word_index[other_words]] += 1

Comment Loop: 100%|██████████| 12600/12600 [00:45<00:00, 274.71it/s]


In [11]:
def check_most_cooc(word):
  largest_indices = np.argpartition(co_occurance_matrix[word_index[word]], -5)[-5:]
  for i in largest_indices:
    print(list(vocabulary)[i])

check_most_cooc("نوشابه")

گرم
،
ارسال
غذا
سفارش


In [12]:
probability_matrix = co_occurance_matrix/np.sum(co_occurance_matrix)
word_probability = np.sum(probability_matrix,1)

# Vectorized, Efficient Way
ppmi_matrix = np.zeros([len(vocabulary),len(vocabulary)])
outer = np.outer(word_probability, word_probability)
ppmi_matrix = np.where(outer != 0, np.maximum(np.log(probability_matrix/outer), 0), 0)
np.fill_diagonal(ppmi_matrix, 0)

# Not Efficient Way
# ppmi_matrix = np.zeros([len(vocabulary),len(vocabulary)])
# for i in tqdm(range(len(vocabulary)), desc="PPMI Matrix"):
#     for j in range(len(vocabulary)):
#         if i != j:
#             ppmi_matrix[i][j] = max(np.log(probability_matrix[i][j]/(word_probability[i]*word_probability[j])),0)

  ppmi_matrix = np.where(outer != 0, np.maximum(np.log(probability_matrix/outer), 0), 0)
  ppmi_matrix = np.where(outer != 0, np.maximum(np.log(probability_matrix/outer), 0), 0)
  ppmi_matrix = np.where(outer != 0, np.maximum(np.log(probability_matrix/outer), 0), 0)


In [13]:
def fit_ppmi(comment):
    vectors = np.zeros(len(comment),len(vocabulary))
    for idx,word in enumerate(comment):
        if word in vocabulary:
          vectors[idx] = ppmi_matrix[word_index[word]]
    return np.mean(vectors,0)

fit_ppmi = lambda comment: np.mean([ppmi_matrix[word_index[word]]  for word in comment if word in vocabulary],0)

In [14]:
train_data["ppmi_vectorized"] = train_data["comment"].apply(fit_ppmi)
test_data["ppmi_vectorized"] = test_data["comment"].apply(fit_ppmi)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["ppmi_vectorized"] = train_data["comment"].apply(fit_ppmi)
  return _methods._mean(a, axis=axis, dtype=dtype,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["ppmi_vectorized"] = test_data["comment"].apply(fit_ppmi)


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
clf = MultinomialNB()
clf.fit(train_data["ppmi_vectorized"].to_list(),train_data["label_id"].to_list())


In [49]:
test_data = test_data.dropna()
test_X = test_data["ppmi_vectorized"].to_numpy()
test = test_data["ppmi_vectorized"].to_numpy()
test = np.vstack(test)

pred = clf.predict(test)

print(classification_report(pred,test_data["label_id"].to_list()))


              precision    recall  f1-score   support

           0       0.70      0.85      0.77       577
           1       0.87      0.75      0.81       822

    accuracy                           0.79      1399
   macro avg       0.79      0.80      0.79      1399
weighted avg       0.80      0.79      0.79      1399

