In [342]:
from typing import List, Dict
from hazm import Lemmatizer, Normalizer, word_tokenize, stopwords_list
import pandas as pd
import hazm as hz
import pickle
import nltk

In [343]:
# Read the tab-separated Snappfood comment file into a DataFrame and display it.
dataset = pd.read_csv(
    './datasets/Snappfood - Sentiment Analysis.csv',
    sep="\t",
)
dataset

Unnamed: 0,comment,label,label_id
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0
4,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0
...,...,...,...
69995,سلام من به فاکتور غذاهایی که سفارش میدم احتیاج...,SAD,1
69996,سایز پیتزا نسبت به سفارشاتی که قبلا گذشتم کم ش...,SAD,1
69997,من قارچ اضافه رو اضافه کرده بودم بودم اما اگر ...,HAPPY,0
69998,همرو بعد ۲ساعت تاخیر اشتباه آوردن پولشم رفت رو...,SAD,1


In [344]:
# Number of rows per sentiment label.
dataset.loc[:, "label"].value_counts()

SAD      35000
HAPPY    35000
Name: label, dtype: int64

In [345]:
# The dataset has two classes (SAD and HAPPY); keep 20% of each class so the
# class distribution stays the same.
# Note: reset_index() without drop=True keeps the previous row labels as an
# extra "index" column in the resulting frame.
dataset = pd.concat(
    [
        dataset[dataset['label'] == class_name].sample(frac=0.2, random_state=97)
        for class_name in ('HAPPY', 'SAD')
    ]
).reset_index()
dataset["label"].value_counts()

HAPPY    7000
SAD      7000
Name: label, dtype: int64

In [346]:
# Length of the comment stored in the last row of the frame.
len(dataset["comment"].iloc[-1])

73

In [347]:
import re

def preprocessing_pipieline(sentence, stopwords: List[str], lemma: Lemmatizer, normalizer: Normalizer, index: int, dataset_len: int) -> List[str]:
    """Normalize, tokenize, filter and lemmatize a single comment.

    The comment is normalized with ``normalizer``, split into tokens with
    ``hz.word_tokenize``, stripped of stopwords and of tokens made up solely
    of ASCII letters, ASCII digits or underscores, and every remaining token
    is passed through ``lemma.lemmatize``.

    Args:
        sentence: Raw comment text.
        stopwords: Tokens to discard.
        lemma: Lemmatizer applied to every token that is kept.
        normalizer: Normalizer applied to the raw text before tokenizing.
        index: Zero-based position of this comment, used for progress output.
        dataset_len: Total number of comments, used for progress output.

    Returns:
        The lemmatized tokens that survived filtering, in their original order.
    """
    # Set membership is O(1); skip the copy when the caller already passes a set.
    stopword_set = stopwords if isinstance(stopwords, (set, frozenset)) else set(stopwords)

    tokens = hz.word_tokenize(normalizer.normalize(sentence))

    # Build a new list instead of calling list.remove() while iterating: removing
    # from the list being iterated skips the element after every removal, and
    # rebinding the loop variable never stored the lemmatized token.
    # The pattern only matches tokens that are entirely ASCII [a-zA-Z0-9_];
    # Persian tokens (including Persian digits) are kept.
    cleaned = [
        lemma.lemmatize(token)
        for token in tokens
        if token not in stopword_set and not re.match(r'^[a-zA-Z0-9_]+$', token)
    ]

    # Report progress whenever the completed share is an exact multiple of 5%.
    # Integer arithmetic avoids float-modulo rounding; the guard avoids a
    # division by zero for an empty dataset.
    done = index + 1
    if dataset_len > 0 and (100 * done) % (5 * dataset_len) == 0:
        percentage = 100 * done / dataset_len
        print(f"Processing iteration {index+1}/{dataset_len} ({percentage:.0f}%)")

    return cleaned

In [348]:
# Create the hazm resources once and reuse them for every comment.
stopwords = hz.stopwords_list()
lemmatizer = hz.Lemmatizer()
normalizer = hz.Normalizer(affix_spacing=False)
# Walk the (index label, comment) pairs; the index label plays the role of
# row.name in a row-wise apply. The result replaces the "comment" column with
# one token list per row.
dataset["comment"] = pd.Series(
    [
        preprocessing_pipieline(text, stopwords, lemmatizer, normalizer, row_label, len(dataset))
        for row_label, text in dataset["comment"].items()
    ],
    index=dataset.index,
)

Processing iteration 700/14000 (5%)
Processing iteration 1400/14000 (10%)
Processing iteration 2100/14000 (15%)
Processing iteration 2800/14000 (20%)
Processing iteration 3500/14000 (25%)
Processing iteration 4200/14000 (30%)
Processing iteration 4900/14000 (35%)
Processing iteration 5600/14000 (40%)
Processing iteration 6300/14000 (45%)
Processing iteration 7000/14000 (50%)
Processing iteration 7700/14000 (55%)
Processing iteration 8400/14000 (60%)
Processing iteration 9100/14000 (65%)
Processing iteration 9800/14000 (70%)
Processing iteration 10500/14000 (75%)
Processing iteration 11200/14000 (80%)
Processing iteration 11900/14000 (85%)
Processing iteration 12600/14000 (90%)
Processing iteration 13300/14000 (95%)
Processing iteration 14000/14000 (100%)


In [349]:
# Persist the preprocessed frame. The file is opened exactly once, inside a
# context manager, so the handle is closed as soon as the dump finishes.
with open('./datasets/preprocessed.pkl', 'wb') as f:
    pickle.dump(dataset, f)