# Tehnike i modeli analize sentimenta u NLP-u

## 1. Priprema projekta

In [11]:
!pip install nltk
!pip install scikit-learn
!pip install pandas
!pip install autocorrect
!pip install matplotlib
!pip install gensim
!pip install numpy

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0m

In [None]:
# Priprema foldera
!mkdir -p inputs
!mkdir -p outputs

# 2. Učitavanje podataka

<p>Kao izvor podataka, koristio sam <a href="http://help.sentiment140.com/for-students">Sentiment-140</a> dataset za analizu sentimenta.</p>

In [1]:
import nltk
import pandas as pd

In [2]:
colNames = [
    'polarity',
    'id',
    'post_datetime',
    'query',
    'user',
    'tweet'
]

df = pd.read_csv(
    'inputs/sentiment-140-dataset.csv', 
    encoding='UTF', 
    names=colNames,
    encoding_errors='ignore'
)

<p>Možemo iskoristiti random biblioteku za učitavanje nasumičnog subseta podataka. Ja ću stalno koristiti isti seed tako da svaki put dobijem isti subset podataka.</p>
<p>Uzeli smo subset od 20000 podataka zato da možemo relativno brzo istrenirati model.</p>

In [3]:
import random
# uzmi random subset
# ukoliko zelis da ti svaki put subset bude isti, uzmi isti seed
df_subset = df[['polarity', 'tweet']].sample(20000, random_state=46)
# spremanje naseg subseta u csv
df_subset.to_csv('outputs/sentiment-140-subset.csv', index=False)

In [4]:
df_subset = pd.read_csv('outputs/sentiment-140-subset.csv')

In [5]:
df_subset.polarity.value_counts()

polarity
0    10009
4     9991
Name: count, dtype: int64

<p>Možemo primjetiti da naš dataset nema drugih polariteta osim pozitivnog i negativnog (nema neutralnog polariteta). To znači da će nam biti jednostavnije trenirati naš model.</p>

In [6]:
df_subset.polarity.replace(4, 1, inplace=True)

<p>Većina binarnih algoritama očekuje 0 ili 1 kao klasifikaciju. Zato ćemo, radi jednostavnosti, sve četvorke zamijeniti s jedinicama.</p>

## 3. Preprocesiranje teksta

In [7]:
# ciscenje teksta
import re
from autocorrect import Speller
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [8]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/tomislav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tomislav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [110]:
def formalize(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def removeMentionsAndHashTags(text):
    hashtagPattern = re.compile(r"#[A-Za-z0-9_]+")
    mentionPattern = re.compile(r"@[A-Za-z0-9_]+")
    
    #Removing hashtags and mentions
    text = hashtagPattern.sub("", text)
    text = mentionPattern.sub("", text)

    return text

def removeUrls(text):
    urlPatterns = [
        re.compile(r"https?://\S+"),
        re.compile(r"www\.\S+")
    ]
    #Removing links
    for pattern in urlPatterns:
        text = pattern.sub("", text)
    return text

def removeNumbers(text):
    #removing numbers
    text = re.sub("[0-9]","", text)
    return text

# mozda maknuti
def removePunctuation(text):
    #removing punctuation
    text = re.sub(r'[^\w\s]','', text)
    return text

def removeSingleQuotes(text):
    #removing single quotes
    text = re.sub(r"\'","", text)
    return text

def removeStopwords(text):
    #removing stopwords
    text = text.split(' ')
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def preprocess(
    text,
    HASH_TAGS_MENTIONS=False,
    URLS=False,
    NUMBERS=False,
    PUNCTUATION=False,
    SINGLE_QUOTES=False,
    STOP_WORDS=True,
    SHORT_WORDS=False
):
    spell = Speller(lang='en')
    lemmatizer = WordNetLemmatizer()

    text = " ".join([word.lower() for word in text.split()])

    if(not HASH_TAGS_MENTIONS):
        text = removeMentionsAndHashTags(text)
    if(not URLS):
        text = removeUrls(text)
    if(not NUMBERS):
        text = removeNumbers(text)
    if(not STOP_WORDS):
        text = removeStopwords(text)
    if(not PUNCTUATION):
        text = removePunctuation(text)  
    if(not SINGLE_QUOTES):
        text = removeSingleQuotes(text)
    
    tokens = word_tokenize(text)
    tokens = [formalize(token) for token in tokens]
    tokens = [spell(token) for token in tokens]
    tokens = [lemmatizer.lemmatize(token) for token in tokens if len(token) > 1]

    # remove short words
    if(not SHORT_WORDS):
        tokens = [token for token in tokens if len(token) > 2]

    return " ".join(tokens)

In [None]:
df_subset.tweet.values[0]

In [111]:
tweets = [preprocess(tweet) for tweet in df_subset.tweet.values]

In [112]:
from sklearn.model_selection import train_test_split

# podjela na train i test set
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    df_subset.polarity.values,
    test_size=0.2
)

## 4. Naive Bayes klasifikatori

### Bernoulli Naive Bayes klasifikator koristeći one-hot encoding

In [113]:
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier
from sklearn.metrics import classification_report, accuracy_score

In [114]:
def buildDataset(X,y):
    #build the dataset
    words = [word_tokenize(sent) for sent in X]
    dataset = list(zip(words, y))
    
    return dataset

all_words = FreqDist(sum([w.split(" ") for w in X_train],[]))
word_features = list(all_words)[:2000]
def documentFeatures(words):
    wordSet = set(words)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in wordSet)

    return features

In [164]:
with open('outputs/bernoulli_array.pickle', 'wb') as f:
    pickle.dump(word_features, f)

In [115]:
train_set = [(documentFeatures(d), y) for (d,y) in buildDataset(X_train, y_train)]
test_set = [(documentFeatures(d), y) for (d,y) in buildDataset(X_test, y_test)]

In [116]:
import pickle

def saveClassifier(classifier, filePath):
   f = open(filePath, 'wb')
   pickle.dump(classifier, f, -1)
   f.close()

def loadClassifier(filePath):
   f = open(filePath, 'rb')
   classifier = pickle.load(f)
   f.close()
   return classifier

In [117]:
nb_classifier = NaiveBayesClassifier.train(train_set)

In [118]:
# Preciznost modela:
y_pred = [nb_classifier.classify(d) for (d, _) in test_set]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.77      0.74      0.75      2083
           1       0.73      0.76      0.74      1917

    accuracy                           0.75      4000
   macro avg       0.75      0.75      0.75      4000
weighted avg       0.75      0.75      0.75      4000



In [119]:
nb_classifier.show_most_informative_features(10)

Most Informative Features
          contains(arch) = True                0 : 1      =     15.3 : 1.0
           contains(sad) = True                0 : 1      =     14.9 : 1.0
           contains(rip) = True                0 : 1      =     13.2 : 1.0
         contains(smile) = True                1 : 0      =     13.0 : 1.0
         contains(proud) = True                1 : 0      =     12.1 : 1.0
         contains(sadly) = True                0 : 1      =     12.0 : 1.0
          contains(sick) = True                0 : 1      =     11.6 : 1.0
         contains(bless) = True                1 : 0      =     11.5 : 1.0
       contains(welcome) = True                1 : 0      =     11.5 : 1.0
           contains(boo) = True                0 : 1      =     11.4 : 1.0


In [120]:
saveClassifier(nb_classifier, 'outputs/bernoulli_naive_bayes.pickle')

**Zanimljiv rezultat se događa ukoliko pokušamo maknuti stopwords iz modela**

In [121]:
X_train_no_stops = [preprocess(tweet, STOP_WORDS=True) for tweet in X_train]
X_test_no_stops = [preprocess(tweet, STOP_WORDS=True) for tweet in X_test]

In [27]:
train_set = [(documentFeatures(d), y) for (d,y) in buildDataset(X_train_no_stops, y_train)]
test_set = [(documentFeatures(d), y) for (d,y) in buildDataset(X_test_no_stops, y_test)]

In [122]:
nb_no_stops_classifier = NaiveBayesClassifier.train(train_set)

<p>U nekim slučajevima, stopwords poboljšavaju sveukupnu točnost modela, a nekad poboljšavaju prosječnu točnost modela, ali pogoršavaju točnost predviđanja pozitivnih rezultata, ovisno o datasetu.</p>

In [123]:
y_pred = [nb_no_stops_classifier.classify(d) for (d, _) in test_set]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.77      0.74      0.75      2083
           1       0.73      0.76      0.74      1917

    accuracy                           0.75      4000
   macro avg       0.75      0.75      0.75      4000
weighted avg       0.75      0.75      0.75      4000



In [124]:
saveClassifier(nb_no_stops_classifier, 'outputs/bernoulli_naive_bayes_no_stops.pickle')

### Gaussian Naive Bayes klasifikator koristeći one-hot encoding

In [161]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=2000)
X_train_gaussian = vectorizer.fit_transform(X_train)
X_test_gaussian = vectorizer.transform(X_test)

vectorizer.

In [141]:
pickle.dump(vectorizer, open('outputs/count_vectorizer.pickle', 'wb'))

In [32]:
# treniranje modela
gaussian_nb_classifier = GaussianNB()
gaussian_nb_classifier.fit(X_train_gaussian.toarray(), y_train)
y_pred = gaussian_nb_classifier.predict(X_test_gaussian.toarray())

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print a classification report with precision, recall, and F1-score
print(classification_report(y_test, y_pred))

Accuracy: 0.63
              precision    recall  f1-score   support

           0       0.72      0.45      0.56      2033
           1       0.59      0.82      0.69      1967

    accuracy                           0.63      4000
   macro avg       0.66      0.64      0.62      4000
weighted avg       0.66      0.63      0.62      4000



In [33]:
saveClassifier(gaussian_nb_classifier, 'outputs/gaussian_naive_bayes.pickle')

### Multinomial Naive Bayes klasifikator koristeći TF-IDF vektorizaciju

In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2000)

In [144]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [145]:
from sklearn.naive_bayes import MultinomialNB

multi_nb_classifier = MultinomialNB()
multi_nb_classifier.fit(X_train_tfidf.toarray(), y_train)
y_pred = multi_nb_classifier.predict(X_test_tfidf.toarray())

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print a classification report with precision, recall, and F1-score
print(classification_report(y_test, y_pred))

Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      2083
           1       0.73      0.75      0.74      1917

    accuracy                           0.75      4000
   macro avg       0.75      0.75      0.75      4000
weighted avg       0.75      0.75      0.75      4000



In [146]:
saveClassifier(multi_nb_classifier, 'outputs/multinomial_naive_bayes.pickle')

In [147]:
pickle.dump(vectorizer, open('outputs/tfidf_vectorizer.pickle', 'wb'))

## 5. Logistička regresija

### Logistička regresija koristeći word2vec

In [37]:
from nltk.corpus import brown

In [38]:
from gensim.models import Word2Vec

In [39]:
# priprema word2vec modela
vectorSize = 100
word2VecModel = Word2Vec(
    sentences = [[word.lower() for word in sent] for sent in brown.sents()], 
    vector_size = vectorSize, 
    window = 5, 
    min_count = 2, 
    sg = 0
)

In [148]:
import numpy as np
# Kreiranje word embeddinga (korištenjem average word vectors tehnike)
def averageWordVectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0

    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])

    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)

    return feature_vector


In [149]:
X_train_w2v = [averageWordVectors(doc, word2VecModel, word2VecModel.wv.index_to_key, vectorSize) for doc in [word_tokenize(sent) for sent in X_train]]
X_test_w2v = [averageWordVectors(doc, word2VecModel, word2VecModel.wv.index_to_key, vectorSize) for doc in [word_tokenize(sent) for sent in X_test]]

In [150]:
# Treniranje modela logističke regresije
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train_w2v, y_train)
y_pred = lr_model.predict(X_test_w2v)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.65      0.62      0.63      2083
           1       0.61      0.63      0.62      1917

    accuracy                           0.63      4000
   macro avg       0.63      0.63      0.63      4000
weighted avg       0.63      0.63      0.63      4000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [156]:
saveClassifier(lr_model, 'outputs/logistic_regression.pickle')

## 6. LSTM - Long Short Term Memory Network

In [157]:
import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,Model
from keras.layers import Embedding,Dense,Dropout,Bidirectional,LSTM

In [45]:
tokenizer = Tokenizer(num_words=None) #num_words označava maksimalan broj riječi koje će se uzeti u obzir
#num_words=None znači da će se uzeti sve riječi

tokenizer.fit_on_texts(X_train)

In [46]:
dict_size = len(tokenizer.word_counts.items())
num_of_tokens = dict_size + 1

In [47]:
#Pripremljene podatke pretvaramo u nizove

X_seq = tokenizer.texts_to_sequences(X_train)

#Spremamo najdulju duljinu nekog tweeta
max_tweet_length = np.max(np.array([len(t) for t in X_seq]))

X_train_padded = pad_sequences(X_seq, maxlen=max_tweet_length, padding='post')


In [48]:
def load_embedding_model(file):
    embedding_model = {}
    with open(file,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            embedding_model[word] = embedding
    return embedding_model

In [49]:
embedding_index_fasttext = load_embedding_model('inputs/wiki-news-300d-1M.vec')

In [50]:
def embedding_matrix(num_tokens,embedding_dim,embedding_index):
    hits=0
    misses=[]

    # Pripremamo matricu za embedding
    embedding_matrix = np.zeros((num_tokens, embedding_dim))

    for word, i in tokenizer.word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # Rijeci koje nisu pronađene u embedding modelu će biti sve nule
            # Ovo uključuje reprezentaciju za "padding" i "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses.append(word)
    return embedding_matrix

In [51]:
embedding_dim=300
embedding_matrix_fasttext = embedding_matrix(num_of_tokens,embedding_dim,embedding_index_fasttext)

In [52]:
embedding_layer_fasttext = Embedding(
    input_dim=num_of_tokens,
    output_dim=embedding_dim,
    input_length=max_tweet_length,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix_fasttext),
    trainable=False,
)
lstm_model = Sequential()
#enbedding
lstm_model.add(embedding_layer_fasttext)
lstm_model.add(Dropout(0.5))
#LSTM
lstm_model.add(Bidirectional(LSTM(8,dropout=0.5,recurrent_dropout=0.2)))
# Vannila skriveni sloj
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(units=1, activation='sigmoid',name='predictions'))

In [53]:
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-3, epsilon=1e-08, clipnorm=1.0), 
              loss="binary_crossentropy",
              metrics=['accuracy'])



In [54]:
history = lstm_model.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=2)

Epoch 1/10
200/200 - 8s - loss: 0.6529 - accuracy: 0.6122 - val_loss: 0.6022 - val_accuracy: 0.6672 - 8s/epoch - 38ms/step
Epoch 2/10
200/200 - 6s - loss: 0.6106 - accuracy: 0.6745 - val_loss: 0.5496 - val_accuracy: 0.7275 - 6s/epoch - 32ms/step
Epoch 3/10
200/200 - 6s - loss: 0.5968 - accuracy: 0.6914 - val_loss: 0.5281 - val_accuracy: 0.7469 - 6s/epoch - 32ms/step
Epoch 4/10
200/200 - 6s - loss: 0.5889 - accuracy: 0.6973 - val_loss: 0.5183 - val_accuracy: 0.7428 - 6s/epoch - 32ms/step
Epoch 5/10
200/200 - 6s - loss: 0.5834 - accuracy: 0.7009 - val_loss: 0.5097 - val_accuracy: 0.7491 - 6s/epoch - 32ms/step
Epoch 6/10
200/200 - 7s - loss: 0.5713 - accuracy: 0.7102 - val_loss: 0.5202 - val_accuracy: 0.7447 - 7s/epoch - 33ms/step
Epoch 7/10
200/200 - 7s - loss: 0.5733 - accuracy: 0.7122 - val_loss: 0.5130 - val_accuracy: 0.7500 - 7s/epoch - 33ms/step
Epoch 8/10
200/200 - 7s - loss: 0.5687 - accuracy: 0.7116 - val_loss: 0.5192 - val_accuracy: 0.7475 - 7s/epoch - 35ms/step
Epoch 9/10
200/2

In [55]:
Qtest = tokenizer.texts_to_sequences(X_test)
Ptest = pad_sequences(Qtest,maxlen=max_tweet_length,padding='post' )
print("LSTM model evaluation with fasttext 300d embedding:")
print(lstm_model.evaluate(Ptest, y_test))

LSTM model evaluation with fasttext 300d embedding:
[0.5128079652786255, 0.7590000033378601]


In [158]:
lstm_model.save('outputs/lstm.h5')

  saving_api.save_model(


# 7. Google BERT

In [57]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
tf.config.set_visible_devices([], 'GPU')

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
def textCleaner(text):
    text = text.lower()
    text = removeMentionsAndHashTags(text)
    text = removeUrls(text)
    text = removeNumbers(text)
    return text

def preprocess(X):
    X_cleaned = [textCleaner(text) for text in X]
    return X_cleaned

In [60]:
def exampleConverter(text, tokenizer, max_length=128):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  
        max_length=max_length, 
        pad_to_max_length=True,  
        return_attention_mask=True,
    )
    return encoding

def exampleToDict(input_ids, attention_mask, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
    }, label



In [61]:
def encodeExamples(X, y, tokenizer, max_length=128):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    
    for text, label in zip(X, y):
        bert_input = exampleConverter(text, tokenizer, max_length)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
        
    dataset = tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list))
    return dataset.map(exampleToDict)

In [None]:
#Podaci za treniranje
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
ds_train_encoded = encodeExamples(preprocess(X_train), y_train, bert_tokenizer).shuffle(100).batch(32).repeat(2)
ds_val_encoded = encodeExamples(preprocess(X_validation), y_validation, bert_tokenizer).batch(32)
ds_test_encoded = encodeExamples(preprocess(X_test), y_test, bert_tokenizer).batch(32)

In [63]:
learning_rate = 3e-5

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])



In [64]:
bert_model.fit(ds_train_encoded, epochs=2, validation_data=ds_val_encoded)

loss, acc = bert_model.evaluate(ds_test_encoded, verbose=0)
print("accuracy: {:5.2f}%".format(100 * acc))

Epoch 1/2


Epoch 2/2
accuracy: 74.70%


In [65]:
bert_model.save_pretrained("outputs/bert_model", saved_model=True)

INFO:tensorflow:Assets written to: outputs/bert_model/saved_model/1/assets


INFO:tensorflow:Assets written to: outputs/bert_model/saved_model/1/assets


In [67]:
bert_tokenizer.save_pretrained("outputs/bert_tokenizer")

('outputs/bert_tokenizer/tokenizer_config.json',
 'outputs/bert_tokenizer/special_tokens_map.json',
 'outputs/bert_tokenizer/vocab.txt',
 'outputs/bert_tokenizer/added_tokens.json')