>Dataset: [Turkish_Movie_Sentiment](https://www.win.tue.nl/~mpechen/projects/smm/)

>It consists of 5331 positive and 5331 negative movie reviews in Turkish.

###Load Dataset

In [1]:
!curl https://www.win.tue.nl/~mpechen/projects/smm/Turkish_Movie_Sentiment.zip -o ./Turkish_Movie_Sentiment.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  520k  100  520k    0     0   604k      0 --:--:-- --:--:-- --:--:--  604k


In [2]:
!unzip /content/Turkish_Movie_Sentiment.zip -d /content

Archive:  /content/Turkish_Movie_Sentiment.zip
  inflating: /content/tr_polarity.neg  
  inflating: /content/tr_polarity.pos  


In [3]:
# Positive Reviews
with open("./tr_polarity.pos", "rb") as f:
  review_pos_list = f.read().decode("iso-8859-9").replace("\r", "").split("\n")

# Negative Reviews
with open("./tr_polarity.neg", "rb") as f:
  review_neg_list = f.read().decode("iso-8859-9").replace("\r", "").split("\n")

In [4]:
print(f"First 5 positive reviews of total {len(review_pos_list)}:\n ", *review_pos_list[:5], sep="\n")
print(f"\n\nFirst 5 negative review of total {len(review_neg_list)}:\n ", *review_neg_list[:5], sep="\n")

First 5 positive reviews of total 5332:
 
gerçekten harika bir yapim birçok kez izledim gene izlerim özgürlük askini ve ingilizlerin ne kadar vahset olduklarini gözler önüne seren bir film ve tabi ki ask.... 
her izledigimde hayranlik duydugum gerçek klasik diyebilecegimiz filmlerden . içinde teknik hatalar barindirsa bile sinema olgusunun en üst noktalarindan.. 
gerçekten tarihi savas filmleri arasinda tartismasiz en iyisi , 12 yil boyunca acaba ikincisi çekirimi diye bekledigim bir film ,belki william wallace babasinin ölümünden sonra amcasi yanina almisti onu yetistirmisti belki bunu anlatan mükkemmel bir filim olablilr=). 
aldigi ödülleri sonuna dek hak eden muhtesem bir basyapit . 
özgürlük denilince aklima gelen ilk film.bir basyapit.. 


First 5 negative review of total 5332:
 
giseye oynayan bir film.mel gibson'in oyunculugu yine çok kötü.film bastan sona duygu sömürüsü ama anlayan nerde!. 
bircok yonden sahip olduklari zayifliklari populerligi iyi kullanmasiyla gidermis zayif 

In [5]:
import pandas as pd

In [6]:
# Convert reviews to pandas dataframe
dataset = pd.DataFrame([[1, prev] for prev in review_pos_list if prev] + [[0, nrev] for nrev in review_neg_list if nrev] , columns=["label", "review"])
dataset

Unnamed: 0,label,review
0,1,gerçekten harika bir yapim birçok kez izledim ...
1,1,her izledigimde hayranlik duydugum gerçek klas...
2,1,gerçekten tarihi savas filmleri arasinda tarti...
3,1,aldigi ödülleri sonuna dek hak eden muhtesem b...
4,1,özgürlük denilince aklima gelen ilk film.bir b...
...,...,...
10656,0,"yarisina bile gelmeden sikilip biraktim,murat ..."
10657,0,rezalet bir senaryo rezalet oyunculuklar(tuba ...
10658,0,nerden bulmuslar böyle yönetmeni oyuncuyu bast...
10659,0,konu:bilindik senaryo:basit kurgu:çakma geriye...


###Tokenization, Punctuations, Stop Words, Case Folding

> Tokeinizing, removing punctuations and stopword, applying case folding.

In [7]:
# nltk library for tokenizing, punctuations and stopwords
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
from nltk.corpus import stopwords;
nltk.download("stopwords");
stop_words = stopwords.words("turkish");

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
print(len(stop_words), stop_words)

53 ['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani']


In [10]:
from nltk.tokenize import word_tokenize
import re
# Tokenizing, removing punctuations and stop words, applying case folding
for idx, row in dataset.iterrows():
    dataset.at[idx, "review"] = [token.lower() for token in word_tokenize(re.sub(r"\.", " . ", row["review"]))  if token.isalpha() and (token.lower() not in stop_words)]

###Train-Test Splits

In [11]:
# Train Dataset
train_dataset = dataset.sample(frac=0.8, random_state=1)

In [12]:
train_dataset

Unnamed: 0,label,review
1777,1,"[iyi, film, noluyor, lan, demekten, filme, kon..."
6818,0,"[das, experimenti, yeniden, çekmisler, gerek, ..."
1305,1,"[yaw, herhangi, bir, kanal, filmi, yayinlasa, ..."
6106,0,"[filmi, dün, gece, izledim, bekledigim, çikmad..."
1185,1,"[tom, hanks, senaryo, iyi, film, zaten, asmisss]"
...,...,...
7062,0,"[acayip, zorlama, bir, film, olmus, puani, pop..."
4690,1,"[dvd, sini, nerdeyse, hafta, önce, izledim, sa..."
6497,0,"[serinin, iyi, filmiydi, serinin, ilk, filmi, ..."
10558,0,"[seyrettim, foktan, filimdi, gidip, korsanini,..."


In [13]:
len(train_dataset)

8529

In [14]:
train_dataset["label"].value_counts()

1    4275
0    4254
Name: label, dtype: int64

In [15]:
# Test Dataset
test_dataset = dataset.drop(train_dataset.index)

In [16]:
test_dataset

Unnamed: 0,label,review
0,1,"[gerçekten, harika, bir, yapim, birçok, izledi..."
2,1,"[gerçekten, tarihi, savas, filmleri, arasinda,..."
15,1,"[önce, bir, çikarip, izledim, senedir, arada, ..."
18,1,"[iskoçyanin, evlatlari, baslayip, özgürlügü, a..."
20,1,"[kesinlikle, kaçirilmamasi, gerekne, filmlerin..."
...,...,...
10639,0,"[hafta, sonu, izledim, basrol, oyuncusunun, ka..."
10644,0,"[yakinda, apoya, film, çekilirse, sasirmam]"
10647,0,"[kendileri, inanacakki, karsiyida, inandirarak..."
10650,0,"[oldu, olacak, oscara, aday, gösterelim, filmm..."


In [17]:
len(test_dataset)

2132

In [18]:
test_dataset["label"].value_counts()

0    1076
1    1056
Name: label, dtype: int64

### Build Vocabulary

In [19]:
total_tokens = 0
token_set = set()
for _, tokens in train_dataset["review"].items():
  total_tokens += len(tokens)
  for token in tokens:
    token_set.add(token)

# Add <UNKNOWN> to token set for occurence of words that are not in corpus.
token_set.add("<UNKNOWN>")

In [20]:
print(f"Corpus size: {total_tokens}, Vocabulary size: {len(token_set)}\n")
print("First 10 tokens:")
i = 0
for token in token_set:
  if i >= 10:
    break
  print(token)
  i += 1

Corpus size: 145979, Vocabulary size: 25227

First 10 tokens:
getirecegini
zirva
bolt
vazgeçin
gidersiniz
vermemek
hop
uyamak
beklicemm
kacirmamak


###Vectorization

In [21]:
# token-to-index mapping
token2idx = {}
for idx, term in enumerate(token_set):
  token2idx.update({term:idx})
print(token2idx)

{'getirecegini': 0, 'zirva': 1, 'bolt': 2, 'vazgeçin': 3, 'gidersiniz': 4, 'vermemek': 5, 'hop': 6, 'uyamak': 7, 'beklicemm': 8, 'kacirmamak': 9, 'degilmis': 10, 'vermeyin': 11, 'dalina': 12, 'halkanin': 13, 'denemeyi': 14, 'içinse': 15, 'isaya': 16, 'zorluklar': 17, 'seyredin': 18, 'okunmadan': 19, 'yasantidan': 20, 'yetiskinler': 21, 'okdar': 22, 'yapsin': 23, 'rage': 24, 'bogulmus': 25, 'kalici': 26, 'karsiyasiniz': 27, 'entellikle': 28, 'sonlu': 29, 'büyüyünce': 30, 'meslegi': 31, 'vaktiniz': 32, 'talihsizlikleri': 33, 'igreç': 34, 'fransizca': 35, 'bulenk': 36, 'mükkemmelddiiiiiiiiiiiiiiii': 37, 'kalan': 38, 'tutsagi': 39, 'kevin': 40, 'ustad': 41, 'biografinin': 42, 'oynadigin': 43, 'uzakdogu': 44, 'minicik': 45, 'gizemlilik': 46, 'vurma': 47, 'kardestirler': 48, 'dolduramaz': 49, 'sevmemek': 50, 'laik': 51, 'yazmaktansa': 52, 'tuba': 53, 'tesüflerimi': 54, 'hakimdi': 55, 'bitmemeis': 56, 'çikamiyor': 57, 'angels': 58, 'dakikacik': 59, 'birbirlerinden': 60, 'nakit': 61, 'ötesiydi

In [22]:
# Length of each review token wise
train_dataset["length"] = train_dataset["review"].apply(lambda r: len(r))

In [23]:
train_dataset

Unnamed: 0,label,review,length
1777,1,"[iyi, film, noluyor, lan, demekten, filme, kon...",8
6818,0,"[das, experimenti, yeniden, çekmisler, gerek, ...",14
1305,1,"[yaw, herhangi, bir, kanal, filmi, yayinlasa, ...",27
6106,0,"[filmi, dün, gece, izledim, bekledigim, çikmad...",9
1185,1,"[tom, hanks, senaryo, iyi, film, zaten, asmisss]",7
...,...,...,...
7062,0,"[acayip, zorlama, bir, film, olmus, puani, pop...",10
4690,1,"[dvd, sini, nerdeyse, hafta, önce, izledim, sa...",22
6497,0,"[serinin, iyi, filmiydi, serinin, ilk, filmi, ...",33
10558,0,"[seyrettim, foktan, filimdi, gidip, korsanini,...",31


In [24]:
import numpy as np

In [25]:
vocab_size = len(token_set)
number_of_reviews = train_dataset.shape[0]

# number of reviews that tokens appears in for each token
token_DFs = [0] * vocab_size

# matrix of number of occurences for each token in each review
# token_by_review[token_index, document_index]
token_by_review = np.zeros((vocab_size, number_of_reviews))

for idx, (_, row) in enumerate(train_dataset.iterrows()):

  # DF
  review_token_set = set(row["review"])
  for token in review_token_set:
    token_idx = token2idx[token]
    token_DFs[token_idx] += 1

  # TF
  for token in row["review"]:
    token_idx = token2idx[token]
    token_by_review[token_idx, idx] += 1

In [26]:
print(token_DFs)
print(token_by_review)

[1, 2, 1, 3, 1, 2, 1, 1, 1, 1, 6, 8, 1, 1, 1, 2, 1, 1, 35, 1, 1, 1, 1, 6, 1, 2, 2, 1, 1, 1, 2, 1, 24, 1, 2, 1, 1, 1, 36, 1, 33, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 3, 1, 1, 5, 1, 2, 1, 8, 1, 3, 1, 4, 3, 1, 1, 1, 1, 1, 1, 1, 2, 18, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 9, 1, 3, 2, 1, 3, 1, 1, 1, 1, 5, 1, 3, 1, 1, 4, 4, 6, 1, 1, 1, 8, 1, 1, 2, 1, 1, 53, 5, 4, 1, 1, 1, 1, 1, 1, 1, 5, 4, 1, 5, 4, 1, 18, 1, 1, 24, 15, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 76, 1, 2, 2, 7, 2, 1, 1, 1, 1, 1, 2, 10, 1, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 2, 2, 19, 5, 2, 1, 2, 2, 1, 26, 2, 10, 1, 1, 1, 1, 1, 2, 1, 1, 2, 17, 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 10, 1, 3, 1, 1, 14, 1, 1, 7, 23, 1, 1, 3, 1, 1, 1, 5, 1, 1, 1, 1, 4, 5, 1, 1, 1, 1, 3, 2, 1, 1, 3, 1, 1, 5, 1, 1, 2, 1, 1, 25, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 26, 1, 1, 2, 1, 1, 1, 1, 3, 6, 1, 1, 1, 61, 1, 3, 2, 4, 1, 3, 12, 4, 6, 1, 1, 93, 1, 1, 1, 2, 1, 4, 1, 2, 1, 1, 9, 9, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 2, 1

In [27]:
import math

def get_tfidf(token_idx, review_idx,):
  tf = token_by_review[token_idx, review_idx]
  idf = math.log(number_of_reviews / token_DFs[token_idx])
  return tf * idf

In [28]:
review_vectors = []
for idx, (_, row) in enumerate(train_dataset.iterrows()):
  review_vector = [0] * vocab_size
  review_tokens = set(row["review"])
  for token in review_tokens:
    token_idx = token2idx[token]
    review_vector[token_idx] += get_tfidf(token_idx, idx)
  review_vectors.append([row["label"], review_vector])

train_vector_df = pd.DataFrame(review_vectors, columns=["label", "vector"])

In [29]:
train_vector_df

Unnamed: 0,label,vector
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
8524,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8525,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8526,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8527,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


###Model Training

In [30]:
import numpy as np

# Vectors for positive & negative sentiments
V = len(token_set)
pos_vector = np.array([0.0] * V) # Initial positive sentiment vector
neg_vector = np.array([0.0] * V) # Initial negative sentiment vector

for _, row in train_vector_df.iterrows():
  vector = np.array(row["vector"])
  if row["label"] == 0:
      neg_vector += vector
  else:
      pos_vector += vector

In [31]:
print(f"positive sentiment vector of size: {len(pos_vector)}\n", pos_vector[:100])
print(f"negative sentiment vector of size: {len(neg_vector)}\n", neg_vector[:100])

positive sentiment vector of size: 25227
 [  9.0512274    0.           0.           0.           9.0512274
  16.71616044  18.1024548    9.0512274    9.0512274    9.0512274
  29.03787172   6.97178586   9.0512274    0.           0.
   0.           0.           9.0512274  153.88462149   0.
   9.0512274    0.           9.0512274    0.           0.
   0.           8.35808022   0.           0.           9.0512274
  16.71616044   9.0512274   17.61952071   9.0512274    0.
   0.           0.           9.0512274   65.61250154   0.
 138.86799597   9.0512274    9.0512274    9.0512274    8.35808022
   9.0512274    9.0512274    9.0512274    9.0512274    9.0512274
   9.0512274   16.71616044   0.           0.           0.
   0.           9.0512274    0.           9.0512274    0.
   0.           0.           7.95261511   9.0512274    9.0512274
   9.0512274    9.0512274    0.           7.10531725   9.0512274
   0.           0.           9.0512274    0.          23.85784533
   9.0512274    0.          14

###Similarity Measure: Cosine

In [32]:
def cosine_sim(vector1, vector2):
  dot_product = np.dot(vector1, vector2)  # dot product of the vectors
  magnitude1 = np.linalg.norm(vector1)    # length of vector1
  magnitude2 = np.linalg.norm(vector2)    # length of vector2

  cosine_similarity = dot_product / (magnitude1 * magnitude2)
  return cosine_similarity

###Testing and Result

In [33]:
# Document vectors of Test Set
test_vector_df = None
vocab_size = len(token_set)
test_doc_vectors = []
for _, row in test_dataset.iterrows():
  vector = [0] * vocab_size # initial vector
  for token in row["review"]:
    idx = token2idx.get(token, token2idx.get("<UNKNOWN>"))
    vector[idx] += 1
  test_doc_vectors.append([row["label"], vector])

test_vector_df = pd.DataFrame(test_doc_vectors, columns= ["label", "vector"])

In [34]:
test_vector_df

Unnamed: 0,label,vector
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
2127,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2128,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2129,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2130,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [35]:
test_labels, test_vectors  = test_vector_df.iloc[:,0], test_vector_df.iloc[:,1]
test_labels.value_counts()

0    1076
1    1056
Name: label, dtype: int64

In [36]:
# make all predictions
predicted_labels = []
for vector in test_vectors:
  pos_sim = cosine_sim(np.array(vector), pos_vector)
  neg_sim = cosine_sim(np.array(vector), neg_vector)
  predicted_labels.append((1 if pos_sim > neg_sim else 0))

  cosine_similarity = dot_product / (magnitude1 * magnitude2)


In [37]:
# Check predictions
len(predicted_labels)

2132

In [38]:
# check if prediction made correctly or not for each prediction
total_predictions = len(predicted_labels)
correct_predictions = 0
pos_pos = 0 # predicted pos, it is pos   (correct prediction)
neg_neg = 0 # predicted neg, it is neg   (correct prediction)
pos_neg = 0 # predicted pos, it is neg   (uncorrect prediction)
neg_pos = 0 # predicted neg, it is pos   (uncorrect prediction)
for idx ,prediction in enumerate(predicted_labels):
  label = test_labels[idx]

  if prediction and label:
    pos_pos += 1
  elif prediction and not label:
    pos_neg += 1
  elif not prediction and not label:
    neg_neg += 1
  else:
    neg_pos += 1

  correct_predictions += 1 if prediction == test_labels[idx] else 0

print(f"Success rate: {correct_predictions / total_predictions:.4g}")
print(f"\nPrediction\tLabel")
print(f"positive\tpositive\t{pos_pos / total_predictions:.4g}")
print(f"negative\tnegative\t{neg_neg / total_predictions:.4g}")
print(f"positive\tnegative\t{pos_neg / total_predictions:.4g}")
print(f"negative\tpositive\t{neg_pos / total_predictions:.4g}")

Success rate: 0.834

Prediction	Label
positive	positive	0.4447
negative	negative	0.3893
positive	negative	0.1154
negative	positive	0.05066
