# IMDB sentiment analysis with scikit-learn

## Fetch data

In [105]:
import tensorflow as tf
import numpy as np
import math 
from decimal import Decimal, getcontext

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=4000)

word_index = tf.keras.datasets.imdb.get_word_index()
index2word = dict((i + 3, word) for (word, i) in word_index.items())
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'
x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])

In [106]:
len(word_index)

88584

In [107]:
x_train[0]

"[bos] this film was just brilliant casting location scenery story direction [oov] really suited the part they played and you could just imagine being there robert [oov] is an amazing actor and now the same being director [oov] father came from the same [oov] island as myself so i loved the fact there was a real connection with this film the witty [oov] throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for [oov] and would recommend it to everyone to watch and the fly [oov] was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also [oov] to the two little [oov] that played the [oov] of norman and paul they were just brilliant children are often left out of the [oov] list i think because the stars that play them all grown up are such a big [oov] for the whole film but these children are amazing and should be [oov] for what they have done 

## Alternative:

In [108]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf  'aclImdb_v1.tar.gz'

'wget' is not recognized as an internal or external command,
operable program or batch file.
tar: Error opening archive: Failed to open ''aclImdb_v1.tar.gz''


## Create the vocabulary

In [109]:
from collections import Counter

vocabulary = list()
train_words = list()
sorted_words = list()
for text in x_train:
  tokens = text.split()
  train_words.extend(tokens)

Counter = Counter(train_words)
Counter_copy = Counter
temp = Counter.most_common(3998)
for key in temp:
    sorted_words.append(key[0])

#n=99, m = 1000, k = 2898
k=list()
n=list()
m = Counter.most_common(1100)
j=0
for key in m:
  j+=1
  if(j>=101):
    vocabulary.append(key[0])
 

for i in range(1,100):
  n.append(sorted_words[i])
j=0
for key in sorted_words:
  j+=1
  if(j<=1100):
    sorted_words.remove(key)
k = sorted_words.copy()

print(len(vocabulary))

1000


## Create binary vectors 

In [110]:
from tqdm import tqdm

x_train_binary = list()
x_test_binary = list()

for text in tqdm(x_train):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_train_binary.append(binary_vector)

x_train_binary = np.array(x_train_binary)

for text in tqdm(x_test):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_test_binary.append(binary_vector)

x_test_binary = np.array(x_test_binary)
#print(x_test_binary[0])
y_train_list = y_train.tolist()
print(x_train_binary[0])

vocabulary_indexes = list()
for i in range(len(vocabulary)):
  vocabulary_indexes.append(i)

100%|██████████| 25000/25000 [01:25<00:00, 291.42it/s]
100%|██████████| 25000/25000 [01:25<00:00, 291.88it/s]


[0 1 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

## Naive Bayes classifier


In [151]:
class Naive_Bayes:

    def __init__(self):
        #Λίστες με τις πιθανότητες να έχουμε την κάθε λέξη δεδομένου πως έχουμε αρνητικό ή θετικό review αντίστοιχα
        self.x1_while_c_is_negative = list()
        self.x1_while_c_is_positive = list()
        #καθολικές μεταβλητές με την πιθανότητα να έχουμε αρνητικό review και θετικό review αντιστοιχα
        self.p_c0 = float(0)
        self.p_c1 = float(0)

    def fit(self, X, Y):
        #Initialisations of elements:
        self.x1_while_c_is_negative = []
        self.x1_while_c_is_positive = []
        reviews = len(Y) 
        neg_reviews = 0 
        pos_reviews = 0
        sum_pos =  list() #Σε καθε θέση i του πίνακα: Πόσες φορες εμφανίζεται η λεξη i ενώ έχουμε θετικό review
        sum_neg =  list() #Σε καθε θέση i του πίνακα: Πόσες φορες εμφανίζεται η λεξη i ενώ έχουμε αρνητικό review
        p_ex_pos = list() #Each element represents for the word Xelement the probability: P( Xelement = 1 | C = 1) 
        p_ex_neg = list() #Each element represents for the word Xelement the probability: P( Xelement = 1 | C = 0) 

        #Υπολογισμός της γενικής πιθανότητας να έχουμε θετικό ή αρνητικό review:
        for i in range(reviews):
            if Y[i] == 0:
                neg_reviews += 1
            else:
                pos_reviews += 1
        self.pc0 = pos_reviews/reviews
        self.pc1 = neg_reviews/reviews

        #Υπολογισμός πιθανοτήτων να έχουμε την κάθε λέξη δεδομένου πως έχουμε αρνητικό ή θετικό review αντίστοιχα:
        

        #Αρχικοποίηση λιστών με μετρητές
        for i in range(len(vocabulary)):
            #βάζουμε ήδη 1 για να αποφύγουμε το να μην υπάρχει κάν μια λέξη
            sum_pos.append(0) 
            sum_neg.append(0)

        for i in range(reviews):
            for j in range(len(vocabulary)):
                if(Y[i] == 0 and X[i][j]==1):
                    sum_neg[j] +=1
                elif(Y[i] == 1 and X[i][j]==1):
                    sum_pos[j] +=1
        

        #Λίστες πιθανοτήτων να έχουμε την κάθε λέξη δεδομένου πως έχουμε αρνητικό ή θετικό review 
        for i in range(0,1000):
            p_ex_pos.append(0) 
            p_ex_neg.append(0)
        #for i in range(len(p_ex_neg_train)):
        for i in range(len(vocabulary)):
            p_ex_neg[i] = (sum_neg[i]+1)/(neg_reviews+2) #P(Xi = 1 | C = 0) 
        for i in range(len(vocabulary)):
            p_ex_pos[i] = (sum_pos[i]+1)/(pos_reviews+2) #P(Xi = 1 | C = 1)

        p_ex_neg = np.round(p_ex_neg, decimals=2)
        p_ex_pos = np.round(p_ex_pos, decimals=2)

        self.x1_while_c_is_positive = p_ex_pos.copy()
        self.x1_while_c_is_negative = p_ex_neg.copy()

    def predict(self, X):

        # In Naive Bayes classification here we used logarithms to prevent numerical underflow when dealing with probabilities. 
        # The standard Naive Bayes equation is the following: 
        # P(Class∣Features) = P(Features∣Class) * P(Class) / P(Features)
        # The logarithmic transformation simplifies computations:
        # log(P(Class∣Features)) = log(P(Features∣Class)) + log(P(Class)) - log(P(Features))
        # This ensures numerical stability and precision in probabilistic models.


        predictions = list()
        for i in range(len(X)):

            # Initialize log probabilities
            log_pc0 = np.log(self.pc0)
            log_pc1 = np.log(self.pc1)
            pc0 = self.pc0
            pc1 = self.pc1

            # Calculate log probability for negative class (C=0)
            for xi in range(len(X[i])):
                if X[i][xi] == 0:
                    log_pc0 += np.log(1 - self.x1_while_c_is_negative[xi])
                    pc0 +=(1 - self.x1_while_c_is_negative[xi])
                else:
                    log_pc0 += np.log(self.x1_while_c_is_negative[xi])
                    pc0 += (self.x1_while_c_is_negative[xi])

            # Calculate log probability for positive class (C=1)
            for xi in range(len(X[i])):
                if X[i][xi] == 1:
                    log_pc1 += np.log(self.x1_while_c_is_positive[xi])
                    pc1 += (self.x1_while_c_is_positive[xi])
                else:
                    log_pc1 += np.log(1 - self.x1_while_c_is_positive[xi])
                    pc1 += (1 - self.x1_while_c_is_positive[xi])

            if log_pc0 < log_pc1:
                predictions.append(1)
            else:
                predictions.append(0)

        return predictions


In [152]:
tool = Naive_Bayes()
y_train_list = y_train.tolist()
y_test_list = y_test.tolist()
tool.fit(x_train_binary, y_train_list)
y_pred = tool.predict(x_train_binary)
sum=0
for i in range(len(y_train_list)):
    if(y_train_list[i]==y_pred[i]):
        sum+=1
correct_percentage_test = (sum/len(y_train_list))*100
print(correct_percentage_test) 

# sum = 0
# p1 = tool.x1_while_c_is_negative.copy()
# p2 = tool.x1_while_c_is_positive.copy()
tool2 = Naive_Bayes()
# tool2.fit(x_test_binary, y_test_list)
y_pred = tool.predict(x_test_binary)
sum=0
for i in range(len(y_train_list)):
    if(y_test_list[i]==y_pred[i]):
        sum+=1
correct_percentage_test = (sum/len(y_train_list))*100
print(correct_percentage_test) 

print(sum)

82.948
82.588
20647


In [155]:
from sklearn.metrics import classification_report
nb = Naive_Bayes()
nb.fit(x_train_binary, y_train)

# Using Naive Bayes Classifier
y = nb.predict(x_test_binary)
print(classification_report(y_test, y))

              precision    recall  f1-score   support

           0       0.84      0.80      0.82     12500
           1       0.81      0.85      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



## ID3 Classifier

In [27]:
def IG(Y, Xi):
    
    pc0=list(Y).count(0)/len(Y) #P(C=0) 
    pc1=list(Y).count(1)/len(Y) #P(C=1)

    #H(C) = -P(C=0)*log2(P(C=0)) -P(C=1)*log2(P(C=1))
    Hc= -(pc0) * (math.log(pc0, 2)) -(pc1) * (math.log(pc1, 2))

    #P(Xi = 1)*H(C|Xi=1) + P(Xi = 0)*H(C|Xi=0)
    sum_calculations = 0 
    for digit in range(0,2):

        #P(Xi = digit)
        p = list(Xi).count(digit)/len(Xi)

        #H(C|Xi=digit)
        Y_Xi = list()
        for i in range(len(Xi)):
            if Xi[i]==digit:
                Y_Xi.append(Y[i])
        if(len(Y_Xi)!=0):
            for digit2 in range(0,2):
                #H(C=digit2|Xi=digit)
                pc = Y_Xi.count(digit2)/len(Y_Xi)
                if(pc != 0):
                    sum_calculations += p * pc * np.log2(pc)

    #IG(Y,Xi)= H(C) - P(Xi = 1)*H(C|Xi=1) + P(Xi = 0)*H(C|Xi=0)
    ig = Hc - sum_calculations
    return ig




class Tree():
    def __init__(self):
        self.word = "no word yet" #Η λέξη με την οποία θα έγινε το classification ενός υπόδεντρου
        self.tag = None #1 αν ο κόμβος έχει reviews με τη λέξη με την οποία έγινε το classification, 0 αν δεν την έχουν
        self.children = list() #τα παιδία ενός κόμβου
        self.classification = int #Η τελική classification. Παίρνει τιμή μόνο αν έχει γίνει 
    
    def new_child(self, node):
        self.children.append(node)

class ID3():
    def __init__(self, max_depth = 10):
        self.max_depth = max_depth
        self.depth = 0

    def most_IG(self, X, Y, vocabulary):

        max_gain = -1
        max_word= -1


        for w in vocabulary:
            x_word = list() #Λίστα με όλες τις τιμές που θα πάρει μια λέξη στον Χ
            for ex in range(len(X)):
                x_word.append(X[ex][w])
            word_ig = IG(Y, x_word) #Στέλνουμε το Υ και την λίστα στον ΙG, για να βρει το informtion gain της λέξης ανάλογικά με το Υ

            if(word_ig>max_gain):
                max_gain = word_ig
                max_word = w

        return max_word #Η λέξη με το μέγιστο Information Gain

    def fit(self, X, Y, vocabulary, default):
       
        if(len(Y) == 0):
            #αν φτάσαμε εδώ τελείωσαν τα Υ γιατί τελείωσε η κατάταξη κάθε review στο δέντρο.
            #Παίρνει για τιμή του classification εκείνη που επικρατούσε στο παραπάνω επίπεδο του δέντρο
            node = Tree()
            node.classification = default 
            return node 

        if(len(set(Y)) == 1):
            #Η μέθοδος set επιστρέφει ένα set με όλες τις διαφορετικές τιμές που περιλαμβάνει το όρισμα της, εδώ το Υ
            #Άρα φτάσαμε εδώ αν το Υ έχει μόνο μια τιμη, η οποία θα χρησιμοποιηθεί στο classification και σταματάει η διαδικασία. 
            node = Tree()
            node.classification = Y[0]
            return node

        if(len(vocabulary) == 0):
            #Αν φτάσαμε εδώ χρησιμοποιήσαμε όλες τις λέξεις οπότε δεν γίνονται παραπάνω κατατάξεις.
            #Η διαδικασία σταματάει και γίνεται classified με την τιμή που επικρατεί στα Y
            node = Tree()
            if(Y.count(0)>Y.count(1)):
                max_count = 0
            else:
                max_count = 1
            node.classification = max_count
            return node

        if (self.depth == self.max_depth):
            #Αν είμαστε εδώ φτάσαμε το max depth του δέντρου
            #Η διαδικασία σταματάει και γίνεται classified με την τιμή που επικρατεί στα Y. 
            #Αν έχουμε ισοπαλία αρνητικών θετικών reviews παίρνουμε το default, δηλαδή αυτή που επικρατούσε στο παραπάνω επίπεδο
            yes = True
            node = Tree()
            if((Y.count(0))>Y.count(1)):
                node.classification = 0
            elif((Y.count(0))<Y.count(1)):
                node.classification = 1
            else:
                node.classification =default
            return node

        #Σταματάει η διαδικασία αν υπερτερεί στα εναπομείναντα reviews είτε το 0 είτε το 1
        if (float(Y.count(1))/float(len(Y))>= 0.95):
            node = Tree()
            node.classification = 1                   
            return node
        
        if(float(Y.count(0))/float(len(Y))>= 0.95):
            node = Tree()
            node.classification = 0
            return node

        #Αποθήκευση του clssification που επικρατεί μέχρι στιγμης ώστε να περασθεί ως default στα παρακάτω επίπεδα
        if(Y.count(1)>Y.count(0)):
            max_count = 1
        else:
            max_count = 0


        best_word = self.most_IG(X, Y, vocabulary) #Εύρεση της λέξης με το μέγιστο Information Gain 
        tree = Tree() #Αρχικοποίηση υπόδεντρου

        #το νεο λεξιλόγιο, χωρίς την λέξη που θα χρσιμοποιηθέι για τον διαχωρισμό σε φύλλα τωρα ωστέ να μην ξαναχρησιμοποιηθεί μετά
        new_vocabulary = vocabulary.copy() 
        new_vocabulary.remove(best_word)
        self.depth += 1 #ενημέρωση του depth

        for zero_or_one in range(2):
            # if(zero_or_one==0):
            #     print(len(new_vocabulary))
            #Oι νέες λίστες reviews, δημιουργούνται 2 για κάθε κατηγορία(Υ και Χ) λόγω της for, μια με τη best_word και μία χωρις 
            x_new = list()
            y_new = list()
            for i in range(len(X)):
                if X[i][best_word] == zero_or_one:
                    x_new.append(X[i])
                    y_new.append(Y[i])
            subtree = self.fit(x_new, y_new, new_vocabulary, max_count)
            subtree.tag = zero_or_one 
            subtree.word = best_word
            tree.new_child(subtree)            
                
        return tree

    # def predict_sample(self, x_sample, tree):
    #     decided = False
    #     sub_tree = tree
    #     while not decided:
    #         feature = sub_tree.children[0].feature
    #         for sub in sub_tree.children:
    #             if sub.tag == x_sample[feature]:
    #                 sub_tree = sub
    #         if (sub_tree.decision == 1 or sub_tree.decision == 0):
    #             decided = True
    #     return sub_tree.decision

    def singular_prediction(self, X, tree):
        sub_tree = tree #Αρχικοποίηση Υπόδεντρου
        flag = False
        while not flag:
            word_feature = sub_tree.children[0].word #Παίρνουμε τη λέξη με την οποία έγινε ο διαχωρισμός
            for sub in sub_tree.children:
                if (sub.tag == X[word_feature]): 
                    #Αν υπάρχει η λέξη-κριτήριο με την οποία έγινε ο διαχωρισμος σε αυτο το επίπεδο στο sample που κοιταμε 
                    # πάμε στο υπόδεντρο οπόυ το tag είναι 1(δηλ. έχει reviews που την περιλαμβανουν), αλλιώς σε αυτο που ειναι 0
                    sub_tree = sub
            if(sub_tree.classification == 1 or sub_tree.classification == 0):
                #Σταματάμε αν φτάσουμε σε κάποιο φυλλο. Τα φύλλα έχουν τιμή 0 ή 1 και αυτο καταλήγει να ειναι το classification
                # του sample. Οι άλλοι ενδιάμεσοι κόμβοι έχουν None στο classification
                flag = True
        return sub_tree.classification

    def predict(self, tree, X):
        y_pred = list()
        for i in range(len(X)):
            y_pred.append(self.singular_prediction(X[i], tree)) #πρόβλεψη για κάθε review ξεχωριστά
        
        return y_pred

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(x_train_binary, y_train)
y_pred = dt.predict(x_test_binary)
y_test_list = y_test.tolist()
sum=0
for i in range(len(y_test)):
    if(y_test_list[i]==y_pred[i]):
        sum+=1
correct_percentage_test = (sum/len(y_test_list))*100
print(correct_percentage_test)
#print(classification_report(y_train, dt.predict(x_train_binary)))

70.372


In [29]:
# clf = ID3()
# tree = clf.fit(x_train_binary, y_train, list(range(len(x_train_binary[0]))), 1)
# prediction = clf.predict(tree, x_test_binary)
#meta th fit:
#y_pred = model.predict(trained_tree, x_train_binary)
# y_train_list = y_train.tolist()
# sum=0
# for i in range(len(y_train)):
#     if(y_train_list[i]==y_pred[i]):
#         sum+=1
# correct_percentage_train = (sum/len(y_train_list))*100
# print(correct_percentage_train)

# print(len(vocabulary_indexes))
model = ID3(400)
y_train_list = y_train.tolist()
y_test_list = y_test.tolist()
trained_tree = model.fit(x_train_binary, y_train_list, vocabulary_indexes, 0)

y_pred = model.predict(trained_tree, x_train_binary)
sum=0
for i in range(len(y_test)):
    if(y_train_list[i]==y_pred[i]):
        sum+=1
correct_percentage_test = (sum/len(y_test_list))*100
print(correct_percentage_test)

#trained_tree = model.fit(x_test_binary, y_test_list, vocabulary_indexes, 0)
y_pred = model.predict(trained_tree, x_test_binary)
y_test_list = y_test.tolist()
sum=0
for i in range(len(y_test)):
    if(y_test_list[i]==y_pred[i]):
        sum+=1
correct_percentage_test = (sum/len(y_test_list))*100
print(correct_percentage_test)

50.0
50.0


In [62]:
print(classification_report(y_train, dt.predict(x_train_binary)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12500
           1       1.00      1.00      1.00     12500

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00      1.00      1.00     25000



In [45]:
print(classification_report(y_test, dt.predict(x_test_binary)))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71     12500
           1       0.71      0.71      0.71     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000



## Random Forest classifier

In [30]:
import random

class Random_Forest():
    def __init__(self, num_of_words, trees = 10):
        self.num_of_words = num_of_words #Αριθμός των λέξεων
        self.trees = trees #Αριθμός των δέντρων που θα φτιαχθούν
        self.forest = list() #Λίστα Δέντων

    def new_sample(self, X, Y):
        #Αρχικοποίηση των νέων x και y
        x_new = list()
        y_new = list()

        y_indexes = list() #Tα indexes των reviews που δεν έχουν επιλεχθεί
        for i in range(len(Y)):
            y_indexes.append(i)

        for i in range(len(X)):
            #Τυχαία επιλογή reviews για το υποσύνολο που θα επιστρέψει η μέθοδος, χρησιμοποιώντας τα indexes που φτιάχτηκε πάνω
            random_choice = random.choice(y_indexes) 
            x_new.append(X[random_choice])
            y_new.append(Y[random_choice])

        return x_new, y_new

    def new_vocabulary(self, X):
        #Λίστα με τα indexes του λεξιλογιου για τυχαία επιλογή των νέων λέξεων του νέου λεξιλογίου που επιστρέφει η μέθοδος 
        words_indexes = list()
        for x in range(len(X[0])):
            words_indexes.append(x)

        new_words = list()
        for i in range(self.num_of_words):
            random_word = random.choice(words_indexes) #Tυχαία επιλογή λέξης
            words_indexes.remove(random_word) #Αφαίρεση από το παλιό λεξιλόγιο
            new_words.append(random_word) #Εισαγωγή στο καινούριο

        return new_words

    def fit(self, X, Y, max_depth = 10):
        for i in range(self.trees):
            id3 = ID3(max_depth) #Δημιουργία id3 δέντρου
            random_x, random_y = self.new_sample(X, Y)
            tree = id3.fit(random_x, random_y, self.new_vocabulary(random_x), 0)
            self.forest.append(tree)

    def predict(self, X):
        y_pred = list()
        for i in range(len(X)):
            zeros =0
            ones = 0
            for j in range(self.trees):
                id3 = ID3()
                prediction = id3.singular_prediction(X[i], self.forest[j])
                if (prediction == 1):
                    ones += 1
                elif(prediction==0):
                    zeros +=1
            if ones>zeros:
                y_pred.append(1)
            else:
                y_pred.append(0)
        return y_pred

In [31]:
model = Random_Forest(len(vocabulary))
trained_forest = model.fit(x_train_binary, y_train_list)

y_pred = model.predict(x_test_binary)
y_test_list = y_test.tolist()
sum=0
for i in range(len(y_test)):
    if(y_test_list[i]==y_pred[i]):
        sum+=1
correct_percentage_test = (sum/len(y_test_list))*100
print(correct_percentage_test)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4896\4269910413.py", line 2, in <module>
    trained_forest = model.fit(x_train_binary, y_train_list)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4896\3730876292.py", line 44, in fit
    tree = id3.fit(random_x, random_y, self.new_vocabulary(random_x), 0)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4896\462271505.py", line 145, in fit
    subtree = self.fit(x_new, y_new, new_vocabulary, max_count)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4896\462271505.py", line 145, in fit
    subtree = self.fit(x_new, y_new, new_vocabulary, max_count)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4896\462271505.py", line 145, in fit
 

In [47]:
print(classification_report(y_test, rf.predict(x_test_binary)))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



## AdaBoost classifier
An AdaBoost classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.

In [50]:
# Decision stump used as weak classifier
class DecisionStump():
    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        n_samples =np.shape(X) 
        print(n_samples)
        X_column = X[:, self.feature_idx]
        predictions = list()
        for i in range(len(n_samples)):
            predictions.append(1)
        if self.polarity == 1:
            for i in range(len(n_samples)):
                if(X_column[i] < self.threshold):
                    predictions[i] = 0
        else:
            for i in range(len(n_samples)):
                if(X_column[i] > self.threshold):
                    predictions[i] = 0

        return predictions


class Adaboost():

    def __init__(self, n_clf=5):
        self.n_clf = n_clf

    def fit(self, X, y):
        n_samples = len(X)
        n_features = len(X)
        print(n_features)
        # Initialize weights to 1/N
        #w = np.full(n_samples, (1 / n_samples)) kanei to idio
        w = list()
        for i in range(n_samples):
            w.append(1/n_samples)

        self.clfs = list()
        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()

            min_error = float(1000000000000)
            # greedy search to find best threshold and feature
            #for feature_j in 
            for feature_i in range(n_features):
                X_column = X[feature_i]
                print(len(X_column)) 
                #print(feature_i)
                thresholds = np.unique(X_column)
                #print(X_column)
                #print(X)

                for threshold in thresholds:
                    # predict with polarity 1
                    p = 1
                    predictions = list()
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1
                    #for i in range(n_samples):
                    #    predictions.append(1)
                    #for i in range(n_samples):
                    #    if(X_column < threshold):
                    #        predictions[i] = 0

                    print(predictions)
                    # Error = sum of weights of misclassified samples
                    misclassified = list()
                    for i in range(n_samples):
                        if (predictions[i]!=y):
                            misclassified.append(w[i])
                    error = 0
                    for i in range(len(misclassified)):
                        error += misclassified[i]

                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # store the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error

            # calculate alpha
            EPS = 1e-10
            clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))

            # calculate predictions and update weights
            predictions = clf.predict(X)

            w *= np.exp(-clf.alpha * y * predictions)
            # Normalize to one
            w /= np.sum(w)

            # Save classifier
            self.clfs.append(clf)

    def predict(self, X):
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_pred = np.sum(clf_preds, axis=0)
        y_pred = np.sign(y_pred)

        return y_pred

In [51]:
def accuracy(y_true, y_pred):
    sum=0
    for i in range(len(y_true)):
        if(y_true[i]==y_pred[i]):
            sum+=1
    correct_percentage_test = (sum/y_true)*100
    return correct_percentage_test

In [52]:
ada = Adaboost()
ada.fit(x_train_binary, y_train)
y_pred = ada.predict(x_test_binary)

acc = accuracy(y_train, y_pred)
print(acc)

25000
1000


IndexError: boolean index did not match indexed array along dimension 0; dimension is 25000 but corresponding boolean dimension is 1000

In [50]:
print(classification_report(y_test, ab.predict(x_test_binary)))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80     12500
           1       0.79      0.84      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

