# Dataset: sentiment-analysis-on-movie-reviews

The raw-data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data. </p>
First, let's explore the dataset using pandas. The columns of the dataset are tab-delimited. The dataset contains 156060 instances:

In [66]:
import os 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [68]:
def read_file(path):
    filename, extension = os.path.splitext(path)

    if extension == '.csv':
        rawdata = pd.read_csv(path, header=0, delimiter=',')
    elif extension == '.xlsx':
        rawdata = pd.read_excel(path, header=0)
    elif extension == '.tsv':
        rawdata = pd.read_csv(path, header=0, delimiter='\t')
    return rawdata

In [53]:
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = BeautifulSoup(raw, 'lxml').get_text()
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = []
        #non_stopwords = [word for word in words if not word in stops]
        for word in words:
            if not word in stops:
                non_stopwords.append(word)
            else:
                pass
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        reviews.append(lemma_words)
#         print("text  = ", text)
#         print("only = ", only_text)
#         print("words = ", words)
#         print("stops = ", stops)
#         print("non stops = ", non_stopwords)
#         print("lemma = ", lemma_words)
#         print("-------------------------------")
    return reviews

In [54]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [55]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [56]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

In [100]:
class DecisionTree:
    def __init__(self, max_depth=100, min_samples_split=300):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None
        self.classes = None
        
    def _is_finished(self, depth, parent_samples):
        if (depth >= self.max_depth
            or self.classes == 1
            or self.n_samples < self.min_samples_split
            or self.n_samples == parent_samples
            or self.n_samples == 0):
            return True
        return False
    
    def _entropy(self, y):
        proportions = np.bincount(y) / len(y)
        entropy = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return entropy
    
    def _gini_index(self, y):
        size = len(y)
        instances = [0] * len(np.unique(y))
        for label in y:
            instances[int(label)] += 1
        return 1 - np.sum([(val/size)**2 for val in instances]) if size > 0 else 100
    
    def _create_split(self, X, thresh):
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()
        return left_idx, right_idx

    def _information_gain(self, X, y, thresh):
        parent_loss = self._entropy(y)
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)
        
        if n_left == 0 or n_right == 0: 
            return 0
        
        child_loss = (n_left / n) * self._entropy(y[left_idx]) + (n_right / n) * self._entropy(y[right_idx])
        return parent_loss - child_loss

    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}
        
        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)

                if score > split['score']:
#                     print(score, " > ", split['score'])
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh

        return split['score'], split['feat'], split['thresh']
    
    def _build_tree(self, X, y, depth=0, parent_samples=0):
        #self.n_samples = len(X) 
        #self.n_features = len(X[0])
        self.n_samples, self.n_features = X.shape
        self.classes = len(np.unique(y))
        print(X.shape)
        

        # get best split
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        
        best_score, best_feat, best_thresh = self._best_split(X, y, rnd_feats)
        
        # stopping criteria
        if self._is_finished(depth, parent_samples) or best_score == 0:
            most_common_Label = np.argmax(np.bincount(y))
            return Node(value=most_common_Label)
        # grow children recursively
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1, self.n_samples)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1, self.n_samples)
        return Node(best_feat, best_thresh, left_child, right_child)
    
    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        predictions = [self._traverse_tree(x, self.root) for x in X]
        return np.array(predictions)

In [63]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [92]:
dfx = read_file('./Dataset2_train/X_train.xlsx')
dfy = read_file('./Dataset2_train/y_train.xlsx')
# dfx = read_file('./Dataset1_train/train/X_train.xlsx')
# dfy = read_file('./Dataset1_train/train/y_train.xlsx')
df = pd.concat([dfx, dfy], axis=1)
df = df.dropna()
df = df.reset_index(drop=True)

In [101]:
descriptive_feature = dfx.columns
target_feature = dfy.columns[0]
print(descriptive_feature)
print(target_feature)
print(dfx.values)

Index(['Phrase'], dtype='object')
Sentiment
[['going to a house party and']
 ['a grand picture']
 ['lightweight meaning']
 ...
 ['Indian musical']
 [', you get a lot of running around , screaming and death .']
 ['Irish playwright , poet and drinker']]


In [102]:
X, y = dfx.values, dfy.values
train_text = preprocess_data(df)
target = df.Sentiment.values
X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

100%|████████████████████████████████████████████████████████████████████████| 124847/124847 [00:40<00:00, 3052.33it/s]


In [103]:
X_train_, X_val_ = tokenizer_preprocess(X_train, X_val)  

100%|████████████████████████████████████████████████████████████████████████| 99877/99877 [00:00<00:00, 897913.80it/s]


In [104]:
print(X_train[0])
print(X_train_[0])
print(len(X_train[0]))
print(len(X_train_[0]))
print(np.bincount(y_train))

['determined', 'face', 'needed', 'carry', 'dickensian', 'hero']
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0 1305  188 1282  884
 3645  363]
6
30
[ 4526 17454 50931 21074  5892]


In [105]:
clf = DecisionTree(max_depth=10)
clf.fit(X_train_, y_train)
y_pred = clf.predict(X_val_)
acc = accuracy(y_val, y_pred)
print("Accuracy:", acc)

(99877, 30)
depth :  0
class :  5
samples :  99877
parent samples :  0
(45897, 30)
depth :  1
class :  5
samples :  45897
parent samples :  99877
(22716, 30)
depth :  2
class :  5
samples :  22716
parent samples :  45897
(1419, 30)
depth :  3
class :  5
samples :  1419
parent samples :  22716
(1215, 30)
depth :  4
class :  4
samples :  1215
parent samples :  1419
(1132, 30)
depth :  5
class :  4
samples :  1132
parent samples :  1215
(948, 30)
depth :  6
class :  3
samples :  948
parent samples :  1132
(184, 30)
depth :  6
class :  4
samples :  184
parent samples :  948
--------------------------------
true
--------------------------------
(83, 30)
depth :  5
class :  3
samples :  83
parent samples :  184
--------------------------------
true
--------------------------------
(204, 30)
depth :  4
class :  5
samples :  204
parent samples :  83
--------------------------------
true
--------------------------------
(21297, 30)
depth :  3
class :  5
samples :  21297
parent samples :  204
(4

depth :  2
class :  5
samples :  23181
parent samples :  51
(14105, 30)
depth :  3
class :  5
samples :  14105
parent samples :  23181
(7079, 30)
depth :  4
class :  5
samples :  7079
parent samples :  14105
(146, 30)
depth :  5
class :  5
samples :  146
parent samples :  7079
--------------------------------
true
--------------------------------
(6933, 30)
depth :  5
class :  5
samples :  6933
parent samples :  146
(4452, 30)
depth :  6
class :  5
samples :  4452
parent samples :  6933
(584, 30)
depth :  7
class :  5
samples :  584
parent samples :  4452
(511, 30)
depth :  8
class :  5
samples :  511
parent samples :  584
(347, 30)
depth :  9
class :  4
samples :  347
parent samples :  511
(267, 30)
depth :  10
class :  4
samples :  267
parent samples :  347
--------------------------------
true
--------------------------------
(80, 30)
depth :  10
class :  3
samples :  80
parent samples :  267
--------------------------------
true
--------------------------------
(164, 30)
depth :  9

depth :  3
class :  5
samples :  9076
parent samples :  295
(6247, 30)
depth :  4
class :  5
samples :  6247
parent samples :  9076
(2041, 30)
depth :  5
class :  5
samples :  2041
parent samples :  6247
(465, 30)
depth :  6
class :  5
samples :  465
parent samples :  2041
(11, 30)
depth :  7
class :  2
samples :  11
parent samples :  465
--------------------------------
true
--------------------------------
(454, 30)
depth :  7
class :  5
samples :  454
parent samples :  11
(311, 30)
depth :  8
class :  5
samples :  311
parent samples :  454
(276, 30)
depth :  9
class :  5
samples :  276
parent samples :  311
--------------------------------
true
--------------------------------
(35, 30)
depth :  9
class :  5
samples :  35
parent samples :  276
--------------------------------
true
--------------------------------
(143, 30)
depth :  8
class :  5
samples :  143
parent samples :  35
--------------------------------
true
--------------------------------
(1576, 30)
depth :  6
class :  5
s

depth :  1
class :  5
samples :  53980
parent samples :  287
(30940, 30)
depth :  2
class :  5
samples :  30940
parent samples :  53980
(22757, 30)
depth :  3
class :  5
samples :  22757
parent samples :  30940
(10871, 30)
depth :  4
class :  5
samples :  10871
parent samples :  22757
(2064, 30)
depth :  5
class :  5
samples :  2064
parent samples :  10871
(1734, 30)
depth :  6
class :  5
samples :  1734
parent samples :  2064
(750, 30)
depth :  7
class :  5
samples :  750
parent samples :  1734
(733, 30)
depth :  8
class :  5
samples :  733
parent samples :  750
(72, 30)
depth :  9
class :  5
samples :  72
parent samples :  733
--------------------------------
true
--------------------------------
(661, 30)
depth :  9
class :  5
samples :  661
parent samples :  72
(648, 30)
depth :  10
class :  5
samples :  648
parent samples :  661
--------------------------------
true
--------------------------------
(13, 30)
depth :  10
class :  4
samples :  13
parent samples :  648
---------------

depth :  10
class :  5
samples :  2045
parent samples :  1250
--------------------------------
true
--------------------------------
(2311, 30)
depth :  8
class :  5
samples :  2311
parent samples :  2045
(2294, 30)
depth :  9
class :  5
samples :  2294
parent samples :  2311
(86, 30)
depth :  10
class :  5
samples :  86
parent samples :  2294
--------------------------------
true
--------------------------------
(2208, 30)
depth :  10
class :  5
samples :  2208
parent samples :  86
--------------------------------
true
--------------------------------
(17, 30)
depth :  9
class :  5
samples :  17
parent samples :  2208
--------------------------------
true
--------------------------------
(2503, 30)
depth :  5
class :  5
samples :  2503
parent samples :  17
(844, 30)
depth :  6
class :  5
samples :  844
parent samples :  2503
(614, 30)
depth :  7
class :  5
samples :  614
parent samples :  844
(33, 30)
depth :  8
class :  4
samples :  33
parent samples :  614
--------------------------

depth :  8
class :  5
samples :  1771
parent samples :  1878
(1308, 30)
depth :  9
class :  5
samples :  1308
parent samples :  1771
(1238, 30)
depth :  10
class :  5
samples :  1238
parent samples :  1308
--------------------------------
true
--------------------------------
(70, 30)
depth :  10
class :  3
samples :  70
parent samples :  1238
--------------------------------
true
--------------------------------
(463, 30)
depth :  9
class :  5
samples :  463
parent samples :  70
(402, 30)
depth :  10
class :  5
samples :  402
parent samples :  463
--------------------------------
true
--------------------------------
(61, 30)
depth :  10
class :  5
samples :  61
parent samples :  402
--------------------------------
true
--------------------------------
(107, 30)
depth :  8
class :  4
samples :  107
parent samples :  61
--------------------------------
true
--------------------------------
(572, 30)
depth :  5
class :  5
samples :  572
parent samples :  107
(20, 30)
depth :  6
class :

depth :  6
class :  5
samples :  1462
parent samples :  88
(1335, 30)
depth :  7
class :  5
samples :  1335
parent samples :  1462
(879, 30)
depth :  8
class :  5
samples :  879
parent samples :  1335
(111, 30)
depth :  9
class :  5
samples :  111
parent samples :  879
--------------------------------
true
--------------------------------
(768, 30)
depth :  9
class :  5
samples :  768
parent samples :  111
(727, 30)
depth :  10
class :  5
samples :  727
parent samples :  768
--------------------------------
true
--------------------------------
(41, 30)
depth :  10
class :  4
samples :  41
parent samples :  727
--------------------------------
true
--------------------------------
(456, 30)
depth :  8
class :  5
samples :  456
parent samples :  41
(390, 30)
depth :  9
class :  5
samples :  390
parent samples :  456
(49, 30)
depth :  10
class :  4
samples :  49
parent samples :  390
--------------------------------
true
--------------------------------
(341, 30)
depth :  10
class :  5
s

In [107]:
report(y_pred, y_val)
print(np.bincount(y_pred))
print(np.bincount(y_val))
print(np.unique(y))

Accuracy: 0.5158590308370044
Confusion Matrix:
[[   14   175   782   158     3]
 [   23   445  3542   344    10]
 [   17   408 11839   459    10]
 [   14   365  4307   570    12]
 [    6   117  1070   267    13]]
Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.01      0.02      1132
           1       0.29      0.10      0.15      4364
           2       0.55      0.93      0.69     12733
           3       0.32      0.11      0.16      5268
           4       0.27      0.01      0.02      1473

    accuracy                           0.52     24970
   macro avg       0.32      0.23      0.21     24970
weighted avg       0.42      0.52      0.41     24970

[   74  1510 21540  1798    48]
[ 1132  4364 12733  5268  1473]
[0 1 2 3 4]
