# Dataset: sentiment-analysis-on-movie-reviews

The raw-data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data. </p>
First, let's explore the dataset using pandas. The columns of the dataset are tab-delimited. The dataset contains 156060 instances:

In [34]:
import os 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [35]:
def read_file(path):
    filename, extension = os.path.splitext(path)

    if extension == '.csv':
        rawdata = pd.read_csv(path, header=0, delimiter=',')
    elif extension == '.xlsx':
        rawdata = pd.read_excel(path, header=0)
    elif extension == '.tsv':
        rawdata = pd.read_csv(path, header=0, delimiter='\t')
    return rawdata

In [36]:
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = BeautifulSoup(raw, 'lxml').get_text()
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = []
        #non_stopwords = [word for word in words if not word in stops]
        for word in words:
            if not word in stops:
                non_stopwords.append(word)
            else:
                pass
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        reviews.append(lemma_words)
#         print("text  = ", text)
#         print("only = ", only_text)
#         print("words = ", words)
#         print("stops = ", stops)
#         print("non stops = ", non_stopwords)
#         print("lemma = ", lemma_words)
#         print("-------------------------------")
    return reviews

In [37]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [38]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [39]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

In [40]:
class DecisionTree:
    def __init__(self, max_depth=100, min_samples_split=100, method="gini"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.method = method
        self.root = None
        self.classes = None

        
    def _is_finished(self, depth, parent_samples):
        if (depth >= self.max_depth
            or self.classes == 1
            or self.n_samples < self.min_samples_split
            or self.n_samples == parent_samples
            or self.n_samples == 0):
            return True
        return False
    
    def _entropy(self, y):
        proportions = np.bincount(y) / len(y)
        entropy = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return entropy
    
    def _gini_index(self, y):
        proportions = np.bincount(y) / len(y)
        gini = 1 - np.sum([p*p for p in proportions if p > 0])
        return gini
#         size = len(y)
#         instances = [0] * len(np.unique(y))
#         for label in y:
#             instances[int(label)] += 1
#         return 1 - np.sum([(val/size)**2 for val in instances]) if size > 0 else 100
    
    def _create_split(self, X, thresh):
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()
        return left_idx, right_idx

    def _information_gain(self, X, y, thresh):
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)
#         print("left idx: ", left_idx)
#         print("right idx: ", right_idx)
#         print("n = ", n)
#         print("n_left = ", n_left)
#         print("n_right = ", n_right)
        if n_left == 0 or n_right == 0: 
                return 0
        if(self.method == "gini"):
            parent_loss = self._gini_index(y)
            child_loss = (n_left / n) * self._gini_index(y[left_idx]) + (n_right / n) * self._gini_index(y[right_idx])
        else:
            parent_loss = self._entropy(y)
            child_loss = (n_left / n) * self._entropy(y[left_idx]) + (n_right / n) * self._entropy(y[right_idx])
        return parent_loss - child_loss

    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}
        
        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)

                if score > split['score']:
#                     print(score, " > ", split['score'])
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh

        return split['score'], split['feat'], split['thresh']
    
    def _build_tree(self, X, y, depth=0, parent_samples=0):
        #self.n_samples = len(X) 
        #self.n_features = len(X[0])
        self.n_samples, self.n_features = X.shape
        self.classes = len(np.unique(y))
        print(X.shape)
        

        # get best split
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        
        best_score, best_feat, best_thresh = self._best_split(X, y, rnd_feats)
        
        # stopping criteria
        if self._is_finished(depth, parent_samples) or best_score == 0:
            most_common_Label = np.argmax(np.bincount(y))
            return Node(value=most_common_Label)
        # grow children recursively
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1, self.n_samples)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1, self.n_samples)
        return Node(best_feat, best_thresh, left_child, right_child)
    
    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        predictions = [self._traverse_tree(x, self.root) for x in X]
        return np.array(predictions)

In [41]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [42]:
dfx = read_file('./Dataset2_train/X_train.xlsx')
dfy = read_file('./Dataset2_train/y_train.xlsx')
# dfx = read_file('./Dataset1_train/train/X_train.xlsx')
# dfy = read_file('./Dataset1_train/train/y_train.xlsx')
df = pd.concat([dfx, dfy], axis=1)
df = df.dropna()
df = df.reset_index(drop=True)

In [43]:
descriptive_feature = dfx.columns
target_feature = dfy.columns[0]
print(descriptive_feature)
print(target_feature)
print(dfx.values)

Index(['Phrase'], dtype='object')
Sentiment
[['going to a house party and']
 ['a grand picture']
 ['lightweight meaning']
 ...
 ['Indian musical']
 [', you get a lot of running around , screaming and death .']
 ['Irish playwright , poet and drinker']]


In [44]:
X, y = dfx.values, dfy.values
train_text = preprocess_data(df)
target = df.Sentiment.values
X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

100%|████████████████████████████████████████████████████████████████████████| 124847/124847 [00:39<00:00, 3164.91it/s]


In [45]:
X_train_, X_val_ = tokenizer_preprocess(X_train, X_val)  

100%|███████████████████████████████████████████████████████████████████████| 99877/99877 [00:00<00:00, 1174364.20it/s]


In [46]:
print(X_train[0])
print(X_train_[0])
print(len(X_train[0]))
print(len(X_train_[0]))
print(np.bincount(y_train))

['sink']
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 1411]
1
30
[ 4526 17454 50931 21074  5892]


In [51]:
clf = DecisionTree(max_depth=10)
clf.fit(X_train_, y_train)
y_pred = clf.predict(X_val_)
acc = accuracy(y_val, y_pred)
print("Accuracy:", acc)

(99877, 30)
(45978, 30)
(22644, 30)
(1419, 30)
(1392, 30)
(1025, 30)
(944, 30)
(81, 30)
(367, 30)
(195, 30)
(172, 30)
(27, 30)
(21225, 30)
(3881, 30)
(39, 30)
(3842, 30)
(3590, 30)
(2520, 30)
(1705, 30)
(1574, 30)
(564, 30)
(1010, 30)
(131, 30)
(815, 30)
(140, 30)
(675, 30)
(11, 30)
(664, 30)
(1070, 30)
(12, 30)
(1058, 30)
(166, 30)
(892, 30)
(604, 30)
(288, 30)
(252, 30)
(17344, 30)
(5847, 30)
(113, 30)
(5734, 30)
(1542, 30)
(1201, 30)
(1182, 30)
(487, 30)
(695, 30)
(19, 30)
(341, 30)
(178, 30)
(163, 30)
(4192, 30)
(9, 30)
(4183, 30)
(4180, 30)
(107, 30)
(4073, 30)
(3, 30)
(11497, 30)
(8453, 30)
(66, 30)
(8387, 30)
(8143, 30)
(8137, 30)
(5, 30)
(8132, 30)
(6, 30)
(244, 30)
(3044, 30)
(12, 30)
(3032, 30)
(3018, 30)
(3015, 30)
(765, 30)
(2250, 30)
(3, 30)
(14, 30)
(23334, 30)
(16615, 30)
(172, 30)
(16443, 30)
(12441, 30)
(5201, 30)
(664, 30)
(583, 30)
(389, 30)
(291, 30)
(98, 30)
(194, 30)
(81, 30)
(4537, 30)
(2377, 30)
(122, 30)
(2255, 30)
(94, 30)
(2161, 30)
(2160, 30)
(770, 30)
(699,

In [52]:
report(y_pred, y_val)
print(np.bincount(y_pred))
print(np.bincount(y_val))


Accuracy: 0.5151381657989588
Confusion Matrix:
[[    9   147   742   232     2]
 [   11   384  3370   594     5]
 [   11   392 11611   706    13]
 [    8   297  4105   846    12]
 [    3   110   991   356    13]]
Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.01      0.02      1132
           1       0.29      0.09      0.13      4364
           2       0.56      0.91      0.69     12733
           3       0.31      0.16      0.21      5268
           4       0.29      0.01      0.02      1473

    accuracy                           0.52     24970
   macro avg       0.33      0.24      0.21     24970
weighted avg       0.43      0.52      0.42     24970

[   42  1330 20819  2734    45]
[ 1132  4364 12733  5268  1473]


In [53]:
clf = DecisionTree()
clf.fit(X_train_, y_train)
y_pred = clf.predict(X_val_)
acc = accuracy(y_val, y_pred)
print("Accuracy:", acc)

(99877, 30)
(45978, 30)
(22644, 30)
(1419, 30)
(1392, 30)
(1025, 30)
(944, 30)
(81, 30)
(367, 30)
(195, 30)
(172, 30)
(27, 30)
(21225, 30)
(3881, 30)
(39, 30)
(3842, 30)
(3590, 30)
(2520, 30)
(1705, 30)
(1574, 30)
(564, 30)
(539, 30)
(429, 30)
(336, 30)
(254, 30)
(82, 30)
(93, 30)
(110, 30)
(25, 30)
(1010, 30)
(638, 30)
(506, 30)
(104, 30)
(402, 30)
(83, 30)
(319, 30)
(91, 30)
(228, 30)
(132, 30)
(372, 30)
(114, 30)
(258, 30)
(131, 30)
(815, 30)
(140, 30)
(675, 30)
(11, 30)
(664, 30)
(136, 30)
(528, 30)
(302, 30)
(294, 30)
(8, 30)
(226, 30)
(1070, 30)
(12, 30)
(1058, 30)
(166, 30)
(892, 30)
(604, 30)
(461, 30)
(182, 30)
(279, 30)
(143, 30)
(288, 30)
(252, 30)
(17344, 30)
(5847, 30)
(113, 30)
(5734, 30)
(1542, 30)
(1201, 30)
(1182, 30)
(487, 30)
(341, 30)
(257, 30)
(84, 30)
(146, 30)
(695, 30)
(456, 30)
(431, 30)
(344, 30)
(337, 30)
(262, 30)
(75, 30)
(7, 30)
(87, 30)
(25, 30)
(239, 30)
(19, 30)
(341, 30)
(178, 30)
(163, 30)
(4192, 30)
(9, 30)
(4183, 30)
(4180, 30)
(107, 30)
(4073, 30)


(562, 30)
(506, 30)
(440, 30)
(7, 30)
(433, 30)
(63, 30)
(370, 30)
(13, 30)
(357, 30)
(66, 30)
(291, 30)
(66, 30)
(56, 30)
(45, 30)
(15, 30)
(43, 30)
(8, 30)
(1227, 30)
(1061, 30)
(542, 30)
(55, 30)
(487, 30)
(2, 30)
(485, 30)
(470, 30)
(387, 30)
(103, 30)
(284, 30)
(83, 30)
(15, 30)
(519, 30)
(55, 30)
(464, 30)
(439, 30)
(8, 30)
(431, 30)
(2, 30)
(429, 30)
(10, 30)
(419, 30)
(2, 30)
(417, 30)
(160, 30)
(257, 30)
(25, 30)
(166, 30)
(6719, 30)
(4123, 30)
(2580, 30)
(1372, 30)
(1338, 30)
(1028, 30)
(873, 30)
(823, 30)
(816, 30)
(491, 30)
(482, 30)
(167, 30)
(315, 30)
(87, 30)
(228, 30)
(9, 30)
(325, 30)
(35, 30)
(290, 30)
(7, 30)
(50, 30)
(155, 30)
(310, 30)
(5, 30)
(305, 30)
(246, 30)
(59, 30)
(34, 30)
(1208, 30)
(53, 30)
(1155, 30)
(2, 30)
(1153, 30)
(165, 30)
(988, 30)
(982, 30)
(9, 30)
(973, 30)
(730, 30)
(633, 30)
(482, 30)
(133, 30)
(349, 30)
(164, 30)
(185, 30)
(151, 30)
(97, 30)
(243, 30)
(6, 30)
(1543, 30)
(1034, 30)
(1028, 30)
(7, 30)
(1021, 30)
(396, 30)
(352, 30)
(331, 30)
(6

(7089, 30)
(4737, 30)
(3955, 30)
(2007, 30)
(1673, 30)
(1586, 30)
(997, 30)
(990, 30)
(137, 30)
(853, 30)
(319, 30)
(239, 30)
(80, 30)
(534, 30)
(444, 30)
(24, 30)
(420, 30)
(19, 30)
(401, 30)
(24, 30)
(377, 30)
(4, 30)
(373, 30)
(367, 30)
(254, 30)
(113, 30)
(6, 30)
(90, 30)
(7, 30)
(589, 30)
(584, 30)
(438, 30)
(56, 30)
(382, 30)
(296, 30)
(86, 30)
(146, 30)
(5, 30)
(87, 30)
(334, 30)
(71, 30)
(263, 30)
(1948, 30)
(1336, 30)
(1332, 30)
(1282, 30)
(85, 30)
(1197, 30)
(1191, 30)
(559, 30)
(551, 30)
(82, 30)
(469, 30)
(455, 30)
(427, 30)
(325, 30)
(61, 30)
(264, 30)
(102, 30)
(28, 30)
(14, 30)
(8, 30)
(632, 30)
(30, 30)
(602, 30)
(598, 30)
(6, 30)
(592, 30)
(39, 30)
(553, 30)
(455, 30)
(423, 30)
(54, 30)
(369, 30)
(213, 30)
(156, 30)
(32, 30)
(98, 30)
(4, 30)
(6, 30)
(50, 30)
(4, 30)
(612, 30)
(589, 30)
(29, 30)
(560, 30)
(86, 30)
(474, 30)
(353, 30)
(184, 30)
(169, 30)
(121, 30)
(23, 30)
(782, 30)
(217, 30)
(565, 30)
(141, 30)
(424, 30)
(364, 30)
(47, 30)
(317, 30)
(164, 30)
(153, 30)


In [50]:
# vectorizer = TfidfVectorizer()
# tfidf_text = vectorizer.fit_transform(train_x)


# vectorizer.fit(train_x)
# inputs = vectorizer.transform(train_x)

# dt = DecisionTree(max_depth=10)
# dt.fit(inputs, train_y)

# test_predict = dt.predict(test_x)
# report(test_predict, test_y)
# print(np.bincount(test_predict))
# print(np.bincount(test_y))
# # train_accuracy = round(dt.score(train_x, train_y)*100)
# # test_accuracy =round(accuracy_score(test_predict, test_y)*100)

# # print("Decision Tree Train Accuracy Score : {}% ".format(train_accuracy ))
# # print("Decision Tree Test Accuracy Score  : {}% ".format(test_accuracy ))
# # print()