# Dataset: sentiment-analysis-on-movie-reviews

The raw-data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data. </p>
First, let's explore the dataset using pandas. The columns of the dataset are tab-delimited. The dataset contains 156060 instances:

In [29]:
import os 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [30]:
def read_file(path):
    filename, extension = os.path.splitext(path)

    if extension == '.csv':
        rawdata = pd.read_csv(path, header=0, delimiter=',')
    elif extension == '.xlsx':
        rawdata = pd.read_excel(path, header=0)
    elif extension == '.tsv':
        rawdata = pd.read_csv(path, header=0, delimiter='\t')
    return rawdata

In [31]:
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = BeautifulSoup(raw, 'lxml').get_text()
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = []
        #non_stopwords = [word for word in words if not word in stops]
        for word in words:
            if not word in stops:
                non_stopwords.append(word)
            else:
                pass
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        reviews.append(lemma_words)
#         print("text  = ", text)
#         print("only = ", only_text)
#         print("words = ", words)
#         print("stops = ", stops)
#         print("non stops = ", non_stopwords)
#         print("lemma = ", lemma_words)
#         print("-------------------------------")
    return reviews

In [32]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [33]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [34]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

In [35]:
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=5, method="entropy"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.method = method
        self.root = None
        self.classes = None

    def _is_finished(self, depth, parent_samples):
        """Check if stop to grow or not."""
        if (self.max_depth is not None and depth >= self.max_depth
            or self.classes == 1
            or self.n_samples < self.min_samples_split
            or self.n_samples == parent_samples
            or self.n_samples == 0):
            return True
        return False

    def _entropy(self, y):
        """Calculate the entropy of a given set of labels."""
        proportions = np.bincount(y) / len(y)
        entropy = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return entropy

    def _gini_index(self, y):
        """Calculate the Gini index of a given set of labels."""
        proportions = np.bincount(y) / len(y)
        gini = 1 - np.sum([p*p for p in proportions if p > 0])
        return gini

    def _create_split(self, X, thresh):
        """Create a split in the data based on a given threshold."""
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()
        return left_idx, right_idx

    def _information_gain(self, X, y, thresh):
        """Calculate the information gain from splitting on a given feature and threshold."""
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)

        if n_left == 0 or n_right == 0: 
                return 0
        if self.method == "gini":
            parent_loss = self._gini_index(y)
            child_loss = (n_left / n) * self._gini_index(y[left_idx]) + (n_right / n) * self._gini_index(y[right_idx])
        else:
            parent_loss = self._entropy(y)
            child_loss = (n_left / n) * self._entropy(y[left_idx]) + (n_right / n) * self._entropy(y[right_idx])
        return parent_loss - child_loss

    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}
        
        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)
                
                if score > split['score']:
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh
    
        return split['score'], split['feat'], split['thresh']
    def _build_tree(self, X, y, depth=0, parent_samples=0):
        self.n_samples, self.n_features = X.shape
        self.classes = len(np.unique(y))
        
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        score, best_feat, best_thresh = self._best_split(X, y, rnd_feats)

        if self._is_finished(depth, parent_samples) or score == 0:
            most_common_Label = np.argmax(np.bincount(y))
            return Node(value=most_common_Label)
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1, self.n_samples)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1, self.n_samples)
        return Node(best_feat, best_thresh, left_child, right_child)

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def _predict_one(self, x, node):
        if node.is_leaf():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict_one(x, node.left)
        return self._predict_one(x, node.right)

    def predict(self, X):
        return [self._predict_one(x, self.root) for x in X]

In [36]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [37]:
dfx = read_file('./Dataset2_train/X_train.xlsx')
dfy = read_file('./Dataset2_train/y_train.xlsx')
# dfx = read_file('./Dataset1_train/train/X_train.xlsx')
# dfy = read_file('./Dataset1_train/train/y_train.xlsx')
df = pd.concat([dfx, dfy], axis=1)
df = df.dropna()  #drop missing value(phrase)
df = df.reset_index(drop=True)

In [38]:
descriptive_feature = dfx.columns
target_feature = dfy.columns[0]
print(df['Sentiment'].value_counts())
print(df['Sentiment'].value_counts()/df['Sentiment'].count())

2    63664
3    26342
1    21818
4     7365
0     5658
Name: Sentiment, dtype: int64
2    0.509936
3    0.210994
1    0.174758
4    0.058992
0    0.045319
Name: Sentiment, dtype: float64


In [39]:
X, y = dfx.values, dfy.values
train_text = preprocess_data(df)
target = df.Sentiment.values
X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

100%|████████████████████████████████████████████████████████████████████████| 124847/124847 [00:38<00:00, 3239.47it/s]


In [40]:
X_train_, X_val_ = tokenizer_preprocess(X_train, X_val)  

100%|███████████████████████████████████████████████████████████████████████| 99877/99877 [00:00<00:00, 1203345.06it/s]


In [41]:
print(X_train[0])
print(X_train_[0])
print(len(X_train[0]))
print(len(X_train_[0]))
print(len(X_train))
print(len(y_train))
print(y_train)
print(np.bincount(y_train))

['often', 'boring']
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0  90 373]
2
30
99877
99877
[1 2 3 ... 2 2 2]
[ 4526 17454 50931 21074  5892]


In [42]:
clf = DecisionTree()
clf.fit(X_train_, y_train)
y_pred = clf.predict(X_val_)
acc = accuracy(y_val, y_pred)
print("Accuracy:", acc)

Accuracy: 0.5369243091710052


In [43]:
report(y_pred, y_val)
print(np.bincount(y_pred))
print(np.bincount(y_val))

Accuracy: 0.5369243091710052
Confusion Matrix:
[[ 362  413  227   97   33]
 [ 499 1785 1578  426   76]
 [ 345 1878 8770 1519  221]
 [ 163  575 2013 2103  414]
 [  63  141  297  585  387]]
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.32      0.28      1132
           1       0.37      0.41      0.39      4364
           2       0.68      0.69      0.68     12733
           3       0.44      0.40      0.42      5268
           4       0.34      0.26      0.30      1473

    accuracy                           0.54     24970
   macro avg       0.42      0.42      0.41     24970
weighted avg       0.54      0.54      0.54     24970

[ 1432  4792 12885  4730  1131]
[ 1132  4364 12733  5268  1473]


In [44]:
print(type(X_train))
print(type(y_train))
clf = DecisionTree(max_depth=100)
clf.fit(X_train_, y_train)
y_pred = clf.predict(X_val_)
acc = accuracy(y_val, y_pred)
print("Accuracy:", acc)

<class 'list'>
<class 'numpy.ndarray'>
Accuracy: 0.5372446936323588


In [45]:
report(y_pred, y_val)
print(np.bincount(y_pred))
print(np.bincount(y_val))

Accuracy: 0.5372446936323588
Confusion Matrix:
[[ 360  419  225   96   32]
 [ 496 1789 1583  420   76]
 [ 347 1875 8771 1523  217]
 [ 148  574 1998 2114  434]
 [  59  145  294  594  381]]
Classification Report:
              precision    recall  f1-score   support

           0       0.26      0.32      0.28      1132
           1       0.37      0.41      0.39      4364
           2       0.68      0.69      0.69     12733
           3       0.45      0.40      0.42      5268
           4       0.33      0.26      0.29      1473

    accuracy                           0.54     24970
   macro avg       0.42      0.42      0.41     24970
weighted avg       0.54      0.54      0.54     24970

[ 1410  4802 12871  4747  1140]
[ 1132  4364 12733  5268  1473]


In [46]:
testdf = read_file('./Dataset2_test/X_test.xlsx')
test_data = preprocess_data(testdf)
train, test = tokenizer_preprocess(train_text, test_data)
clf = DecisionTree()
clf.fit(train, target)
test_prediction = clf.predict(test)

output = pd.DataFrame (test_prediction)

filepath = 'dataset2_pred.xlsx'
output.columns = ['Sentiment']
print(np.bincount(output['Sentiment']))
output.to_excel(filepath, index=False)

100%|██████████████████████████████████████████████████████████████████████████| 31212/31212 [00:09<00:00, 3271.07it/s]
100%|█████████████████████████████████████████████████████████████████████| 124847/124847 [00:00<00:00, 2061112.62it/s]


[ 1730  5945 16165  5888  1484]
