# Dataset: sentiment-analysis-on-movie-reviews

The raw-data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data. </p>
First, let's explore the dataset using pandas. The columns of the dataset are tab-delimited. The dataset contains 156060 instances:

In [1]:
import os 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [2]:
def read_file(path):
    filename, extension = os.path.splitext(path)

    if extension == '.csv':
        rawdata = pd.read_csv(path, header=0, delimiter=',')
    elif extension == '.xlsx':
        rawdata = pd.read_excel(path, header=0)
    elif extension == '.tsv':
        rawdata = pd.read_csv(path, header=0, delimiter='\t')
    return rawdata

In [3]:
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = BeautifulSoup(raw, 'lxml').get_text()
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = []
        #non_stopwords = [word for word in words if not word in stops]
        for word in words:
            if not word in stops:
                non_stopwords.append(word)
            else:
                pass
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        reviews.append(lemma_words)
#         print("text  = ", text)
#         print("only = ", only_text)
#         print("words = ", words)
#         print("stops = ", stops)
#         print("non stops = ", non_stopwords)
#         print("lemma = ", lemma_words)
#         print("-------------------------------")
    return reviews

In [4]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [5]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [6]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

In [7]:
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=5, split_criterion="entropy"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.split_criterion  = split_criterion 
        self.root = None
        self.classes = None
        self.deepest = 0
    
    def _build_tree(self, X, y, depth=0, parent_samples=0):
        self.n_samples, self.n_features = X.shape
        self.classes = len(np.unique(y))
        if(depth > self.deepest): self.deepest = depth
            
         # randomly select features to consider for split
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
         # find best split based on selected features
        score, best_feat, best_thresh = self._best_split(X, y, rnd_feats)
        
         # Check if reach stop criteria
        if self._is_finished(depth, parent_samples) or score == 0:
            return self._leaf_value(y)
        
        # create children nodes and continue building tree recursively
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1, self.n_samples)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1, self.n_samples)
        return Node(best_feat, best_thresh, left_child, right_child)
    
    def _leaf_value(self, y):
        return Node(value=np.argmax(np.bincount(y)))
    
    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}
        
        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)
                
                if score > split['score']:
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh
    
        return split['score'], split['feat'], split['thresh']
    
    def _is_finished(self, depth, parent_samples):
        """Check if stop to grow or not."""
        if (self.max_depth is not None and depth >= self.max_depth
            or self.classes == 1
            or self.n_samples < self.min_samples_split
            or self.n_samples == parent_samples
            or self.n_samples == 0):
            return True
        return False

    def _create_split(self, X, thresh):
        """Create a split in the data based on a given threshold."""
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()
        return left_idx, right_idx
    
    def _split_criterion(self, y):
        proportions = np.bincount(y) / len(y)
        if(self.split_criterion == 'gini'):
            value = 1 - np.sum([p*p for p in proportions if p > 0])
        else:
            value = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return value
    
    def _information_gain(self, X, y, thresh):
        """Calculate the information gain from splitting on a given feature and threshold."""
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)

        if n_left == 0 or n_right == 0: 
                return 0
        parent_loss = self._split_criterion(y)
        child_loss = (n_left / n) * self._split_criterion(y[left_idx]) + (n_right / n) * self._split_criterion(y[right_idx])
        return parent_loss - child_loss

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def _predict_one(self, x, node):
        if node.is_leaf():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict_one(x, node.left)
        return self._predict_one(x, node.right)

    def predict(self, X):
        return [self._predict_one(x, self.root) for x in X]
    
    def getdeepest(self):
        return self.deepest

In [8]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [9]:
dfx = read_file('./Dataset2_train/X_train.xlsx')
dfy = read_file('./Dataset2_train/y_train.xlsx')
df = pd.concat([dfx, dfy], axis=1)
df = df.dropna()  #drop missing value(phrase)
df = df.reset_index(drop=True)

In [10]:
descriptive_feature = dfx.columns
target_feature = dfy.columns[0]
print(df['Sentiment'].value_counts())
print(df['Sentiment'].value_counts()/df['Sentiment'].count())

2    63664
3    26342
1    21818
4     7365
0     5658
Name: Sentiment, dtype: int64
2    0.509936
3    0.210994
1    0.174758
4    0.058992
0    0.045319
Name: Sentiment, dtype: float64


In [11]:
X, y = dfx.values, dfy.values
train_text = preprocess_data(df)
target = df.Sentiment.values
X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

100%|████████████████████████████████████████████████████████████████████████| 124847/124847 [01:21<00:00, 1536.83it/s]


In [12]:
X_train_, X_val_ = tokenizer_preprocess(X_train, X_val)  

100%|████████████████████████████████████████████████████████████████████████| 99877/99877 [00:00<00:00, 703953.87it/s]


In [13]:
clf = DecisionTree()
clf.fit(X_train_, y_train)
train_pred = clf.predict(X_train_)
train_acc = accuracy(train_pred, y_train)
y_pred = clf.predict(X_val_)
acc = accuracy(y_val, y_pred)
print("Train Accuracy:", train_acc)
report(y_pred, y_val)
print("pred:", np.bincount(y_pred))
print("label:", np.bincount(y_val))

Train Accuracy: 0.8381709502688307
Accuracy: 0.5350820985182219
Confusion Matrix:
[[ 377  403  216  106   30]
 [ 447 1776 1622  411  108]
 [ 323 1868 8804 1563  175]
 [ 146  563 2104 2029  426]
 [  42  130  314  612  375]]
Classification Report:
              precision    recall  f1-score   support

           0       0.28      0.33      0.31      1132
           1       0.37      0.41      0.39      4364
           2       0.67      0.69      0.68     12733
           3       0.43      0.39      0.41      5268
           4       0.34      0.25      0.29      1473

    accuracy                           0.54     24970
   macro avg       0.42      0.41      0.41     24970
weighted avg       0.53      0.54      0.53     24970

pred: [ 1335  4740 13060  4721  1114]
label: [ 1132  4364 12733  5268  1473]


In [14]:
testdf = read_file('./Dataset2_test/X_test.xlsx')
test_data = preprocess_data(testdf)
train, test = tokenizer_preprocess(train_text, test_data)
test_prediction = clf.predict(test)
output = pd.DataFrame (test_prediction)

filepath = '109550178_dataset2_pred.xlsx'
output.columns = ['Sentiment']
print(np.bincount(output['Sentiment']))
output.to_excel(filepath, index=False)

100%|██████████████████████████████████████████████████████████████████████████| 31212/31212 [00:20<00:00, 1505.25it/s]
100%|█████████████████████████████████████████████████████████████████████| 124847/124847 [00:00<00:00, 1002978.93it/s]


[ 1704  5885 16511  5666  1446]
