# Dataset: sentiment-analysis-on-movie-reviews

The raw-data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data. </p>
First, let's explore the dataset using pandas. The columns of the dataset are tab-delimited. The dataset contains 156060 instances:

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [2]:
def read_file(path):
    rawdata = pd.read_csv(path, header=0, delimiter=',')
    return rawdata

In [3]:
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = BeautifulSoup(raw, 'lxml').get_text()
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = [word for word in words if not word in stops]
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        reviews.append(lemma_words)
    return reviews

In [4]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [5]:
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [6]:
df = read_file('./Dataset2_train/X_train.csv')

In [7]:
print (df.head(5))

                                              Phrase  Sentiment
0                         going to a house party and          2
1                                    a grand picture          4
2                                lightweight meaning          1
3                                    most unpleasant          1
4  You can see the would-be surprises coming a mi...          1


In [8]:
print(df['Phrase'].head(10))

0                           going to a house party and
1                                      a grand picture
2                                  lightweight meaning
3                                      most unpleasant
4    You can see the would-be surprises coming a mi...
5    this too-extreme-for-TV rendition of the notor...
6                    wickedly undramatic central theme
7    ... a fascinating curiosity piece -- fascinati...
8              fallible human beings , not caricatures
9    is so prolonged and boring it is n't even clos...
Name: Phrase, dtype: object


In [9]:
print(df['Sentiment'].describe())

count    124848.000000
mean          2.063581
std           0.893844
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64


In [10]:
print(df['Sentiment'].value_counts())

2    63665
3    26342
1    21818
4     7365
0     5658
Name: Sentiment, dtype: int64


In [11]:
print(df['Sentiment'].value_counts()/df['Sentiment'].count())

2    0.509940
3    0.210993
1    0.174757
4    0.058992
0    0.045319
Name: Sentiment, dtype: float64


In [12]:
X, y = df['Phrase'], df['Sentiment']
train_text = preprocess_data(df)
print(train_text)
target = df.Sentiment.values
X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

100%|████████████████████████████████████████████████████████████████████████| 124848/124848 [00:40<00:00, 3093.31it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




In [13]:
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        print(sent)
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

X_train_, X_val_ = tokenizer_preprocess(X_train, X_val)  

100%|███████████████████████████████████████████████████████████████████████| 99878/99878 [00:00<00:00, 1114432.97it/s]


In [14]:
X_train_, X_val_

(array([[    0,     0,     0, ...,    35,   373,   102],
        [    0,     0,     0, ...,     0,     0,  3081],
        [    0,     0,     0, ...,   294,   148,    14],
        ...,
        [    0,     0,     0, ...,     0,     0,  8851],
        [    0,     0,     0, ...,   171,   306,   177],
        [    0,     0,     0, ...,     0,     0, 11263]]),
 array([[    0,     0,     0, ...,   672,   111,   179],
        [    0,     0,     0, ...,     0,  6063,   800],
        [    0,     0,     0, ...,     0,     0, 13519],
        ...,
        [    0,     0,     0, ...,     0,   106,   147],
        [    0,     0,     0, ...,    39,   518,  8178],
        [    0,     0,     0, ...,     0,    19,     6]]))

In [15]:
model = DecisionTreeClassifier()
model.fit(X_train_, y_train)
predictions = model.predict(X_val_)
report(predictions, y_val)

Accuracy: 0.5531437725270324
Confusion Matrix:
[[ 411  392  209   86   34]
 [ 488 1810 1611  359   96]
 [ 303 1712 8937 1589  192]
 [ 115  452 2015 2191  495]
 [  48   97  257  608  463]]
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.36      0.33      1132
           1       0.41      0.41      0.41      4364
           2       0.69      0.70      0.69     12733
           3       0.45      0.42      0.43      5268
           4       0.36      0.31      0.34      1473

    accuracy                           0.55     24970
   macro avg       0.44      0.44      0.44     24970
weighted avg       0.55      0.55      0.55     24970

