# Dataset: sentiment-analysis-on-movie-reviews

The raw data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data.

First, let's explore the dataset using pandas. The columns of the dataset are tab-delimited. The dataset contains 156060 instances:

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [3]:
def read_file(path):
    """Load a tab-separated file into a DataFrame.

    The first row of the file is used as the column header.

    Parameters
    ----------
    path : str or file-like
        Location of the tab-delimited file, passed straight to
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(path, delimiter='\t', header=0)

In [4]:
def preprocess_data(df):
    """Clean, tokenize and lemmatize every phrase in ``df['Phrase']``.

    Each phrase has HTML markup removed, every character other than an ASCII
    letter replaced by a space, and is lower-cased and tokenized. English stop
    words are dropped and the remaining tokens are lemmatized.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Phrase`` column; each value is handed to BeautifulSoup.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per row, in row order. A phrase that
        contains only stop words or non-letters yields an empty list.
    """
    # The stop-word set does not depend on the phrase, so build it once
    # instead of once per row.
    stops = set(stopwords.words('english'))
    reviews = []
    for raw in tqdm(df['Phrase']):
        # Strip any HTML markup before filtering characters.
        text = BeautifulSoup(raw, 'lxml').get_text()
        # Digits, punctuation and non-ASCII characters all become spaces.
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        # Filter stop words first, then lemmatize what is left.
        lemma_words = [lemmatizer.lemmatize(word) for word in words if word not in stops]
        reviews.append(lemma_words)
    return reviews

In [5]:
def tokenizer_preprocess(list_X_train, list_X_val):
    """Encode tokenized phrases as equal-length integer sequences.

    The vocabulary and the padded length are derived from the training
    phrases only. The validation phrases are encoded with the same fitted
    tokenizer and padded to the same length.

    Parameters
    ----------
    list_X_train : list of list of str
        Tokenized training phrases; used to fit the tokenizer.
    list_X_val : list of list of str
        Tokenized validation phrases.

    Returns
    -------
    tuple
        ``(X_train, X_val)``, the outputs of ``pad_sequences`` with ``maxlen``
        set to the length of the longest training phrase.
    """
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        len_max = max(len_max, len(sent))

    # NOTE(review): per the Keras docs only word indices below num_words are
    # kept, so with num_words == vocabulary size the least frequent training
    # token is dropped. Left unchanged because code elsewhere may size itself
    # on this index range -- confirm before switching to len(unique_words) + 1.
    tokenizer = Tokenizer(num_words=len(unique_words))
    tokenizer.fit_on_texts(list(list_X_train))

    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = sequence.pad_sequences(X_train, maxlen=len_max)

    # Validation data reuses the tokenizer fitted on the training phrases.
    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = sequence.pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [6]:
def report(predictions, y_test):
    """Print accuracy, confusion matrix and classification report.

    Parameters
    ----------
    predictions : array-like
        Labels predicted by the model.
    y_test : array-like
        True labels; passed as the first argument to every metric.
    """
    accuracy = accuracy_score(y_test, predictions)
    print('Accuracy: %s' % accuracy)

    # Each remaining section is a heading followed by the metric's output.
    sections = (
        ('Confusion Matrix:', confusion_matrix),
        ('Classification Report:', classification_report),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

In [7]:
# Load the training data. The path is relative to the working directory, so
# train.tsv must be present under ./dataset/ before this cell is run.
df = read_file('./dataset/train.tsv')

FileNotFoundError: [Errno 2] No such file or directory: './dataset/train.tsv'

In [None]:
# Show the first five rows to check the parsed columns.
print (df.head(5))

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  


In [None]:
# Show the first ten entries of the Phrase column.
print(df['Phrase'].head(10))

0    A series of escapades demonstrating the adage ...
1    A series of escapades demonstrating the adage ...
2                                             A series
3                                                    A
4                                               series
5    of escapades demonstrating the adage that what...
6                                                   of
7    escapades demonstrating the adage that what is...
8                                            escapades
9    demonstrating the adage that what is good for ...
Name: Phrase, dtype: object


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Sentiment labels.
print(df['Sentiment'].describe())

count    156060.000000
mean          2.063578
std           0.893832
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64


In [None]:
# Number of rows per Sentiment label, most frequent first.
print(df['Sentiment'].value_counts())

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64


In [None]:
# Share of each Sentiment label: per-label counts divided by the number of non-null labels.
print(df['Sentiment'].value_counts()/df['Sentiment'].count())

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64


In [None]:
# Raw text and label columns; X and y are not used by the cells shown here.
X, y = df['Phrase'], df['Sentiment']
# Cleaned, lemmatized token lists, one per row of df.
train_text = preprocess_data(df)
target = df.Sentiment.values
# 80/20 split that preserves the label proportions (stratify=target).
# random_state is fixed so that re-running the notebook yields the same split.
X_train, X_val, y_train, y_val = train_test_split(
    train_text, target, test_size=0.2, stratify=target, random_state=42
)

100%|████████████████████████████████████████████████████████████████████████| 156060/156060 [01:19<00:00, 1973.55it/s]


In [None]:
# Encode both splits as padded integer sequences; the tokenizer is fitted on X_train only.
X_train_, X_val_ = tokenizer_preprocess(X_train, X_val)  

100%|█████████████████████████████████████████████████████████████████████| 124848/124848 [00:00<00:00, 1150116.77it/s]


In [None]:
# Display the encoded training and validation arrays.
X_train_, X_val_

(array([[   0,    0,    0, ...,    0, 2755, 1373],
        [   0,    0,    0, ...,    4,  116,    2],
        [   0,    0,    0, ...,    0,  189,  496],
        ...,
        [   0,    0,    0, ...,  501,    6, 1105],
        [   0,    0,    0, ...,  102,  542,  322],
        [   0,    0,    0, ...,  308, 1482, 1446]]),
 array([[    0,     0,     0, ...,     0,     0,    75],
        [    0,     0,     0, ..., 10764,   175,    68],
        [    0,     0,     0, ...,     0,   300,   290],
        ...,
        [    0,     0,     0, ...,   239,  1923,     3],
        [    0,     0,     0, ...,  3142,   320,   529],
        [    0,     0,     0, ...,     0,     0,   381]]))

In [None]:
# Fit a decision tree on the padded token-id sequences and score it on the
# validation split. random_state is fixed so that a re-run reproduces the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_, y_train)
# Predict with the estimator fitted above; the previous `dt` name is not
# defined anywhere in the cells shown here, so it would raise NameError.
predictions = model.predict(X_val_)
report(predictions, y_val)