# Dataset: sentiment-analysis-on-movie-reviews

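The raw data can be downloaded from http://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data.
First, let's explore the dataset using pandas. The columns of the original dataset are tab-delimited, and the full dataset contains 156060 instances; this notebook loads a training split stored as Excel files (`X_train.xlsx`, `y_train.xlsx`), which has 124847 rows after rows with missing values are dropped: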

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
# import nltk
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

In [2]:
def read_file(path):
    """Read the Excel file at ``path`` into a DataFrame.

    The first row of the sheet (``header=0``) supplies the column names.
    """
    return pd.read_excel(path, header=0)

In [33]:
def preprocess_data(df):
    """Clean and tokenize every entry of ``df['Phrase']``.

    For each phrase: markup is stripped with BeautifulSoup, every character
    that is not an ASCII letter is replaced by a space, the text is
    lower-cased and tokenized, NLTK English stop words are removed, and the
    remaining tokens are lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    df : object supporting ``df['Phrase']``
        Container whose ``'Phrase'`` entry is an iterable of phrase strings.

    Returns
    -------
    list of list of str
        One list of lemmatized tokens per phrase, in input order.
    """
    # Loop-invariant: build the stop-word set once instead of once per phrase.
    stops = set(stopwords.words('english'))
    reviews = []
    for raw in tqdm(df['Phrase']):
        text = BeautifulSoup(raw, 'lxml').get_text()
        # Keep letters only; digits and punctuation become token separators.
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        lemma_words = [lemmatizer.lemmatize(word) for word in words if word not in stops]
        reviews.append(lemma_words)
    return reviews

In [4]:
def tokenizer_preprocess(list_X_train, list_X_val):
    """Convert tokenized sentences into equal-length integer sequences.

    The vocabulary and the padded length are derived from the training
    sentences only; the validation sentences are encoded with the same
    fitted tokenizer and padded/truncated to the same length.

    Parameters
    ----------
    list_X_train : iterable of list of str
        Tokenized training sentences.
    list_X_val : iterable of list of str
        Tokenized validation sentences.

    Returns
    -------
    (X_train, X_val)
        The two results of ``pad_sequences``, each padded to the length of
        the longest training sentence.
    """
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        len_max = max(len_max, len(sent))

    # Keras' Tokenizer keeps only word indices strictly below ``num_words``
    # and indices start at 1, so ``num_words == vocabulary size`` would
    # silently drop the least frequent word. +1 keeps the whole vocabulary.
    tokenizer = Tokenizer(num_words=len(unique_words) + 1)
    tokenizer.fit_on_texts(list(list_X_train))

    X_train = pad_sequences(tokenizer.texts_to_sequences(list_X_train), maxlen=len_max)
    X_val = pad_sequences(tokenizer.texts_to_sequences(list_X_val), maxlen=len_max)

    return X_train, X_val

In [5]:
def report(predictions, y_test):
    """Print accuracy, confusion matrix and classification report.

    ``y_test`` is passed as the first argument and ``predictions`` as the
    second argument to each scikit-learn metric function.
    """
    accuracy = accuracy_score(y_test, predictions)
    print('Accuracy: %s' % accuracy)

    # Remaining sections share the same "heading, then metric" layout.
    sections = (
        ('Confusion Matrix:', confusion_matrix),
        ('Classification Report:', classification_report),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

In [24]:
# Load features and labels, join them column-wise, then drop rows with any
# missing value and renumber the index from 0.
dfx = read_file('./Dataset2_train/X_train.xlsx')
dfy = read_file('./Dataset2_train/y_train.xlsx')
df = (
    pd.concat([dfx, dfy], axis=1)
    .dropna()
    .reset_index(drop=True)
)

                                              Phrase  Sentiment
0                         going to a house party and          2
1                                    a grand picture          4
2                                lightweight meaning          1
3                                    most unpleasant          1
4  You can see the would-be surprises coming a mi...          1
                                              Phrase  Sentiment
0                         going to a house party and          2
1                                    a grand picture          4
2                                lightweight meaning          1
3                                    most unpleasant          1
4  You can see the would-be surprises coming a mi...          1
                                              Phrase  Sentiment
0                         going to a house party and          2
1                                    a grand picture          4
2                                lightwe

In [7]:
# Preview the first five rows of the combined frame.
print(df.head(n=5))

                                              Phrase  Sentiment
0                         going to a house party and          2
1                                    a grand picture          4
2                                lightweight meaning          1
3                                    most unpleasant          1
4  You can see the would-be surprises coming a mi...          1


In [8]:
# Preview the first ten phrases.
print(df.loc[:, 'Phrase'].head(n=10))


0                           going to a house party and
1                                      a grand picture
2                                  lightweight meaning
3                                      most unpleasant
4    You can see the would-be surprises coming a mi...
5    this too-extreme-for-TV rendition of the notor...
6                    wickedly undramatic central theme
7    ... a fascinating curiosity piece -- fascinati...
8              fallible human beings , not caricatures
9    is so prolonged and boring it is n't even clos...
Name: Phrase, dtype: object


In [25]:
# Summary statistics of the sentiment labels.
print(df.loc[:, 'Sentiment'].describe())

count    124847.000000
mean          2.063582
std           0.893847
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64


In [26]:
# Number of rows per sentiment label.
print(df.loc[:, 'Sentiment'].value_counts())

2    63664
3    26342
1    21818
4     7365
0     5658
Name: Sentiment, dtype: int64


In [27]:
# Share of each sentiment label: per-label count divided by the non-null count.
print(df['Sentiment'].value_counts().div(df['Sentiment'].count()))

2    0.509936
3    0.210994
1    0.174758
4    0.058992
0    0.045319
Name: Sentiment, dtype: float64


In [34]:
# Tokenize/lemmatize the phrases and hold out 20% of the rows for validation.
X, y = df['Phrase'], df['Sentiment']
train_text = preprocess_data(df)
target = df.Sentiment.values
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; stratify=target is kept as before.
X_train, X_val, y_train, y_val = train_test_split(
    train_text, target, test_size=0.2, stratify=target, random_state=2
)

100%|████████████████████████████████████████████████████████████████████████| 124847/124847 [00:39<00:00, 3156.85it/s]


In [29]:
# Encode both splits as padded integer sequences (vocabulary comes from X_train).
X_train_, X_val_ = tokenizer_preprocess(list_X_train=X_train, list_X_val=X_val)

100%|███████████████████████████████████████████████████████████████████████| 99877/99877 [00:00<00:00, 1229555.57it/s]


In [30]:
# Display the padded training and validation arrays.
(X_train_, X_val_)

(array([[   0,    0,    0, ...,    0,    0,  161],
        [   0,    0,    0, ...,   30,  850, 1022],
        [   0,    0,    0, ...,    0,    0,   43],
        ...,
        [   0,    0,    0, ...,  814, 7792,    7],
        [   0,    0,    0, ...,    0, 1132,  153],
        [   0,    0,    0, ..., 2667, 2221,   52]]),
 array([[    0,     0,     0, ...,     0,   592,   470],
        [    0,     0,     0, ...,     2,   737,  2582],
        [    0,     0,     0, ...,     0,     0,  5754],
        ...,
        [    0,     0,     0, ...,   861,   153,   462],
        [    0,     0,     0, ...,     0,    91,  4550],
        [    0,     0,     0, ...,     0, 11470,     9]]))

In [31]:
# Fit a decision tree directly on the padded token-index sequences.
# random_state is fixed so the fitted tree, and the metrics printed below,
# are reproducible from one run to the next.
model = DecisionTreeClassifier(random_state=2)
model.fit(X_train_, y_train)
print(len(X_train_[0]))  # length of one padded sequence
print(len(X_train_))     # number of training rows
predictions = model.predict(X_val_)
report(predictions, y_val)

30
99877
Accuracy: 0.5529034841810172
Confusion Matrix:
[[ 404  396  217   90   25]
 [ 503 1851 1513  405   92]
 [ 264 1747 8867 1638  217]
 [ 111  440 1999 2220  498]
 [  56  101  258  594  464]]
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.36      0.33      1132
           1       0.41      0.42      0.42      4364
           2       0.69      0.70      0.69     12733
           3       0.45      0.42      0.43      5268
           4       0.36      0.32      0.34      1473

    accuracy                           0.55     24970
   macro avg       0.44      0.44      0.44     24970
weighted avg       0.55      0.55      0.55     24970



In [32]:
import numpy as np

# Per-class counts: predicted labels on the first line, true labels on the second.
print(*(np.bincount(labels) for labels in (predictions, y_val)), sep='\n')

[ 1338  4535 12854  4947  1296]
[ 1132  4364 12733  5268  1473]


In [37]:
import nltk
from keras.preprocessing.text import text_to_word_sequence
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Built once and reused: TextPreprocessing is applied row by row, so
# re-creating the stemmer and stop-word set on every call is pure overhead.
_TEXT_STEMMER = PorterStemmer()
_TEXT_STOP_WORDS = None  # filled lazily on the first call


def TextPreprocessing(text):
    """Normalize one phrase into a space-joined string of stemmed tokens.

    Steps: remove single quotes, split into lower-cased words with Keras'
    ``text_to_word_sequence``, drop NLTK English stop words, strip digit
    characters, then Porter-stem every remaining token.

    Parameters
    ----------
    text : object
        The raw phrase; non-string values are converted with ``str``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The processed phrase. An empty string is returned for missing
        input, so that callers which drop empty results also drop missing
        phrases instead of keeping the literal token ``"nan"``.
    """
    global _TEXT_STOP_WORDS

    # str(float('nan')) would yield the word "nan"; treat missing cells as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if _TEXT_STOP_WORDS is None:
        _TEXT_STOP_WORDS = frozenset(stopwords.words('english'))

    # remove single quotes
    text = str(text).replace("'", "")
    # word tokenization using text-to-word-sequence
    tokens = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ")
    # stop word removal, then join the words back into a sentence
    kept_text = ' '.join(tok for tok in tokens if tok not in _TEXT_STOP_WORDS)
    # remove digit characters
    without_digits = ''.join(ch for ch in kept_text if not ch.isdigit())
    # stem each remaining token
    return ' '.join(_TEXT_STEMMER.stem(word) for word in nltk.word_tokenize(without_digits))


# --- Load data -------------------------------------------------------------
dfx = read_file('./Dataset2_train/X_train.xlsx')
dfy = read_file('./Dataset2_train/y_train.xlsx')
# dfx = read_file('./Dataset1_train/train/X_train.xlsx')
# dfy = read_file('./Dataset1_train/train/y_train.xlsx')
train_data = pd.concat([dfx, dfy], axis=1)
# .copy() gives an independent frame, so the column assignments below modify
# train_df itself rather than a slice of train_data (no SettingWithCopyWarning).
train_df = train_data[['Phrase', 'Sentiment']].copy()

# NOTE(review): this rebinds the notebook-level name `df` (a DataFrame in the
# earlier cells) to the Series of raw phrases; nothing below reads it.
df = train_df['Phrase'].copy()

# --- Clean text ------------------------------------------------------------
train_df['Phrase'] = train_df['Phrase'].apply(TextPreprocessing)

# Phrases that became empty after preprocessing are marked missing and dropped.
# The result is assigned back: a chained `inplace=True` replace on a column
# selection does not reliably update the parent frame.
train_df['Phrase'] = train_df['Phrase'].replace('', np.nan)
train_df = train_df.dropna(subset=['Phrase'])

phrase = train_df['Phrase']
sentiment = train_df['Sentiment']

# --- Split -----------------------------------------------------------------
train_x, test_x, train_y, test_y = train_test_split(phrase, sentiment, test_size=0.2, random_state=2)

# Stand-alone TF-IDF fit kept for inspection; the pipeline below fits its own
# TfidfVectorizer and does not use these two objects.
vectorizer = TfidfVectorizer()
tfidf_text = vectorizer.fit_transform(train_x)

# --- Model -----------------------------------------------------------------
# random_state is fixed so the reported accuracies are reproducible.
dt = Pipeline([('tfidf', TfidfVectorizer()),
               ('dt', DecisionTreeClassifier(random_state=2)),
               ])

dt.fit(train_x, train_y)

test_predict = dt.predict(test_x)

# --- Evaluate --------------------------------------------------------------
train_accuracy = round(dt.score(train_x, train_y) * 100)
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the conventional order matches the `report` helper.
test_accuracy = round(accuracy_score(test_y, test_predict) * 100)

print("Decision Tree Train Accuracy Score : {}% ".format(train_accuracy))
print("Decision Tree Test Accuracy Score  : {}% ".format(test_accuracy))
print()

Decision Tree Train Accuracy Score : 88% 
Decision Tree Test Accuracy Score  : 61% 



In [38]:
# Detailed metrics for the TF-IDF + decision-tree pipeline on the held-out split.
report(predictions=test_predict, y_test=test_y)

Accuracy: 0.6119801139808415
Confusion Matrix:
[[ 484  474  116   42    9]
 [ 521 2184 1507  187   13]
 [ 162 1454 9393 1363   86]
 [  37  231 1984 2530  497]
 [  11   29  170  707  550]]
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.43      0.41      1125
           1       0.50      0.50      0.50      4412
           2       0.71      0.75      0.73     12458
           3       0.52      0.48      0.50      5279
           4       0.48      0.37      0.42      1467

    accuracy                           0.61     24741
   macro avg       0.52      0.51      0.51     24741
weighted avg       0.61      0.61      0.61     24741

