In [25]:
import pandas as pd
import numpy as np
import nltk
import contractions
import re
from sklearn.metrics import classification_report, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Visualization

# Data Preprocessing

Preprocessing includes:

1. Converting to lower case (already in lowercase)
2. Removing punctuations (check out below)
3. Removing Numbers (check out below)
4. Removing whitespaces (no extra whitespaces)
5. Removing stopwords (check out below)
6. Using Lemmatizer (check out below)

In [26]:
with open('rt-polaritydata/rt-polarity.neg', 'r', encoding='ISO-8859-1') as f:
    neg_reviews = f.readlines()

with open('rt-polaritydata/rt-polarity.pos', 'r', encoding='ISO-8859-1') as f:
    pos_reviews = f.readlines()

neg_reviews = pd.DataFrame(neg_reviews, columns=['text'])
pos_reviews = pd.DataFrame(pos_reviews, columns=['text'])
neg_reviews['label'] = 0
pos_reviews['label'] = 1
reviews = pd.concat([neg_reviews, pos_reviews], ignore_index=True)

In [27]:
reviews.head()

Unnamed: 0,text,label
0,"simplistic , silly and tedious . \n",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [28]:
print(reviews.shape)

(10662, 2)


In [29]:
reviews.isna().sum()

text     0
label    0
dtype: int64

## Tokenization and lemmatization

In [30]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
stop_words.remove('not')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/btech/2021/shreya.malik21b/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/btech/2021/shreya.malik21b/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/btech/2021/shreya.malik21b/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
def expand_contractions(text):
    return contractions.fix(text)

In [32]:
import re
def pre_processing(rev):
    rev = expand_contractions(rev)
    rev = re.sub(r'[^\w\s]', '', rev)  # Remove punctuation
    rev = re.sub(r'\d+', '', rev)      # Remove numbers
    token = word_tokenize(rev)
    filtered_token = [word for word in token if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    filtered_token_after_lemm = [lemmatizer.lemmatize(word) for word in filtered_token]
    return ' '.join(filtered_token_after_lemm)

In [33]:
print("before preprocessing:\t"+reviews['text'][2])

before preprocessing:	exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 



In [34]:
print("after preprocessing:\t"+pre_processing(reviews['text'][2]))

after preprocessing:	exploitative largely devoid depth sophistication would make watching graphic treatment crime bearable


In [35]:
reviews['preprocessed_review'] = reviews['text'].apply(pre_processing)

In [36]:
reviews.head()

Unnamed: 0,text,label,preprocessed_review
0,"simplistic , silly and tedious . \n",0,simplistic silly tedious
1,"it's so laddish and juvenile , only teenage bo...",0,laddish juvenile teenage boy could possibly fi...
2,exploitative and largely devoid of the depth o...,0,exploitative largely devoid depth sophisticati...
3,[garbus] discards the potential for pathologic...,0,garbus discard potential pathological study ex...
4,a visually flashy but narratively opaque and e...,0,visually flashy narratively opaque emotionally...


In [37]:
pos_reviews['preprocessed_review'] = pos_reviews['text'].apply(pre_processing)
neg_reviews['preprocessed_review'] = neg_reviews['text'].apply(pre_processing)

In [38]:
train_pos = pos_reviews.iloc[:4000]
train_neg = neg_reviews.iloc[:4000]
train_data = pd.concat([train_pos['preprocessed_review'], train_neg['preprocessed_review']], ignore_index=True)
train_labels = pd.concat([train_pos['label'], train_neg['label']], ignore_index=True)

val_pos = pos_reviews.iloc[4000:4500]
val_neg = neg_reviews.iloc[4000:4500]
val_data = pd.concat([val_pos['preprocessed_review'], val_neg['preprocessed_review']], ignore_index=True)
val_labels = pd.concat([val_pos['label'], val_neg['label']], ignore_index=True)

test_pos = pos_reviews.iloc[4500:]
test_neg = neg_reviews.iloc[4500:]
test_data = pd.concat([test_pos['preprocessed_review'], test_neg['preprocessed_review']], ignore_index=True)
test_labels = pd.concat([test_pos['label'], test_neg['label']], ignore_index=True)

In [39]:
X_train, y_train = train_data, train_labels
X_val, y_val = val_data, val_labels
X_test, y_test = test_data, test_labels

In [40]:
X_train.tail()

7995                       nearly hour film way indulgent
7996    gorgeous look insufferably tedious turgid curi...
7997              look much like cartoon end simpson ever
7998    tighter editorial process firmer direction mat...
7999                                  not really add much
Name: preprocessed_review, dtype: object

Please note that following this point, I have tried 2 vector representations, Word2Vec and Glove either can be used, but make sure you dont run all the cells and only run the one you want to use

In [41]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = max(len(seq) for seq in X_train_seq)
avg_len = sum(len(seq) for seq in X_train_seq) / len(X_train_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [42]:
avg_len

10.566625

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [44]:
X_train, y_train =X_train_padded, train_labels
X_val, y_val = X_val_padded, val_labels
X_test, y_test = X_test_padded, test_labels

In [45]:
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(train_data)  
X_val_tfidf = tfidf.transform(val_data)          
X_test_tfidf = tfidf.transform(test_data) 

In [46]:
lr_model = LogisticRegression(max_iter=5000)
lr_model.fit(X_train_tfidf, train_labels)

In [47]:
y_val_pred = lr_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(val_labels, y_val_pred)  
print(f'Validation Accuracy: {val_accuracy:.4f}')

Validation Accuracy: 0.7710


In [48]:
y_test_pred = lr_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(test_labels, y_test_pred)  
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.7533


In [49]:
f1_test = f1_score(test_labels, y_test_pred)
print(f'Test F1 Score: {f1_test:.4f}')

print("\nClassification Report:")
print(classification_report(test_labels, y_test_pred, target_names=["Negative", "Positive"]))

Test F1 Score: 0.7500

Classification Report:
              precision    recall  f1-score   support

    Negative       0.75      0.77      0.76       831
    Positive       0.76      0.74      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

