In [None]:
# Import the necessary libraries
import pandas as pd
import contractions
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string
from nltk.corpus import stopwords, wordnet
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
!pip install contractions

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the three tab-separated datasets. Column names are taken from the first
# line of each file (header=0). quoting=3 is csv.QUOTE_NONE, so quote
# characters are kept as literal text inside the fields.
df_train_labeled = pd.read_csv("labeledTrainData.tsv", sep="\t", header=0, quoting=3)
df_train_unlabeled = pd.read_csv("unlabeledTrainData.tsv", sep="\t", header=0, quoting=3)
df_test = pd.read_csv("testData.tsv", sep="\t", header=0, quoting=3)

In [None]:
# Write the labeled training frame to a new TSV file (without the index
# column), then re-read that file and preview its first rows.
df_train_labeled.to_csv("df_train_labeled.tsv", index=False, sep="\t")
pd.read_csv("df_train_labeled.tsv", delimiter="\t").head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [None]:
# Show the (rows, columns) dimensions of the labeled training frame
df_train_labeled.shape

(25000, 3)

In [None]:
# Count how many reviews carry each sentiment label (class balance check)
df_train_labeled['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [None]:
# Trim the surrounding quotation marks from each review. The argument is a set
# of characters, so both double quotes and spaces are removed from the two
# ends of the string; characters inside the review are left alone.
df_train_labeled['review'] = df_train_labeled['review'].str.strip(' "')
df_train_labeled.head()

In [None]:
# Expand contractions: split every review on whitespace and run each piece
# through contractions.fix, keeping the results as a list per review.
df_train_labeled['no_contract'] = df_train_labeled['review'].apply(
    lambda text: list(map(contractions.fix, text.split()))
)
df_train_labeled.head()

In [None]:
# Turn each list in 'no_contract' back into a single space-separated string
df_train_labeled['review_str'] = [
    ' '.join(str(piece) for piece in pieces)
    for pieces in df_train_labeled['no_contract']
]
df_train_labeled.head()

In [None]:
# Run NLTK's word tokenizer over each review string so that every individual
# word becomes its own token
df_train_labeled['tokenized'] = df_train_labeled['review_str'].apply(
    lambda text: word_tokenize(text)
)
df_train_labeled.head()

In [None]:
# Lowercase every token of every review
df_train_labeled['lower'] = df_train_labeled['tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens]
)
df_train_labeled.head()

In [None]:
# Remove punctuation tokens from the corpus.
# `punc` is a plain string, so the previous `word not in punc` check was a
# substring test: it only dropped tokens that happen to appear contiguously in
# string.punctuation, and let multi-character punctuation tokens such as
# "``", "''", "..." or "--" through. Stripping all punctuation characters from
# both ends and keeping the token only if something is left drops every token
# made up solely of punctuation. Tokens with at least one other character
# (e.g. "'s", "well-known") are kept unchanged.
punc = string.punctuation
df_train_labeled['no_punc'] = df_train_labeled['lower'].apply(
    lambda x: [word for word in x if word.strip(punc)]
)
df_train_labeled.head()

In [None]:
# Filter out English stop words. The stop-word list is turned into a set once
# so that each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))
df_train_labeled['stopwords_removed'] = df_train_labeled['no_punc'].apply(
    lambda tokens: [token for token in tokens if not (token in stop_words)]
)
df_train_labeled.head()

In [None]:
# Tag every remaining token with its part of speech; each review becomes a
# list of (word, tag) pairs
df_train_labeled['pos_tags'] = df_train_labeled['stopwords_removed'].apply(
    lambda tokens: nltk.tag.pos_tag(tokens)
)
df_train_labeled.head()

In [None]:
# Convert parts of speech tags to wordnet's format
def get_wordnet_pos(tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the first character of `tag` is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which simply misses the table
    return by_initial.get(tag[:1], wordnet.NOUN)

# Re-label every (word, tag) pair with the WordNet POS constant for its tag
df_train_labeled['wordnet_pos'] = df_train_labeled['pos_tags'].apply(
    lambda tagged: [(token, get_wordnet_pos(treebank_tag)) for token, treebank_tag in tagged]
)
df_train_labeled.head()

In [None]:
# Lemmatize each token with NLTK's WordNet lemmatizer, passing along the
# WordNet part of speech computed for it
wnl = WordNetLemmatizer()
df_train_labeled['lemmatized'] = df_train_labeled['wordnet_pos'].apply(
    lambda pairs: [wnl.lemmatize(token, pos) for (token, pos) in pairs]
)
df_train_labeled.head()

In [None]:
# Drop the intermediate preprocessing columns that are no longer needed.
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# previously it was discarded and the columns were never actually removed
# before saving. errors='ignore' keeps the cell safe to re-run after the
# columns are already gone.
df_train_labeled = df_train_labeled.drop(
    columns=[
        'no_contract',
        'review_str',
        'tokenized',
        'lower',
        'no_punc',
        'stopwords_removed',
        'pos_tags',
        'wordnet_pos',
    ],
    errors='ignore',
)
# Preview a few rows instead of dumping the whole frame
df_train_labeled.head()

In [None]:
# Persist the processed frame as a CSV file. The row index is written too
# (pandas' default), so it appears as an extra leading column in the file.
df_train_labeled.to_csv('labeled_review_clean.csv', index=True)