# Movie Review Sentiment Classification Challenge
# **Author: MICADEE**

**Download already saved datasets from gdrive to colab using "gdown":**

In [None]:
%%capture
!gdown https://drive.google.com/uc?id=1k9Mco0zM4J1KX4yUO0nWXBcffkfVD0wy
!unzip -qq Movies_data.zip

**Install all necessary libraries:**

In [1]:
%%capture
!pip install -qq emoji==1.6.3 --quiet
!pip install -qq transformers --quiet
!pip install -qq matplotlib==3.4 --quiet

## **Note**:
**To avoid errors in this notebook, confirm that the matplotlib version printed below is exactly 3.4; otherwise, restart the runtime and rerun this Colab notebook:**

In [None]:
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))

Great !!! We have matplotlib version exactly 3.4.

Here we go !!!

# STAGE 1:
# DATA CLEANING PROCEDURE

**Import all necessary libraries:**

In [None]:
import nltk
import emoji
import torch
import random
import warnings
import re, string
import numpy as np
import pandas as pd
import pandas as pd
import seaborn as sns
import torch.nn as nn
from tqdm import tqdm
import tensorflow as tf
import sklearn.exceptions
from pylab import rcParams
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn import preprocessing
from transformers import TFBertModel
from transformers import TFRobertaModel
from transformers import BertTokenizerFast
from transformers import RobertaTokenizerFast
from sklearn.naive_bayes import MultinomialNB
from torch.utils.data import DataLoader, Dataset
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split , KFold , StratifiedKFold
from transformers import BertModel, BertTokenizer, AdamW,  get_linear_schedule_with_warmup, set_seed

# Global plot styling.
# No sns.despine() here: it acts on the *current* figure, so calling it before
# anything has been plotted only creates an empty figure. Call it on a specific
# figure/axes right after plotting if spines need to be removed.
sns.set_style("whitegrid")
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

# Silence noisy warning categories so cell outputs stay readable.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

**Set seed for reproducibility:**

In [None]:
def seed_all(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, the CUDA generators are seeded too and cuDNN is
    set to deterministic mode with benchmarking turned off.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    # Nothing GPU-specific to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Single seed value used for this notebook.
seed = 42
seed_all(seed)

**Loading the dataset:**

In [None]:
# Load the training set, the test set and the sample submission file
# (paths are relative to the current working directory).
# NOTE(review): the notebook text mentions decoding with ISO-8859-1/latin-1, but no
# `encoding=` argument is passed here — confirm which encoding the CSVs actually need.
df = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")
sub = pd.read_csv("SampleSubmission.csv")

# Quick sanity check: (rows, columns) of the train and test frames.
print(df.shape, df_test.shape)

NOTE: UTF-8 encoding does not work on the dataset when loading it with the pandas `read_csv` function. This led to the use of the 'ISO-8859-1' (latin-1) encoding. <br>
It will be found later that some special characters, such as apostrophes, are turned into '\x92'; these will be taken care of during the data cleaning process.
              

In [None]:
df['sentiment'].value_counts()

In [None]:
sentiment_map = {'negative':0,'positive':1}
df['sentiment'] = df['sentiment'].map(sentiment_map)

In [None]:
df.head()

In [None]:
df.info()

## Check if there's duplicated tweets?

In [None]:
# Remove rows whose `content` duplicates an earlier row, then show the new shape.
df = df.drop_duplicates(subset='content')
df.shape

Good news, there are few duplicate tweets !

# Tweets Deep Cleaning

In the following, we will perform some data cleaning on the raw text of the tweets.

In [None]:
# Keep only the columns used downstream. `.copy()` gives each frame its own data,
# so adding columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[['review_file', 'content', 'sentiment']].copy()
df_test = df_test[['review_file', 'content']].copy()

# DEFINE CUSTOM FUNCTIONS TO CLEAN THE TEXT OF THE TWEETS:

In [None]:
def strip_emoji(text):
    """Return `text` with everything matched by the `emoji` package's emoji regex removed."""
    emoji_pattern = emoji.get_emoji_regexp()
    return re.sub(emoji_pattern, r"", text)

def strip_all_entities(text):
    """Lowercase `text` and strip line breaks, links, mentions, non-ASCII characters and punctuation.

    Steps, in order:
      1. drop carriage returns, turn newlines into spaces, lowercase;
      2. delete every `@...` / `http(s)://...` token up to the next whitespace;
      3. delete every character outside the ASCII range (e.g. '\\x92');
      4. delete every character listed in `string.punctuation`.

    Whitespace is not collapsed here, so removed tokens may leave repeated spaces.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)  # links and mentions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # anything that is not ASCII
    # Only ASCII is left at this point, so ASCII punctuation is all that needs deleting.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table)

def clean_hashtags(tweet):
    """Remove trailing hashtags and un-hash the ones inside the sentence.

    Hashtags that form the tail of the text (only other hashtags/whitespace after
    them) are deleted, except the literal tag `#hashtag`. For every remaining
    hashtag only the `#` symbol is removed; underscores are also replaced by spaces.

    The patterns are raw strings on purpose: in a normal string literal `\\b` is a
    backspace character rather than a regex word boundary, which silently disabled
    the `#hashtag` exclusion.
    """
    trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_tweet = " ".join(word.strip() for word in re.split(trailing_hashtags, tweet))  # remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet))  # drop '#' / '_' inside the sentence
    return new_tweet2

def filter_chars(a):
    """Blank out every space-separated word of `a` that contains '$' or '&'.

    Offending words are replaced by empty strings rather than dropped, so the
    spaces around them are kept in the result.
    """
    kept = ['' if ('$' in token or '&' in token) else token for token in a.split(' ')]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into a single space.

    A lone whitespace character (including a single tab or newline) is left as is.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", text)

In [None]:
# Run the cleaning pipeline on every training text, innermost call first:
# emoji -> entities -> hashtags -> '$'/'&' words -> repeated whitespace.
texts_new = [
    remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(strip_emoji(t)))))
    for t in df.content
]

In [None]:
# Run the cleaning pipeline on every test text, innermost call first:
# emoji -> entities -> hashtags -> '$'/'&' words -> repeated whitespace.
texts_new_test = [
    remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(strip_emoji(t)))))
    for t in df_test.content
]

Now we can create a new column, for both train and test sets, to host the cleaned version of the tweets' text.

In [None]:
df['text_clean'] = texts_new
df_test['text_clean'] = texts_new_test

In [None]:
df['text_clean'].head()

In [None]:
df_test['text_clean'].head()

In [None]:
df['text_clean'][1:8].values

Moreover, we will also create a column to hold the length of the cleaned text, to check whether the cleaning removed too much of the text, or almost the entire tweet!

In [None]:
# Number of whitespace-separated words in each cleaned training text.
text_len = [len(text.split()) for text in df.text_clean]

In [None]:
df['text_len'] = text_len

In [None]:
# Number of whitespace-separated words in each cleaned test text.
text_len_test = [len(text.split()) for text in df_test.text_clean]

In [None]:
df_test['text_len'] = text_len_test

In [None]:
# Count plot of word counts for the cleaned training texts shorter than 30 words.
short_train = df[df['text_len'] < 30]
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='text_len', data=short_train, palette='mako', ax=ax)
ax.set_title('Training tweets with less than 30 words')
ax.set_yticks([])
ax.bar_label(ax.containers[0])  # write the count on top of each bar
ax.set_ylabel('count')
ax.set_xlabel('')
plt.show()

In [None]:
# Count plot of word counts for the cleaned *test* texts shorter than 30 words.
plt.figure(figsize=(7,5))
ax = sns.countplot(x='text_len', data=df_test[df_test['text_len']<30], palette='mako')
# This figure shows df_test, so the title must say "Test", not "Training".
plt.title('Test tweets with less than 30 words')
plt.yticks([])
ax.bar_label(ax.containers[0])  # write the count on top of each bar
plt.ylabel('count')
plt.xlabel('')
plt.show()

As we can see, a few cleaned tweets in the training set (2, to be precise) contain just ten words: this is a result of the cleaning performed earlier. Some tweets may have contained only mentions, hashtags and links, which have been removed. We will drop the tweets that are empty or contain only a single word after cleaning.

In [None]:
print(f" DF SHAPE: {df.shape}")
print(f" DF TEST SHAPE: {df_test.shape}")

In [None]:
# Keep only texts with more than one word after cleaning. `.copy()` makes the result an
# independent frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[df['text_len'] > 1].copy()

In [None]:
print(f" DF SHAPE: {df.shape}")
print(f" DF TEST SHAPE: {df_test.shape}")

# STAGE 1(A):
# Training data deeper cleaning

Let's perform a further cleaning checking the tokenizer version of the sentences.

First, we import the BERT tokenizer.

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
# Length of the tokenizer encoding of each cleaned training text
# (inputs are truncated to at most 512 tokens).
token_lens = [
    len(tokenizer.encode(txt, max_length=512, truncation=True))
    for txt in df['text_clean'].values
]

max_len = np.max(token_lens)

In [None]:
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

Let's check the long tokenized sentences (with more than 400 tokens):

In [None]:
# Recompute the encoded length of every cleaned training text and print the
# position and content of each one longer than 400 tokens.
token_lens = []

for i, txt in enumerate(df['text_clean'].values):
    n_tokens = len(tokenizer.encode(txt, max_length=512, truncation=True))
    token_lens.append(n_tokens)
    if n_tokens > 400:
        print(f"INDEX: {i}, TEXT: {txt}")

In [None]:
# Attach the token counts as a column and order the rows from longest to shortest,
# then preview the 20 longest.
df = df.assign(token_lens=token_lens).sort_values(by='token_lens', ascending=False)
df.head(20)

In [None]:
# Drop the first 12 rows of the frame (by position) and preview what remains.
# NOTE(review): presumably relies on the descending `token_lens` sort done just before,
# i.e. it removes the 12 longest texts — confirm why 12 was chosen.
# Re-running this cell removes another 12 rows each time.
df = df.iloc[12:]
df.head()

The dataset looks more clean now. We will shuffle it and reset the index.

In [None]:
# Shuffle all rows and rebuild a fresh 0..n-1 index. Passing the notebook seed makes the
# permutation reproducible no matter how much of the global NumPy RNG state was used before.
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# STAGE 1(B):
## Test data deeper cleaning

We will perform the data cleaning based on the tokenized sentences on the test set.

In [None]:
# Length of the tokenizer encoding of each cleaned test text
# (inputs are truncated to at most 512 tokens).
token_lens_test = [
    len(tokenizer.encode(txt, max_length=512, truncation=True))
    for txt in df_test['text_clean'].values
]

# Rebinds `max_len`, which previously held the training-set maximum.
max_len = np.max(token_lens_test)

In [None]:
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

In [None]:
# Recompute the encoded length of every cleaned test text and print the
# position and content of each one longer than 400 tokens.
token_lens_test = []

for i, txt in enumerate(df_test['text_clean'].values):
    n_tokens = len(tokenizer.encode(txt, max_length=512, truncation=True))
    token_lens_test.append(n_tokens)
    if n_tokens > 400:
        print(f"INDEX: {i}, TEXT: {txt}")

In [None]:
df_test['token_lens'] = token_lens_test

In [None]:
df_test.head(10)

In [None]:
df.shape, df_test.shape

Now the data cleaning is completed.

# Let's save the final preprocessed datasets:

In [None]:
df.to_csv("train_preprocessed.csv", index = False)
df_test.to_csv("test_preprocessed.csv", index = False)