## Natural Language Processing for Sentiment Analysis using the Amazon Reviews Dataset

In [None]:

# For Numeric and Data handling -
# 
# PanDas
# NumPy
# bz2
# 
# For environmental uses -
# os
# 
# For string processing -
# re
# 
# For Preprocessing -
# 
# Tensorflow Preprocessing
# Tokenizer
# t2w sequence
# pad sequences
# 
# For Modelling -
# Tensorflow
# 
# For metrics -
# Confusion matrix
# F1 Score
# ROC AUC Score
# Accuracy Scores
# 
# For Plotting -
# Matplotlib
# Seaborn

# ## 1. Import Library 

In [None]:
# Basic Libraries 
import pandas as pd
import numpy as np
from tqdm import tqdm
import bz2
import zipfile
import os

# NLTK libraries
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm.auto import tqdm

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

# Metric Libraries 
from sklearn.feature_extraction.text import CountVectorizer

# ML
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
#!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

stop_words = stopwords.words('english')
#from gensim import corpora as corpora
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

from tensorflow.keras.layers import Dense,LSTM,SpatialDropout1D,Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
import pickle

## 2. Import dataset

In [None]:
import os
import zipfile
import bz2

# Step 1: open the outer .zip archive and extract every member into the
# current working directory.
zip_file_path = '/Users/szuyingpan/Desktop/NLP/CW1/train.ft.txt.bz.zip'

with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path='.')

In [None]:

zip_file_path = '/Users/szuyingpan/Desktop/NLP/CW1/test.ft.txt.bz.zip'  # Make sure this path is correct

# Report whether the archive is present at that location.
archive_found = os.path.exists(zip_file_path)
if not archive_found:
    print("The file does not exist. Please check the path.")
else:
    print("The file exists.")

In [None]:
def read_text_file(file_path):
    """Return every line of a UTF-8 text file as a list of strings.

    Each entry keeps its line terminator, exactly as ``readlines()`` yields it.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.readlines()

# Load both decompressed files fully into memory, one list entry per line.
train_data, test_data = map(read_text_file, ('train.ft.txt', 'test.ft.txt'))

train_count, test_count = len(train_data), len(test_data)
print(f"Number of training samples: {train_count}")
print(f"Number of test samples: {test_count}")

In [None]:
def read_and_preview_file(file_path, num_lines=5):
    """Print the leading lines of a UTF-8 text file.

    At most ``num_lines`` lines are printed, each with its surrounding
    whitespace (including the trailing newline) stripped.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for position, raw_line in enumerate(handle):
            if position < num_lines:
                print(raw_line.strip())
            else:
                # Stop reading as soon as the requested number of lines is shown.
                break

# Show the first few lines of the train file, then of the test file.
for preview_path in ('train.ft.txt', 'test.ft.txt'):
    read_and_preview_file(preview_path)

In [None]:
data = []

# Read the archived file line by line and collect one record per review.
# The `with` block guarantees the bz2 handle is closed, even if parsing
# fails part-way through the file.
with bz2.open("train.ft.txt.bz", "rt", encoding="utf8") as train_file:
    for line in train_file:
        # label 1 is negative and label 2 is positive
        label = 1 if line.startswith("__label__1") else 2
        # Drop the first 10 characters (the length of "__label__1") and trim
        # leading/trailing whitespace to obtain the review text.
        text = line[10:].strip()
        data.append({"label": label, "text": text})

train_df = pd.DataFrame(data)
train_df = train_df.rename(columns={"text": "review"})
train_df.head(20)

In [None]:
data2 = []

# Read the archived test file line by line and collect one record per review.
# The `with` block guarantees the bz2 handle is closed, even if parsing
# fails part-way through the file.
with bz2.open("test.ft.txt.bz", "rt", encoding="utf8") as test_file:
    for line in test_file:
        # label 1 is negative and label 2 is positive
        label = 1 if line.startswith("__label__1") else 2
        # Drop the first 10 characters (the length of "__label__1") and trim
        # leading/trailing whitespace to obtain the review text.
        text = line[10:].strip()
        data2.append({"label": label, "text": text})

# Build the test frame from the records read in this cell (`data2`), not from
# the training records held in `data`.
test_df = pd.DataFrame(data2)
test_df = test_df.rename(columns={"text": "review"})
test_df.head(20)

In [None]:
# Print the (rows, columns) shape of the training frame.
print("the shape of the data", train_df.shape)

In [None]:
# List the distinct values found in the 'label' column.
print("unique labels", train_df["label"].unique())

In [None]:
# Number of rows per label, ordered by label value (sort_index) rather than by frequency.
valueCounts = train_df["label"].value_counts().sort_index()
print(valueCounts)

## 3. Structural cleaning 

In [None]:
# Structural cleaning in one chained pass: drop exact duplicate rows, then
# drop rows whose 'review' text (the main column) is missing.
train_df = train_df.drop_duplicates().dropna(subset=['review'])
print(train_df.head(10))

In [None]:
# Print the shape again so it can be compared with the shape before cleaning.
print("the shape of the data", train_df.shape)

# Author's observation: there are no missing or duplicate rows.

## 4. Text Preprocessing 

In [None]:
# Number of reviews per label (good vs. bad)
count = train_df['label'].value_counts()
print('Total Counts of both sets', count)

print("==============")
#Creating a function to plot the counts using matplotlib
def plot_counts(count_good,count_bad):
    """Draw a two-bar chart comparing positive and negative review counts.

    Args:
        count_good: height of the green 'Positive Reviews' bar (drawn at x=0).
        count_bad: height of the red 'Negative Reviews' bar (drawn at x=2).
    """
    # Sets the default figure size globally through rcParams.
    plt.rcParams['figure.figsize']=(6,6)
    bar_specs = (
        (0, count_good, 'Positive Reviews', 'Green'),
        (2, count_bad, 'Negative Reviews', 'Red'),
    )
    for x_position, height, bar_label, bar_color in bar_specs:
        plt.bar(x_position, height, width=0.6, label=bar_label, color=bar_color)
        plt.legend()
    plt.ylabel('Count of Reviews')
    plt.xlabel('Types of Reviews')
    plt.show()
    
# Despite their names, count_good / count_bad hold the filtered review rows
# (DataFrames); only their lengths are passed to the bar chart.
is_positive = train_df['label'] == 2
is_negative = train_df['label'] == 1
count_good = train_df.loc[is_positive]
count_bad = train_df.loc[is_negative]
plot_counts(len(count_good), len(count_bad))

# We can see that two classes are equal in the training set. 

In [None]:
# Review texts only, split by label (2 = positive, 1 = negative).
good_reviews = train_df.loc[train_df['label'] == 2, 'review']
bad_reviews = train_df.loc[train_df['label'] == 1, 'review']
print(good_reviews[:10])
print(bad_reviews[:10])

In [None]:
#Analyse the count of words in each segment- both positive and negative reviews

#Function for checking word length
def cal_len(data):
    """Return the number of items in ``data`` (e.g. the tokens of a split review)."""
    item_count = len(data)
    return item_count

#Create generic plotter with Seaborn
def plot_count(count_ones,count_zeros,title_1,title_2,subtitle):
    """Plot two distributions side by side, each under its matching title.

    The first data argument is paired with the first title and the second
    data argument with the second title, so a call such as
    ``plot_count(positive, negative, "Positive ...", "Negative ...", ...)``
    labels each panel with the data it actually shows.

    Args:
        count_ones: values drawn on the left panel (blue), titled ``title_1``.
        count_zeros: values drawn on the right panel (red), titled ``title_2``.
        title_1: title of the left panel.
        title_2: title of the right panel.
        subtitle: overall figure title.
    """
    fig,(ax1,ax2)=plt.subplots(1,2,figsize=(15,5))
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the seaborn version is pinned.
    sns.distplot(count_ones,ax=ax1,color='Blue')
    ax1.set_title(title_1)
    sns.distplot(count_zeros,ax=ax2,color='Red')
    ax2.set_title(title_2)
    fig.suptitle(subtitle)
    plt.show()

# Words per review: split each text on whitespace and count the pieces.
count_good_words = good_reviews.str.split().apply(cal_len)
count_bad_words = bad_reviews.str.split().apply(cal_len)
print("Positive Review Words:" + str(count_good_words))
print("Negative Review Words:" + str(count_bad_words))
plot_count(
    count_good_words,
    count_bad_words,
    "Positive Review",
    "Negative Review",
    "Reviews Word Analysis",
)

Count Punctuations/Stopwords/Codes and other semantic datatypes

Punctuation marks can convey significant information about sentence structure and tone, which might be crucial for certain NLP tasks like sentiment analysis or natural language understanding. We will plot the counts with the generic `plot_count` function defined above.

In [None]:
# Per-review number of characters that appear in string.punctuation.
punctuation_total = lambda review: sum(1 for char in str(review) if char in string.punctuation)
count_good_punctuations = count_good['review'].apply(punctuation_total)
count_bad_punctuations = count_bad['review'].apply(punctuation_total)
plot_count(
    count_good_punctuations,
    count_bad_punctuations,
    "Positive Review Punctuations",
    "Negative Review Punctuations",
    "Reviews Word Punctuation Analysis",
)

Punctuation marks can convey significant information about sentence structure and tone, which might be crucial for certain NLP tasks like sentiment analysis or natural language understanding.
Removal: In many NLP tasks, especially those focused on understanding the general content or topic of the text (like topic modeling or keyword extraction), punctuation might not add useful information and can be removed to reduce the complexity of the text data.
Preservation: In tasks like text generation, machine translation, or emotion detection, preserving punctuation can be critical as it affects readability and the conveyed emotions or nuances.

In [None]:
# Analyse Stopwords
# Stopwords are common words (such as "the", "is", "in") that are usually filtered out 
# in the preprocessing phase because they occur frequently and are believed to carry 
# little meaningful information about the content of the text.

def plot_count_1(count_ones,count_zeros,title_1,title_2,subtitle):
    """Plot two distributions side by side, each under its matching title.

    The first data argument is paired with the first title and the second
    data argument with the second title, so each panel is labelled with the
    data it actually shows.

    Args:
        count_ones: values drawn on the left panel (blue), titled ``title_1``.
        count_zeros: values drawn on the right panel (orange), titled ``title_2``.
        title_1: title of the left panel.
        title_2: title of the right panel.
        subtitle: overall figure title.
    """
    fig,(ax1,ax2)=plt.subplots(1,2,figsize=(15,5))
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the seaborn version is pinned.
    sns.distplot(count_ones,ax=ax1,color='Blue')
    ax1.set_title(title_1)
    sns.distplot(count_zeros,ax=ax2,color='Orange')
    ax2.set_title(title_2)
    fig.suptitle(subtitle)
    plt.show()

stops = set(stopwords.words('english'))

# Count, per review, how many whitespace-separated tokens are English
# stopwords. Tokens are lowercased before the lookup because the reviews have
# not been lowercased at this point in the notebook.
stopword_total = lambda review: sum(1 for word in str(review).lower().split() if word in stops)
count_good_stops = count_good['review'].apply(stopword_total)
count_bad_stops = count_bad['review'].apply(stopword_total)
plot_count_1(count_good_stops,count_bad_stops,"Positive Reviews Stopwords","Negative Reviews Stopwords","Reviews Stopwords Analysis")

Removing stopwords reduces the dimensionality of the text data, which can improve the performance of NLP models by focusing on more informative words. 
However, in certain NLP tasks, stopwords can provide important context and should be preserved. For instance, in phrase-based sentiment analysis ("not good" versus "good"), stopwords like "not" drastically change the meaning. Similarly, in language modeling and machine translation, stopwords are crucial for generating syntactically correct sentences.

In [None]:
def display_simple_cloud(data, color='black', max_words=20000):
    """Render a word cloud built from the first 20000 texts in ``data``.

    Args:
        data: sliceable sequence of strings (e.g. a list of reviews).
        color: contour colour passed to ``WordCloud``.
        max_words: maximum number of words the cloud may contain.
    """
    plt.figure(figsize=(10,10))
    cloud_options = dict(
        stopwords=STOPWORDS,
        background_color="white",
        contour_width=2,
        contour_color=color,
        max_words=max_words,
        width=800,
        height=400,
    )
    cloud = WordCloud(**cloud_options)
    # Only the first 20000 texts are joined into the corpus for the cloud.
    corpus = ' '.join(data[:20000])
    cloud.generate(corpus)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

# Word cloud of the positive reviews, drawn with a red contour.
display_simple_cloud(good_reviews.tolist(), 'red')  # tolist() turns the review Series into a plain list

In [None]:
# Word cloud of the negative reviews, drawn with a blue contour.
display_simple_cloud(bad_reviews.tolist(), 'blue') 

Note: Our task is sentiment analysis, which is a context- and syntax-dependent task, so maintaining punctuation and handling stopwords carefully can be essential to preserve the original meaning and structure of the text. For example, in phrase-based sentiment analysis ("not good" versus "good"), stopwords like "not" drastically change the meaning. Similarly, in language modeling and machine translation, stopwords are crucial for generating syntactically correct sentences.

In [None]:
# Removing punctuation? visualise good reviews 

from collections import Counter

# reviews holds the positive review texts
reviews = good_reviews

# Combine all reviews into one large string
all_reviews = " ".join(reviews)

# Tally every character of the combined text that is a punctuation mark
punctuation_counts = Counter(filter(lambda char: char in string.punctuation, all_reviews))

# Bar chart: one bar per punctuation mark, height = number of occurrences
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(punctuation_counts.keys(), punctuation_counts.values())
ax.set_title('Frequency of Punctuation Marks in Reviews')
ax.set_xlabel('Punctuation Mark')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Visualise bad reviews

# reviews holds the negative review texts
reviews = bad_reviews

# Combine all reviews into one large string
all_reviews = " ".join(reviews)

# Tally every character of the combined text that is a punctuation mark
punctuation_counts = Counter(filter(lambda char: char in string.punctuation, all_reviews))

# Bar chart: one bar per punctuation mark, height = number of occurrences
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(punctuation_counts.keys(), punctuation_counts.values())
ax.set_title('Frequency of Punctuation Marks in Reviews')
ax.set_xlabel('Punctuation Mark')
ax.set_ylabel('Frequency')
plt.show()

From the graph, we can see that punctuation marks with emotional connotations, such as "!" and "?", occur less frequently in the text. In this case, we will remove them in the next step.

## 4.1 Text Cleaning and Normalisation

In [None]:
%%time
import re

# Removes Punctuations
def remove_punctuations(text):
    """Delete every character that is neither a word character nor whitespace.

    Underscores count as word characters for ``\\w`` and are therefore kept.
    """
    return re.sub(r'[^\w\s]', r'', text)

# Removes HTML syntaxes
def remove_html(text):
    """Delete every ``<...>`` span, matching lazily up to the nearest ``>``."""
    return re.sub(r'<.*?>', r'', text)

# Removes URL data
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.

    Args:
        text: input string.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    # `s?` makes the scheme match both http:// and https:// links.
    url_clean = re.compile(r"https?://\S+|www\.\S+")
    text = url_clean.sub(r'', text)
    return text

# Removes Emojis
def remove_emoji(text):
    """Delete runs of characters that fall in the code-point ranges listed below."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    # The ranges form a single character class; `+` removes a whole run at once.
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

# Convert text to lowercase
def to_lowercase(text):
    """Return ``text`` with its cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered


# Applying all functions to the 'review' column.
# Order matters: HTML tags and URLs are recognised by characters such as
# '<', '>', ':', '/' and '.', so they must be removed BEFORE punctuation is
# stripped; otherwise the tag/URL patterns can no longer match and their
# letters stay in the text. Punctuation removal therefore runs last.
cleaning_steps = (
    to_lowercase,
    remove_html,
    remove_url,
    remove_emoji,
    remove_punctuations,
)
for cleaning_step in cleaning_steps:
    train_df['review'] = train_df['review'].apply(cleaning_step)

In [None]:
# Show the first 10 rows to inspect the cleaned review text.
print(train_df.head(10))

Since our task is sentiment analysis, we can remove unnecessary information such as punctuation and URLs, and convert the text to lowercase to ensure consistency. I decided to remove emoji because this task already has labels 1 and 2 to classify negative and positive reviews. Besides, the dataset is large, so the emoji are also removed to reduce redundancy. 

Now the dataset was cleaned we can move to tokenization step.  

## 4.2 Batch Tokenisation

In [None]:
#from multiprocessing import Pool
#import pandas as pd

# Update to a simpler, faster tokenizer for demonstration
#def tokenize_and_remove_stopwords(text, stop_words):
    # Simple space-based tokenization
#    tokens = text.lower().split()
#    filtered_tokens = [token for token in tokens if token not in stop_words]
#    return filtered_tokens

#def process_chunk(chunk):
    # Process each text in the chunk using the tokenizer and stop words removal
#    return [tokenize_and_remove_stopwords(text, stop_words) for text in chunk]

# Prepare for parallel processing
#if __name__ == '__main__':
    # Setup multiprocessing pool (adjust processes number as per your system's capability)
#    pool = Pool(processes=4)  # Example: 4 parallel processes

    # Split data into chunks for processing
#    chunks = [train_df['review'][i:i + CHUNK_SIZE] for i in range(0, len(train_df['review']), CHUNK_SIZE)]

    # Use pool.map to process chunks in parallel
#    processed_chunks = pool.map(process_chunk, chunks)

    # Close the pool and wait for the work to finish
#    pool.close()
#    pool.join()

In [None]:
# Tunable constants shared by the tokenisation and modelling cells below.
BATCH_SIZE = 256     # passed to model.fit as batch_size
MAX_FEATURES = 1000  # vocabulary cap: Tokenizer(num_words=...) and Embedding(input_dim=...)
# NOTE(review): despite its name, EMBEDDING_DIM is used as the padded sequence
# length (pad_sequences maxlen / Embedding input_length), not as the embedding width.
EMBEDDING_DIM = 120
CHUNK_SIZE = 50000   # number of reviews per chunk when the review column is sliced

In [None]:
import concurrent.futures

# Define tokenize_and_remove_stopwords and process_chunk directly in the notebook
def tokenize_and_remove_stopwords(text, stop_words):
    """Lowercase ``text``, split it on whitespace and drop stop words.

    Args:
        text: the string to tokenize.
        stop_words: container of tokens to exclude (membership-tested).

    Returns:
        List of the remaining lowercase tokens, in their original order.
    """
    return [token for token in text.lower().split() if token not in stop_words]

def process_chunk(chunk):
    """Tokenize every text in ``chunk`` and drop stop words.

    Uses ``tokenize_and_remove_stopwords`` with the module-level ``stop_words``
    and returns one token list per input text, in input order.
    """
    cleaned_texts = []
    for text in chunk:
        cleaned_texts.append(tokenize_and_remove_stopwords(text, stop_words))
    return cleaned_texts

# Assuming train_df and other variables are defined as before

# Slice the review column into CHUNK_SIZE-row pieces here, so this cell does
# not rely on a `content_chunks` variable that is only created further down
# the notebook (which fails on a fresh kernel).
content_chunks = [
    train_df['review'][start:start + CHUNK_SIZE]
    for start in range(0, len(train_df['review']), CHUNK_SIZE)
]

# Use ThreadPoolExecutor to process chunks
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Submit tasks to the executor
    futures = [executor.submit(process_chunk, chunk) for chunk in content_chunks]

    # Collect results in submission order (not completion order) so the
    # processed chunks stay aligned with the row order of train_df, and
    # therefore with its labels.
    processed_chunks = [future.result() for future in futures]

## 4.3 Preparing data for modeling

In [None]:
# Turn the tokenized reviews into integer sequences of uniform length.

# Flatten one level: every chunk result is a list of token lists, so the
# flattened list holds one token list per review.
processed_texts = [
    token_list
    for chunk_result in processed_chunks
    for token_list in chunk_result
]

# Fit the tokenizer on the token lists, then map them to integer sequences
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(processed_texts)
sequences = tokenizer.texts_to_sequences(processed_texts)

# Pad/truncate every sequence to EMBEDDING_DIM entries
data = pad_sequences(sequences, maxlen=EMBEDDING_DIM)

## 4.4 Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Build the binary target array that the split needs (it was never defined):
# label 2 (positive) -> 1, label 1 (negative) -> 0, which is the 0/1 encoding
# expected by a sigmoid output trained with binary cross-entropy.
labels = (train_df['label'] == 2).astype(int).to_numpy()

# NOTE(review): this assumes the rows of `data` are in the same order as the
# rows of train_df — verify that the chunk processing preserves row order.
assert len(labels) == len(data), "features and labels must have the same number of rows"

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

## 4.5 Build the models

In [None]:
# neural network (RNN) with LSTM (Long Short-Term Memory) or GRU (Gated Recurrent Unit) layers, or using a Transformer-based model like BERT.

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Embedding -> single LSTM layer -> one sigmoid unit for binary sentiment.
model = Sequential([
    Embedding(input_dim=MAX_FEATURES, output_dim=120, input_length=EMBEDDING_DIM),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## 4.6 Training the data

In [None]:
# Train for 10 epochs; validation_split=0.1 holds out 10% of the training data for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=BATCH_SIZE)

## 4.7 Evaluation

In [None]:
# Evaluate on the held-out split; the model was compiled with metrics=['accuracy'],
# so evaluate returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

In [None]:
# Convert tokenized text into numerical format and then pad or truncate the sequences to have uniform length
# NOTE(review): this section reads processed_chunks, MAX_FEATURES and
# EMBEDDING_DIM before the statements further down reassign them, so its
# result depends on values left behind by earlier cells — confirm the
# intended execution order.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize tokenizer with a specified number of words to keep
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(processed_chunks)  # Fit on processed text data

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(processed_chunks)

# Pad sequences to ensure uniform length
data = pad_sequences(sequences, maxlen=EMBEDDING_DIM)

## Process the dataset in chunks and tokenize each chunk separately 

# NOTE(review): these assignments replace the values set in the earlier
# configuration cell (MAX_FEATURES 1000, EMBEDDING_DIM 120, CHUNK_SIZE 50000);
# every later use sees the values below — confirm this is intended.
BATCH_SIZE = 256
MAX_FEATURES = 100
EMBEDDING_DIM = 100
CHUNK_SIZE = 10000

# Download the NLTK resources named 'punkt' and 'stopwords'.
nltk.download('punkt')
nltk.download('stopwords')

# Rebuild stop_words as a set so membership tests and .remove() are available.
stop_words = set(stopwords.words('english'))
stop_words.remove('not')  # Preserve 'not' for sentiment analysis

def process_chunk(texts):
    """Lowercase and tokenize each text with ``word_tokenize``, dropping stop words.

    Returns one list of kept tokens per input text, in input order; stop words
    are looked up in the module-level ``stop_words``.
    """
    return [
        [word for word in word_tokenize(text.lower()) if word not in stop_words]
        for text in texts
    ]

## Slice the 'review' column of the dataframe into CHUNK_SIZE-row chunks
review_column = train_df['review']
content_chunks = [
    review_column[start:start + CHUNK_SIZE]
    for start in range(0, len(review_column), CHUNK_SIZE)
]

# Process chunk by chunk; extend() keeps one flat list with one token list per review.
processed_chunks = []
for chunk in content_chunks:
    processed_chunks.extend(process_chunk(chunk))

## 4.3 Preparing for Machine Learning 

## 4.2 Tokenisation and pad_sequences

In [None]:
# the tokenizer will only consider the top 20,000 most common words in the dataset
voc_size = 20000

# maximum length of the sequences (lists of tokens) to 100
max_length = 100

# converting only 20,000 common words into sequences of integer
tokenizer = Tokenizer(num_words=voc_size)

# Build the word -> integer mapping from the review texts. The 'review' column
# must be passed explicitly: iterating over a DataFrame yields its column
# names, so fitting on `train_df` itself would only see the column labels.
tokenizer.fit_on_texts(train_df['review'])

# word_index is a dictionary mapping words to integer representation. 
# This can be useful for understanding the tokenization mapping or for further processing.
word_index = tokenizer.word_index

# Serialize the fitted tokenizer with pickle ('wb' = write-binary) so it can be
# reloaded in future sessions or by other models without re-fitting it.
# NOTE(review): the output location is an absolute, machine-specific path.
with open('/Users/szuyingpan/Desktop/NLP/CW1/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# Tokenizing the text in the training and testing data
#train_sequences = tokenizer.texts_to_sequences(train_df['review'])  
#test_sequences = tokenizer.texts_to_sequences(test_df['reviewt'])   

In [None]:
# Map every review to its sequence of integer word indices using the fitted tokenizer.
train = tokenizer.texts_to_sequences(train_df['review'])
#train = pad_sequences(train_df, maxlen=max_length)
test = tokenizer.texts_to_sequences(test_df['review'])
#test = pad_sequences(test_df, maxlen=max_length)
# NOTE(review): the padding calls above are commented out (and they target the
# DataFrames rather than `train` / `test`), so the sequences are left unpadded here.

In [None]:
# Binary targets: label 2 (positive) -> 1, anything else (label 1, negative) -> 0.
# The labels are read from the 'label' column of each frame; casting to str
# before comparing with '2' makes the check work whether the column holds
# integers or strings.
train_lab = (train_df['label'].astype(str) == '2').astype(int).to_numpy()
test_lab = (test_df['label'].astype(str) == '2').astype(int).to_numpy()

In [None]:
# The dataset is too large for the previous tokenisation approach, so the text
# is tokenised with spaCy instead, using nlp.pipe() for batched processing and
# tqdm to show progress.
#!pip install spacy


# Load the spaCy model named "en_core_web_sm"
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) # the parser and ner components are not needed for tokenisation

In [None]:
# Pull the review texts out of the DataFrame as a plain list
documents = train_df['review'].tolist()

# Stream the documents through spaCy in batches of 50 (tqdm shows progress)
# and keep the text of every token of every document.
tokenized_docs = [
    [token.text for token in doc]
    for doc in tqdm(nlp.pipe(documents, batch_size=50), total=len(documents))
]

# Store the token lists next to the original reviews
train_df['tokenized_review'] = tokenized_docs

# `train_df` now has a 'tokenized_review' column holding one token list per review

In [None]:

# Join each token list into a single space-separated string. The isinstance
# guard makes the cell safe to re-run: a value that is already a string is
# left untouched instead of having its characters joined with spaces.
train_df['tokenized_review'] = train_df['tokenized_review'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, (list, tuple)) else tokens
)

# Save the updated DataFrame to CSV, which now includes the original reviews and their tokenized versions
train_df.to_csv("tokenized_reviews.csv", index=False)

In [None]:
import pandas as pd

# Wrap the token lists in a one-column DataFrame ("Tokenized_Text")
df = pd.DataFrame({"Tokenized_Text": tokenized_docs})

# Save to CSV
# NOTE(review): this writes to the same file name ("tokenized_reviews.csv") as
# the train_df export in the previous cell and therefore overwrites it —
# confirm which of the two outputs is meant to be kept.
df.to_csv("tokenized_reviews.csv", index=False)

### 4.3 Stop words removal 

In [None]:
# Download stop words and the tokenizer data used by word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

# Load the default list of stop words and then remove 'not' from it
stop_words = set(stopwords.words('english'))
stop_words.remove('not')

# Tokenize one sample review here so this cell runs on a fresh kernel; without
# this line `tokenized_sentence` would not be defined yet at this point.
tokenized_sentence = word_tokenize(train_df['review'].iloc[0])

# Now filter text using this customized list
filtered_sentence = [word for word in tokenized_sentence if word not in stop_words]

### 4.4 Create n-grams

In [None]:
# Create unigram counts.
# CountVectorizer is used here as a word-level tokenizer/counter; no ngram_range
# is passed, so the library default (unigrams) applies.
vectorizer_unigram = CountVectorizer(analyzer='word')  # default is unigram/1-gram
# Learn the vocabulary from the reviews and build the document-term count matrix.
unigram_data = vectorizer_unigram.fit_transform(train_df['review'])

In [None]:
# Create bigrams: ngram_range=(2,2) restricts the features to 2-word sequences only
vectorizer_bigram = CountVectorizer(analyzer='word', ngram_range=(2,2))
# Learn the bigram vocabulary from the reviews and build the count matrix.
bigram_data = vectorizer_bigram.fit_transform(train_df['review'])

In [None]:
# Create trigrams
#vectorizer_trigram = CountVectorizer(analyzer='word', ngram_range=(3,3))
#trigram_data = vectorizer_trigram.fit_transform(train_df['review'])

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample sentence used to demonstrate the customised stop-word list
text = "This is an example sentence to demonstrate not removing 'not' from the stop words."

# Download necessary NLTK data
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Start from NLTK's English stop words, but keep 'not' out of the list so
# that negations survive the filtering.
stop_words = set(stopwords.words('english'))
stop_words.remove('not')

# Tokenize the sample text
tokenized_sentence = word_tokenize(text)

# Keep only the tokens that are not in the customised stop-word list
filtered_sentence = list(filter(lambda word: word not in stop_words, tokenized_sentence))

# Let's see the filtered sentence
print(filtered_sentence)

In [None]:
# Create trigrams: ngram_range=(3,3) restricts the features to 3-word sequences only.
# NOTE(review): the earlier comment here described combining unigrams, bigrams and
# trigrams with ngram_range=(1,3); the code (and the variable names) build
# trigrams only — confirm which of the two is intended.
vectorizer_trigram = CountVectorizer(analyzer='word', ngram_range=(3,3))
trigram_data = vectorizer_trigram.fit_transform(train_df['review'])

## 6. Transforming the corpus

In [None]:
# How to evaluate how many n-grams in the summary?

# See PPT p.76 skip-gram use ROUGE-S 

In [None]:
#def lemmmatization(text, allowed_postages=['NOUN','ADJ','VERB','ADV']):
#    nlp = spacy.load('en_core_')

## 7. Model Selection and Training

### Transformer 
BERT Embeddings
BERT is a well-established, state-of-the-art transformer architecture published by Google Research which uses bidirectional pre-training. BERT is pre-trained with two important objectives:

- Masked Language Model (MLM)
- Next Sentence Prediction (NSP)

The bidirectional pre-training is helpful for a wide range of downstream tasks. The Hugging Face implementation is helpful for fine-tuning BERT for any language modelling task. The BERT architecture uses only the encoder stack of the Transformer (it has no decoder), as follows:

## 8. Evaluation Metrics 

In [None]:
# F1,Recall, Precision
# Confusion matrix

## 9.Reference 

In [None]:
# https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/