## Importing Libraries

In [None]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator ##is a data visualization technique used
#for representing text data in which the size of each word indicates its frequency


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm ##new progress bars repeatedly
import os 
import nltk ##building Python programs to work with human language data
import spacy #for training the NER model tokenize words
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")



## Mounting the Drive and loading the data

In [None]:
# Mount Google Drive at /content/gdrive so files stored on the Drive can be read.
# NOTE(review): this import sits outside the notebook's import cell and is only
# available on Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Absolute location of the dataset inside the Drive mounted at /content/gdrive.
# Using the full path means the loads no longer depend on the kernel's working
# directory happening to be /content, and the folder is named in one place.
DATA_DIR = '/content/gdrive/My Drive/Sentiment Analysis'

df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
df_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
# Preview the first five training rows.
# NOTE(review): the saved output below already contains Num_words_text, a column
# this notebook only adds in a later cell - presumably the cell was last run out
# of order; confirm with Restart Kernel -> Run All.
df_train.head(5)

Unnamed: 0,textID,text,selected_text,sentiment,Num_words_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,7
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,10
2,088c60f138,my boss is bullying me...,bullying me,negative,5
3,9642c003ef,what interview! leave me alone,leave me alone,negative,5
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,14


In [None]:
# Preview the first five test rows.
df_test.head(5)

## Modelling
After going through some discussion forums, we found that this problem can be modelled in the following ways:

Named Entity Recognition

Q&A Problem

## 1) Modelling the Problem as NER
Named Entity Recognition (NER) is a standard NLP problem which involves spotting named entities (people, places, organizations etc.) in a chunk of text and classifying them into a predefined set of categories. For an introduction to NER, here is a very good article: https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

We will be using spaCy to create our own customised NER model or models (a separate one for each sentiment).

What will be different with my solution:

- I will use text as selected_text for all neutral tweets, due to their high Jaccard similarity
- I will also use text as selected_text for all tweets having fewer than 3 words in text, as explained before
- I will train two different models, one for positive and one for negative tweets
- I will not preprocess the data, because the selected text contains raw text

In [None]:
# Add a word-count column: the number of whitespace-separated tokens in each training tweet's text
df_train['Num_words_text'] = [len(str(tweet).split()) for tweet in df_train['text']]

In [None]:
# Drop every row that contains a missing value; df_train is rebound to the
# cleaned frame so the rest of the notebook works without NaNs
df_train = df_train.dropna()

In [None]:
# Keep only training tweets whose text has at least three words
# (the prediction cell returns the full text for tweets of two words or fewer)
df_train = df_train.loc[df_train['Num_words_text'].ge(3)]

We use spaCy's statistical NER component and train it on our own labelled tweets: a blank English pipeline is created for each sentiment, and the `selected_text` span of every tweet is the entity to be learned.
This works because:
spaCy’s models are statistical and every “decision” they make – for example, which part-of-speech tag to assign, or whether a word is a named entity – is a prediction.

In [None]:

def save_model(output_dir, nlp, new_model_name):
    """Save a spaCy pipeline under ``../Output/<output_dir>``.

    Args:
        output_dir: Path, relative to ``../Output``, to write the model to.
            When ``None`` nothing is saved.
        nlp: Pipeline object to persist; must expose ``meta`` and ``to_disk``.
        new_model_name: Value stored in ``nlp.meta["name"]`` before saving.
    """
    # Check for None *before* building the path: formatting None into the
    # f-string yields '../Output/None', which made the old check always true.
    if output_dir is None:
        return

    output_dir = f'../Output/{output_dir}'
    # exist_ok avoids the separate exists() test and the race it implies.
    os.makedirs(output_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
# Pass `model` (a spaCy model name/path, or an already-loaded pipeline) to keep
# training an existing model; leave it as None to start from a blank pipeline.

def train(train_data, output_dir, n_iter=20, model=None):
    """Train a spaCy NER component on ``train_data`` and save the pipeline.

    Uses the spaCy v2 training API (``create_pipe`` and
    ``nlp.update(texts, annotations, ...)``).

    Args:
        train_data: List of ``(text, {"entities": [[start, end, label], ...]})``
            examples. The list itself is not reordered.
        output_dir: Destination handed to ``save_model``.
        n_iter: Number of passes over the training data.
        model: ``None`` to train a blank ``"en"`` pipeline from scratch.
            Otherwise a model name/path (loaded with ``spacy.load``) or an
            already-loaded pipeline, whose training is resumed.
    """
    if model is not None:
        # Load the model the caller asked for. The previous code loaded
        # `output_dir` here, i.e. the save destination instead of `model`.
        if isinstance(model, (str, os.PathLike)):
            nlp = spacy.load(model)
        else:
            nlp = model  # assumed to be an already-loaded spaCy pipeline
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # blank English Language object
        print("Created blank 'en' model")

    # Create the built-in NER component and add it to the pipeline if it is
    # missing; otherwise fetch the existing one so labels can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every entity label that occurs in the training data.
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(train_data)

    # Disable every other pipe so that only the NER component is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        # Each nlp.update call accumulates the batch loss into `losses`; the
        # total is printed once per iteration. tqdm shows a progress bar over
        # the iterations.
        for itn in tqdm(range(n_iter)):
            # Re-shuffle every pass so batches differ between iterations.
            random.shuffle(examples)
            # Batch size follows compounding(4.0, 500.0, 1.001).
            batches = minibatch(examples, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,  # batch of texts
                           annotations,  # batch of annotations
                           drop=0.5,  # dropout - make it harder to memorise data
                           losses=losses,
                           )
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [None]:
def get_model_out_path(sentiment):
    """Map a sentiment label to the relative directory of its model.

    Returns 'models/model_pos' for 'positive', 'models/model_neg' for
    'negative', and None for any other value.
    """
    known_paths = (
        ('positive', 'models/model_pos'),
        ('negative', 'models/model_neg'),
    )
    for label, path in known_paths:
        if sentiment == label:
            return path
    return None

In [None]:
def get_training_data(sentiment, data=None):
    """Build spaCy NER training examples for one sentiment.

    Args:
        sentiment: Label to keep; compared against the ``sentiment`` column.
        data: DataFrame with ``text``, ``selected_text`` and ``sentiment``
            columns. Defaults to the notebook-level ``df_train``.

    Returns:
        List of ``(text, {"entities": [[start, end, 'selected_text']]})``
        tuples, where ``start``/``end`` are the character offsets of
        ``selected_text`` inside ``text``. Rows are skipped when either field
        is not a non-empty string or when ``selected_text`` does not occur in
        ``text``.
    """
    if data is None:
        data = df_train

    matching = data.loc[data['sentiment'] == sentiment, ['text', 'selected_text']]

    train_data = []
    for text, selected_text in zip(matching['text'], matching['selected_text']):
        # Missing values (NaN/None) and empty spans cannot be labelled.
        if not isinstance(text, str) or not isinstance(selected_text, str):
            continue
        if not selected_text:
            continue
        start = text.find(selected_text)
        if start < 0:
            # str.find returns -1 on a miss; using it would produce the bogus
            # span [-1, len(selected_text) - 1], so drop the row instead.
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data

## Training Models for Negative and Positive Tweets

In [None]:
# Train the span extractor on positive tweets (3 passes, starting from a blank
# pipeline) and save it to the positive model path.
sentiment = 'positive'
model_path = get_model_out_path(sentiment)
train_data = get_training_data(sentiment)

train(train_data=train_data, output_dir=model_path, n_iter=3, model=None)

  0%|          | 0/3 [00:00<?, ?it/s]

Created blank 'en' model


 33%|███▎      | 1/3 [00:50<01:40, 50.22s/it]

Losses {'ner': 33180.392996583454}


 67%|██████▋   | 2/3 [01:40<00:50, 50.14s/it]

Losses {'ner': 30022.98573069785}


100%|██████████| 3/3 [02:30<00:00, 50.20s/it]

Losses {'ner': 29243.4466882204}
Saved model to ../Output/models/model_pos





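The output above shows the three training iterations. The NER loss decreases with every iteration, which indicates that the model is learning from the training data; note that a falling training loss does not by itself measure how well the model performs on unseen tweets.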

In [None]:
# Train the span extractor on negative tweets (3 passes, starting from a blank
# pipeline) and save it to the negative model path.
sentiment = 'negative'
model_path = get_model_out_path(sentiment)
train_data = get_training_data(sentiment)

train(train_data=train_data, output_dir=model_path, n_iter=3, model=None)

  0%|          | 0/3 [00:00<?, ?it/s]

Created blank 'en' model


 33%|███▎      | 1/3 [00:47<01:35, 47.65s/it]

Losses {'ner': 31150.466795625864}


 67%|██████▋   | 2/3 [01:33<00:47, 47.26s/it]

Losses {'ner': 28415.23619190307}


100%|██████████| 3/3 [02:20<00:00, 46.71s/it]

Losses {'ner': 26911.06724020458}
Saved model to ../Output/models/model_neg





The output above shows the three training iterations. Here too the loss (error) decreases with every iteration, which indicates that the model is learning from the training data.

## Predicting with the trained model
Now using the models, let's predict the selected text of the test data using the sentiment provided

In [None]:
def predict_entities(text, model):
    """Return the part of ``text`` covered by the first entity ``model`` finds.

    ``model`` is called on ``text`` and must return an object with an ``ents``
    iterable whose items expose ``.text``. When there are no entities the
    whole ``text`` is returned unchanged.
    """
    first_entity = next(iter(model(text).ents), None)
    if first_entity is None:
        return text

    # Only the first entity decides the result: slice the tweet at the first
    # occurrence of that entity's text.
    span = first_entity.text
    begin = text.find(span)
    return text[begin:begin + len(span)]

In [None]:
selected_texts = []
MODELS_BASE_PATH = '../Output/models/'

# Load the two sentiment-specific extractors written by the training cells.
# (The former `MODELS_BASE_PATH is not None` guard was always true, and the
# unused `output_str` variable has been removed.)
print("Loading Models  from ", MODELS_BASE_PATH)
model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')

for index, row in df_test.iterrows():
    text = row.text
    if row.sentiment == 'neutral' or len(text.split()) <= 2:
        # Neutral tweets and tweets of at most two words: predict the whole text.
        selected_texts.append(text)
    elif row.sentiment == 'positive':
        selected_texts.append(predict_entities(text, model_pos))
    else:
        # Every remaining sentiment value is handled by the negative model.
        selected_texts.append(predict_entities(text, model_neg))

# Store the predictions as a new selected_text column on the test frame
df_test['selected_text'] = selected_texts

Loading Models  from  ../Output/models/


In [None]:
# Copy the predicted spans from the test frame into the submission frame,
# write submission.csv without the index, and preview the first ten rows
df_submission = df_submission.assign(selected_text=df_test['selected_text'])
df_submission.to_csv("submission.csv", index=False)
display(df_submission.head(10))

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,Recession
3,01082688c6,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!
5,726e501993,that`s great!! weee!! visitors!
6,261932614e,HATES
7,afa11da83f,completely
8,e64208b4ef,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...


In [None]:
# Show each tweet's sentiment next to its predicted span, for inspection
df_submission = df_submission.assign(sentiment=df_test['sentiment'])
display(df_submission.head(10))

Unnamed: 0,textID,selected_text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,exciting,positive
2,eee518ae67,Recession,negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
5,726e501993,that`s great!! weee!! visitors!,positive
6,261932614e,HATES,negative
7,afa11da83f,completely,negative
8,e64208b4ef,and within a short time of the last clue all ...,neutral
9,37bcad24ca,What did you get? My day is alright.. haven`...,neutral
