# Tweet sentiment extraction
#### This noteboook explore some methods to extract sentiment from a tweet
<ul>
    <li>Space Name Entity Recognition</li>
    <li>Roberta inference model with Pytorch</li>
</ul>

In [1]:
import pandas as pd
train = pd.read_csv("./Data/train.csv")

In [2]:
train.head(5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## statistics 
### Explore number of sentiments

In [3]:
import matplotlib.pyplot as plt

train.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

### clean text, remove url link, stop words, stem words

In [4]:
import re
import string
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [5]:
train["text"] = train["text"].apply(lambda x: clean_text(x))
train['selected_text'] = train['selected_text'].apply(lambda x:clean_text(x))

In [12]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer() 

def stem_word(words):
    text = []
    for word in word_tokenize(words):
        text.append(ps.stem(word))
    return " ".join(text)
train["text"] = train["text"].apply(lambda x: stem_word(x))
train['selected_text'] = train['selected_text'].apply(lambda x:stem_word(x))

In [13]:
from collections import Counter
cnt = Counter()
for line in range(len(train)):
    for word in word_tokenize(train.iloc[line, 1]):
        cnt[word] += 1

In [14]:
cnt.most_common(20)

[('im', 3023),
 ('day', 2368),
 ('go', 2363),
 ('get', 1908),
 ('good', 1569),
 ('work', 1483),
 ('love', 1456),
 ('like', 1454),
 ('got', 1238),
 ('dont', 1200),
 ('today', 1114),
 ('time', 1078),
 ('one', 1055),
 ('cant', 1020),
 ('happi', 993),
 ('want', 981),
 ('know', 967),
 ('miss', 961),
 ('thank', 961),
 ('lol', 950)]

The most common words in the dataset exprims some sentiments like love/like, good, don't, miss etc...

In [16]:
from collections import Counter
cnt = Counter()
for line in range(len(Positive_sent)):
    for word in word_tokenize(Positive_sent.iloc[line, 2]):
        cnt[word] += 1
cnt.most_common(10)

[('love', 910),
 ('good', 832),
 ('happi', 739),
 ('thank', 633),
 ('day', 471),
 ('great', 364),
 ('hope', 335),
 ('fun', 287),
 ('nice', 270),
 ('mother', 269)]

In [17]:
from collections import Counter
cnt = Counter()
for line in range(len(Negative_sent)):
    for word in word_tokenize(Negative_sent.iloc[line, 2]):
        cnt[word] += 1
cnt.most_common(10)

[('miss', 550),
 ('im', 452),
 ('sad', 352),
 ('sorri', 302),
 ('hate', 273),
 ('bad', 246),
 ('feel', 242),
 ('dont', 221),
 ('suck', 217),
 ('cant', 201)]

In [18]:
from collections import Counter
cnt = Counter()
for line in range(len(Neutral_sent)):
    for word in word_tokenize(Neutral_sent.iloc[line, 2]):
        cnt[word] += 1
cnt.most_common(10)

[('go', 1041),
 ('im', 1040),
 ('get', 806),
 ('work', 633),
 ('day', 622),
 ('got', 526),
 ('dont', 482),
 ('like', 470),
 ('time', 455),
 ('want', 441)]

### Train Spacy Name Entity Recognition (NER)

In [6]:
def get_train_datas(data):
    train_datas = []
    texts = data.text
    selected_texts = data.selected_text
    for selected_text, text in zip(selected_texts, texts):
        start = text.find(selected_text)
        end = start + len(selected_text)
        train_datas.append((text, {"entities":[(start, end, "selected_text")]}))
    return train_datas

In [7]:
def load_model(pre_model = None, label = None):
    if pre_model is not None:
        nlp = spacy.load(pre_model)
        print("Loaded model '%s'" % pre_model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        else:
            ner = nlp.get_pipe("ner")

        if label is not None:
            ner.add_label(label)

    return nlp

In [29]:
def train_model(model, nlp, train_datas, n_iter = 30):
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):
        sizes = compounding(1.0, 64.0, 1.001)
        print(train_datas[0])
        for itn in range(n_iter):
            random.shuffle(train_datas)
            batches = minibatch(train_datas, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print(itn, "Losses", losses)

    return nlp

In [25]:
import os
def test_model(ner_model, text):
    doc = ner_model(text)
    ent_array = []
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        new_int = [start, end, ent.label_]
        
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])
        
    return text[ent_array[0][0]:ent_array[0][1]] if len(ent_array) > 0 else text


def save_model(ner_model, output_dir = None, new_model_name = None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        ner_model.meta["name"] = new_model_name
        ner_model.to_disk(output_dir)
        print("Saved model to", output_dir)

In [26]:
def get_model(sentiment, train_datas, more_iters = 30):
    if sentiment == 'positive':
        positive_model_path = "./models/positive_ner/"
        positive_datas = train_datas
        if not os.path.exists(positive_model_path):
            nlp = load_model(label = 'selected_text')
            ner_model_positive = train_model(None, nlp, positive_datas, n_iter=50)
            save_model(ner_model_positive, output_dir = "./models/positive_ner/", new_model_name = "posi_model")
        else:
            ner_model_positive = load_model(positive_model_path)
#             ner_model_positive = spacy.load("/kaggle/working/models")
            if more_iters > 0:
                ner_model_positive = train_model(positive_model_path, ner_model_positive, positive_datas, more_iters)
                save_model(ner_model_positive, output_dir = "./models/positive_ner/", new_model_name = "posi_model")
        return ner_model_positive
    else:
        negative_model_path = "./models/negative_ner/"
        negative_datas = train_datas
        if not os.path.exists(negative_model_path):
            nlp = load_model(label = 'selected_text')
            ner_model_negative = train_model(None, nlp, negative_datas, n_iter=50)
            save_model(ner_model_negative, output_dir = "./models/negative_ner/", new_model_name = "nega_model")
        else:
            ner_model_negative = load_model(negative_model_path)
#             ner_model_negative = spacy.load("/kaggle/working/models_nega")
            if more_iters > 0:
                ner_model_negative = train_model(negative_model_path, ner_model_negative, negative_datas, more_iters)
                save_model(ner_model_negative, output_dir = "./models/negative_ner/", new_model_name = "nega_model")
        return ner_model_negative

In [11]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
Positive_sent = train[train['sentiment']=='positive']
Negative_sent = train[train['sentiment']=='negative']
Neutral_sent = train[train['sentiment']=='neutral']

Positive_sent = Positive_sent.reindex(axis=1)
Positive_sent["text"] = Positive_sent["text"].apply(lambda x: x.strip())
Negative_sent = Negative_sent.reindex(axis=1)
Negative_sent["text"] = Negative_sent["text"].apply(lambda x: x.strip())

Positive_sent["text"] = Positive_sent["text"].apply(lambda text: ' '.join(text_to_word_sequence(
    text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' '
)))
Negative_sent["text"] = Negative_sent["text"].apply(lambda text: ' '.join(text_to_word_sequence(
    text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' '
)))

positive_tweet = get_train_datas(Positive_sent)
negative_tweet = get_train_datas(Negative_sent)

positive_tweet_train = []
for i in range(len(positive_tweet)):
    if positive_tweet[i][0] != "":
        positive_tweet_train.append(positive_tweet[i])
        
negative_tweet_train = []
for i in range(len(negative_tweet)):
    if negative_tweet[i][0] != "":
        negative_tweet_train.append(positive_tweet[i])

In [15]:
from __future__ import unicode_literals, print_function
!pip install spacy
import spacy
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/55/2e/ac00f5c9d01e66cc6ab75eb2a460c9b0dc21ad99a12f810c86a58309e63c/spacy-2.2.4-cp36-cp36m-manylinux1_x86_64.whl (10.6MB)
[K     |████████████████████████████████| 10.6MB 22kB/s  eta 0:00:01    |█████▌                          | 1.8MB 2.8MB/s eta 0:00:04     |██████████▎                     | 3.4MB 2.8MB/s eta 0:00:03███████▌                   | 4.1MB 1.3MB/s eta 0:00:06     |███████████████████▎            | 6.4MB 2.3MB/s eta 0:00:02
Collecting srsly<1.1.0,>=1.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/0e/9a/70bd934dd4d25545c9aa6c8cd4edbac2a33ba9c915439a9209b69f0ec0ad/srsly-1.0.2-cp36-cp36m-manylinux1_x86_64.whl (185kB)
[K     |████████████████████████████████| 194kB 8.1MB/s eta 0:00:01
[?25hCollecting plac<1.2.0,>=0.9.6
  Downloading https://files.pythonhosted.org/packages/86/85/40b8f66c2dd8f4fd9f09d59b22720cffecf1331e788b8a0cab5bafb353d1/plac-1.1.3-py2.py3-none-any.whl
Collecting cyme

In [18]:
ner_model_positive = get_model('positive', positive_tweet_train, more_iters=100)

Created blank 'en' model
('happy mothers day to all beautiful mother may your love shines the world thank you mum', {'entities': [(0, 5, 'selected_text')]})
0 Losses {'ner': 19844.369808229545}
1 Losses {'ner': 19451.511896747164}
2 Losses {'ner': 18303.655233740807}
3 Losses {'ner': 17001.421297371387}
4 Losses {'ner': 16384.183547489345}
5 Losses {'ner': 15690.280764579773}
6 Losses {'ner': 14684.084883540869}
7 Losses {'ner': 14334.895932555199}
8 Losses {'ner': 14110.978977087885}
9 Losses {'ner': 13620.871316948906}
10 Losses {'ner': 13251.155605793}
11 Losses {'ner': 13107.195569038391}
12 Losses {'ner': 12700.933847650595}
13 Losses {'ner': 12408.254685401917}
14 Losses {'ner': 12440.346844702959}
15 Losses {'ner': 11924.934330686927}
16 Losses {'ner': 11845.36424190551}
17 Losses {'ner': 11593.078968308866}
18 Losses {'ner': 11534.154551910433}
19 Losses {'ner': 11579.725727168552}
20 Losses {'ner': 11105.73508259654}
21 Losses {'ner': 11059.292191986926}
22 Losses {'ner': 1077

In [None]:
ner_model_negative = get_model('negative', negative_tweet_tr, more_iters=50)

Created blank 'en' model
('happpppy mothers day', {'entities': [(0, 20, 'selected_text')]})
0 Losses {'ner': 17817.433363507796}
1 Losses {'ner': 17753.405313515163}
2 Losses {'ner': 16018.647884458303}
3 Losses {'ner': 15733.070828437805}
4 Losses {'ner': 14729.110100502527}
5 Losses {'ner': 14185.262874782085}
6 Losses {'ner': 13574.499960422516}
7 Losses {'ner': 13096.708367347717}
8 Losses {'ner': 12478.775512218475}
9 Losses {'ner': 12319.517817020416}


In [28]:
negative_tweet_tr = []
for i in negative_tweet_train:
    if i[0] != '':
        negative_tweet_tr.append(i)

### Test model

In [120]:
test_dataset = pd.read_csv("test.csv")

In [68]:
test_dataset["text"] = test_dataset["text"].apply(lambda x: x.strip())
test_dataset['n_text_words'] = test_dataset['text'].apply(lambda text: len(str(text).split()))

In [108]:
pre_list = []
for i in range(test_dataset.shape[0]):
    t_data = test_dataset.iloc[i]
    if t_data.sentiment == 'neutral' or t_data.n_text_words <= 3:
        pre_list.append(t_data.text)
    elif t_data.sentiment == 'positive':
        pre_list.append(test_model(ner_model_positive, t_data.text))
    else:
        pre_list.append(test_model(ner_model_negative, t_data.text))

### Create submission file

In [109]:
submission = pd.read_csv("./sample_submission.csv")

In [111]:
submission['selected_text'] = pre_list

In [112]:
submission.head(10)

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,good
2,eee518ae67,recession
3,01082688c6,happy bday
4,33987a8ee5,i like it
5,726e501993,thats great
6,261932614e,hates
7,afa11da83f,blocked
8,e64208b4ef,and within a short time of the last clue all o...
9,37bcad24ca,what did you get my day is alright havent don...


In [113]:
submission.to_csv("./submision_final.csv")