In [7]:
import pandas as pd
# Load the training set; "train.csv" is resolved relative to the working directory.
train = pd.read_csv("train.csv")

In [8]:
# Preview the first five rows to check which columns were loaded.
train.head(5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## statistics 
### Explore number of sentiments

In [9]:
# NOTE(review): plt is imported here but no visible cell plots anything — confirm it is still needed.
import matplotlib.pyplot as plt

# Number of rows per sentiment label.
train.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

### Clean the text: remove URLs and stop words, then stem the remaining words

In [10]:
import re
import string
def clean_text(text):
    '''Normalise a raw tweet before tokenisation.

    The text is lower-cased and then stripped, in this order, of
    square-bracketed spans, URLs, HTML-like tags, punctuation, newlines and
    every word that contains a digit.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text instead of turning into the literal words
            "none" / "nan".

    Returns:
        The cleaned string. Whitespace is not collapsed, so a removed span can
        leave consecutive spaces behind.
    '''
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [11]:
# Normalise the full tweet and the labelled span with the same cleaning function.
train["text"] = train["text"].apply(clean_text)
train['selected_text'] = train['selected_text'].apply(clean_text)

In [6]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# Fetch the NLTK data packages requested by this cell.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# word_tokenize accepts a string as an input, not a file.
stop_words = set(stopwords.words('english'))

def remove_stopwords(words):
    """Drop every token of ``words`` that appears in ``stop_words``.

    The string is split with ``word_tokenize``; the surviving tokens are
    joined back together with single spaces.
    """
    kept = [token for token in word_tokenize(words) if token not in stop_words]
    return ' '.join(kept)

# Strip stop words from both text columns.
train["text"] = train["text"].apply(remove_stopwords)
train['selected_text'] = train['selected_text'].apply(remove_stopwords)

[nltk_data] Error loading stopwords: <urlopen error [Errno 110]
[nltk_data]     Connection timed out>


KeyboardInterrupt: 

In [12]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
# Single stemmer instance shared by every stem_word call.
ps = PorterStemmer()

def stem_word(words):
    """Stem each token of ``words`` with the shared stemmer ``ps``.

    The string is split with ``word_tokenize`` and the stemmed tokens are
    joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in word_tokenize(words))
# Stem both text columns.
train["text"] = train["text"].apply(stem_word)
train['selected_text'] = train['selected_text'].apply(stem_word)

In [13]:
from collections import Counter
# Token frequencies over the second column (positional index 1) of every row.
cnt = Counter()
for tweet in train.iloc[:, 1]:
    for token in word_tokenize(tweet):
        cnt[token] += 1

In [14]:
# Show the twenty most frequent tokens and their counts.
cnt.most_common(20)

[('im', 3023),
 ('day', 2368),
 ('go', 2363),
 ('get', 1908),
 ('good', 1569),
 ('work', 1483),
 ('love', 1456),
 ('like', 1454),
 ('got', 1238),
 ('dont', 1200),
 ('today', 1114),
 ('time', 1078),
 ('one', 1055),
 ('cant', 1020),
 ('happi', 993),
 ('want', 981),
 ('know', 967),
 ('miss', 961),
 ('thank', 961),
 ('lol', 950)]

Several of the most common words in the dataset express sentiment, e.g. love/like, good, don't, and miss.

In [14]:
# One sub-frame per sentiment label.
Positive_sent = train.loc[train.sentiment == 'positive']
Negative_sent = train.loc[train.sentiment == 'negative']
Neutral_sent = train.loc[train.sentiment == 'neutral']

In [16]:
from collections import Counter
# Token frequencies over the third column (positional index 2) of the positive rows.
cnt = Counter()
for phrase in Positive_sent.iloc[:, 2]:
    for token in word_tokenize(phrase):
        cnt[token] += 1
cnt.most_common(10)

[('love', 910),
 ('good', 832),
 ('happi', 739),
 ('thank', 633),
 ('day', 471),
 ('great', 364),
 ('hope', 335),
 ('fun', 287),
 ('nice', 270),
 ('mother', 269)]

In [17]:
from collections import Counter
# Token frequencies over the third column (positional index 2) of the negative rows.
cnt = Counter()
for phrase in Negative_sent.iloc[:, 2]:
    for token in word_tokenize(phrase):
        cnt[token] += 1
cnt.most_common(10)

[('miss', 550),
 ('im', 452),
 ('sad', 352),
 ('sorri', 302),
 ('hate', 273),
 ('bad', 246),
 ('feel', 242),
 ('dont', 221),
 ('suck', 217),
 ('cant', 201)]

In [18]:
from collections import Counter
# Token frequencies over the third column (positional index 2) of the neutral rows.
cnt = Counter()
for phrase in Neutral_sent.iloc[:, 2]:
    for token in word_tokenize(phrase):
        cnt[token] += 1
cnt.most_common(10)

[('go', 1041),
 ('im', 1040),
 ('get', 806),
 ('work', 633),
 ('day', 622),
 ('got', 526),
 ('dont', 482),
 ('like', 470),
 ('time', 455),
 ('want', 441)]

### Use spacy to create NER

In [15]:
def get_train_datas(data):
    """Build NER training examples from tweets and their selected spans.

    Args:
        data: Object exposing ``text`` and ``selected_text`` attributes that
            are equal-length iterables of strings (for example a DataFrame
            with those two columns).

    Returns:
        A list of ``(text, {"entities": [(start, end, "selected_text")]})``
        tuples, where ``start`` and ``end`` are the character offsets of the
        first occurrence of the selected text inside the text. Rows whose
        selected text is empty or does not occur in the text are skipped,
        because they cannot be expressed as a valid, non-empty span.
    """
    train_datas = []
    for selected_text, text in zip(data.selected_text, data.text):
        start = text.find(selected_text)
        # find() returns -1 on a miss, which would produce the bogus span
        # (-1, len - 1); an empty selection would produce a zero-length span.
        if start < 0 or not selected_text:
            continue
        end = start + len(selected_text)
        train_datas.append((text, {"entities": [(start, end, "selected_text")]}))
    return train_datas

In [16]:
# Build the NER training examples for the positive and the negative tweets.
positive_tweet, negative_tweet = (
    get_train_datas(subset) for subset in (Positive_sent, Negative_sent)
)

In [17]:
def load_model(pre_model = None, label = None):
    """Return a spaCy pipeline that contains an NER component.

    Args:
        pre_model: Name or path passed to ``spacy.load``. When ``None`` a blank
            English pipeline is created instead.
        label: Optional entity label to register on the NER component. It is
            applied to loaded pipelines as well as to blank ones.

    Returns:
        The loaded or newly created pipeline, guaranteed to have an "ner" pipe.
    """
    if pre_model is not None:
        nlp = spacy.load(pre_model)
        print("Loaded model '%s'" % pre_model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # This setup runs for both branches: a loaded pipeline normally already has
    # an "ner" pipe, a blank one needs it created.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    if label is not None:
        ner.add_label(label)

    return nlp

In [18]:
def train_model(model, nlp, train_datas, n_iter = 30):
    """Train the pipeline's NER component on ``train_datas``.

    Args:
        model: ``None`` to start training from scratch (``begin_training``);
            any other value continues from the current weights
            (``resume_training``).
        nlp: The pipeline to update.
        train_datas: Sequence of ``(text, annotations)`` pairs. The sequence is
            copied before shuffling, so the caller's list keeps its order.
        n_iter: Number of passes over the training data.

    Returns:
        The same ``nlp`` object after training.
    """
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy instead of reordering the caller's data in place.
    examples = list(train_datas)

    with nlp.disable_pipes(*other_pipes):
        sizes = compounding(1.0, 64.0, 1.001)
        # Show one sample as a sanity check; guarded so empty data does not crash.
        if examples:
            print(examples[0])
        for itn in range(n_iter):
            random.shuffle(examples)
            batches = minibatch(examples, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print(itn, "Losses", losses)

    return nlp

In [19]:
import os
def test_model(ner_model, text):
    doc = ner_model(text)
    ent_array = []
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        new_int = [start, end, ent.label_]
        
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])
        
    return text[ent_array[0][0]:ent_array[0][1]] if len(ent_array) > 0 else text


def save_model(ner_model, output_dir = None, new_model_name = None):
    """Write ``ner_model`` to ``output_dir``; do nothing when no directory is given.

    Args:
        ner_model: Pipeline exposing a ``meta`` mapping and ``to_disk``.
        output_dir: Target directory. Missing directories, including parents,
            are created.
        new_model_name: Stored under ``meta["name"]`` before saving. When
            ``None`` the existing name is left untouched.
    """
    if output_dir is None:
        return
    output_dir = Path(output_dir)
    # parents=True so nested targets such as "./models/positive_ner/" work on
    # a fresh checkout; exist_ok=True makes re-saving to the same place safe.
    output_dir.mkdir(parents=True, exist_ok=True)
    if new_model_name is not None:
        ner_model.meta["name"] = new_model_name
    ner_model.to_disk(output_dir)
    print("Saved model to", output_dir)

In [20]:
def get_model(sentiment, train_datas, more_iters = 30):
    """Train, or load and optionally keep training, the NER model for a sentiment.

    Each sentiment has its own model directory. If that directory does not
    exist, a blank model is trained for 50 iterations and saved there.
    Otherwise the saved model is loaded and, when ``more_iters`` is positive,
    trained for ``more_iters`` further iterations and saved back to the same
    directory.

    Args:
        sentiment: ``'positive'`` selects the positive model; any other value
            selects the negative model.
        train_datas: Training examples forwarded to ``train_model``.
        more_iters: Extra training iterations for an already saved model;
            ``0`` or less loads the model without further training.

    Returns:
        The trained or loaded pipeline.
    """
    if sentiment == 'positive':
        model_path, model_name = "./models/positive_ner/", "posi_model"
    else:
        model_path, model_name = "./models/negative_ner/", "nega_model"

    if not os.path.exists(model_path):
        nlp = load_model(label = 'selected_text')
        ner_model = train_model(None, nlp, train_datas, n_iter=50)
        # Make sure the full directory chain exists before saving.
        os.makedirs(model_path, exist_ok=True)
        save_model(ner_model, output_dir = model_path, new_model_name = model_name)
        return ner_model

    ner_model = load_model(model_path)
    if more_iters > 0:
        ner_model = train_model(model_path, ner_model, train_datas, more_iters)
        # Save next to the model that was loaded so the next run picks it up.
        save_model(ner_model, output_dir = model_path, new_model_name = model_name)
    return ner_model

In [24]:
!pip install spacy

[31mERROR: Could not find a version that satisfies the requirement spacy (from versions: none)[0m
[31mERROR: No matching distribution found for spacy[0m


In [22]:
# NOTE(review): the recorded run of this cell failed with "SyntaxError: from
# __future__ imports must occur at the beginning of the file"; the __future__
# import has to stay the first statement of whatever unit this code runs in.
from __future__ import unicode_literals, print_function
import spacy
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
# Train, or load and continue training, the NER model for the positive tweets.
ner_model_positive = get_model('positive', positive_tweet, more_iters=50)

SyntaxError: from __future__ imports must occur at the beginning of the file (cell_name, line 5)