In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
cd kaggle/

## spaCy custom pipeline

In [5]:
import spacy
import random
import re
import pandas as pd
from spacy.pipeline.textcat import Config, single_label_cnn_config, single_label_bow_config, single_label_default_config
from spacy.training.example import Example
from spacy.util import minibatch
from sklearn.model_selection import train_test_split

In [6]:
# Select the GPU for spaCy before any pipeline is created.
# NOTE(review): per the spaCy docs, require_gpu() raises when no GPU is
# available, which would make the preceding prefer_gpu() call redundant.
# Confirm whether a CPU fallback is intended (keep only prefer_gpu) or a GPU
# is mandatory (keep only require_gpu).
spacy.prefer_gpu()
spacy.require_gpu()

In [7]:
def clean_text(text):
    """Strip mentions, URLs, a leading "rt" marker and punctuation from ``text``.

    The alternatives are tried left to right at each position:

    * ``@[A-Za-z0-9]+``      -- an ``@mention`` (the ``@`` and the name).
    * ``[^0-9A-Za-z \\t]``    -- any character that is not an ASCII letter,
      digit, space or tab.
    * ``\\w+:\\/\\/\\S+``       -- a ``scheme://...`` URL up to the next whitespace.
    * ``^rt``                -- a lowercase "rt" at the very start of the text.
    * ``http.+?``            -- a leftover "http" plus the character after it.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Whitespace is left as-is, so removed tokens can
        leave double or leading spaces behind.
    """
    # The mention pattern previously read ``@\[A-Za-z0-9]+``: the escaped
    # bracket turned the character class into literal text, so "@user" only
    # lost its "@" and the user name stayed in the output.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    return re.sub(pattern, "", text)

In [8]:
# Load the sentiment CSV without treating any row as a header, then name the
# two columns manually.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; use
# `on_bad_lines` and fall back to the legacy keyword only on older pandas.
# 'warn' keeps malformed rows visible in the output instead of dropping them
# silently.
csv_path = 'input/imdbfinal/imdbsentiment.csv'
try:
    df = pd.read_csv(csv_path, header=None, on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`.
    df = pd.read_csv(csv_path, header=None, error_bad_lines=False)
df.columns = ['text', 'sentiment']
df['sentiment'].value_counts()

In [9]:
# Hold out 10% of the rows for testing, stratified by sentiment label. A fixed
# random_state makes the split reproducible across kernel restarts.
# NOTE: `df` is rebound to the training portion, so re-running this cell alone
# splits the already-reduced frame again; re-run the loading cell first.
df, test = train_test_split(
    df,
    train_size=0.9,
    shuffle=True,
    stratify=df['sentiment'],
    random_state=1,
)

In [10]:
# Review texts as an array, plus one annotation dict per row in which each
# label flag is True only when it equals the row's sentiment value.
train_texts = df['text'].values
train_labels = [
    {'cats': {'positive': sentiment == 'positive', 'negative': sentiment == 'negative'}}
    for sentiment in df['sentiment']
]

In [11]:
# Pair every text with its annotation dict; the trailing expression displays
# the number of training samples.
train_data = [pair for pair in zip(train_texts, train_labels)]
len(train_data)

In [12]:
# Start from a blank English pipeline and append a `textcat` component built
# from the `single_label_bow_config` preset.
nlp = spacy.blank("en")
config = Config().from_str(single_label_bow_config)
text_cat = nlp.add_pipe('textcat', config=config, last=True)
# Register the two sentiment classes, in the same order as the annotations.
for sentiment_label in ("positive", "negative"):
    text_cat.add_label(sentiment_label)

In [13]:
%%time

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(25):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        texts, annotations = zip(*batch)
            
        example = []
        # Update the model with iterating each text
        for i in range(len(texts)):
            doc = nlp.make_doc(texts[i])
            example.append(Example.from_dict(doc, annotations[i]))
            
            # Update the model
        nlp.update(example, drop=0.5, losses=losses)
    print(losses)

In [14]:
# Serialize the pipeline to disk; the path is relative to the current working directory.
nlp.to_disk('working/sentiment_model')

In [15]:
# Reload the saved pipeline from disk under a separate name, leaving `nlp` untouched.
text_nlp = spacy.load("working/sentiment_model")

In [17]:
# Held-out review texts, first as a pandas Series and then as a plain Python list.
test_texts = test['text']
test_te = [review for review in test_texts]

In [18]:
# Read a review to classify. input() needs an interactive front end; batch
# runs (e.g. "Save & Run All") provide no stdin, so fall back to a held-out
# test review instead of aborting the notebook.
try:
    input_text = input()
except (EOFError, NotImplementedError):
    # IPython's StdinNotImplementedError subclasses NotImplementedError.
    input_text = test_te[0]
doc = text_nlp(input_text)

In [19]:
# Display the `cats` attribute of the processed input, presumably the
# per-label scores produced by the textcat component.
doc.cats

In [None]:
# Recursively zip the saved model directory into working/sentiment_model.zip (shell command).
!zip -r working/sentiment_model.zip working/sentiment_model/