In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
from fastai.text import *

In [None]:
from nltk.corpus import stopwords

# Stop-word sets keyed by language, filled on first use so that
# `stopwords.words` is called once instead of once per row.
_STOPWORD_CACHE = {}


def _default_stopwords(language='english'):
    """Return NLTK's stop words for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(input_text, stopwords_list=None, whitelist=("n't", "not", "no")):
    """Drop stop words and single-character tokens from a whitespace-split text.

    Args:
        input_text: String to clean. It is split on whitespace; matching is
            case-sensitive and done on whole tokens.
        stopwords_list: Optional iterable of words to remove. When omitted,
            NLTK's English stop-word list is used.
        whitelist: Words that are kept even if they are stop words, because
            they may carry sentiment (negations by default).

    Returns:
        The surviving tokens joined by single spaces; an empty string if
        nothing survives.
    """
    if stopwords_list is None:
        blocked = _default_stopwords()
    else:
        blocked = frozenset(stopwords_list)
    always_keep = frozenset(whitelist)

    # Set membership keeps the per-token check O(1); the original scanned a
    # list of stop words for every token.
    clean_words = [
        word
        for word in input_text.split()
        if len(word) > 1 and (word not in blocked or word in always_keep)
    ]
    return " ".join(clean_words)

In [None]:
# Load the labelled training set.
train = pd.read_csv("../input/train.csv")

# Replace every non-letter character with a space. `regex=True` is passed
# explicitly: the pattern is a character class, and pandas 2.0 changed the
# default of `Series.str.replace` to literal matching, which would silently
# leave the text untouched.
train['text'] = train['text'].str.replace("[^a-zA-Z]", " ", regex=True)

# Drop stop words and single-character tokens.
train['text'] = train['text'].apply(remove_stopwords)

train.head()

In [None]:
import seaborn as sns
# Bar chart of how many training rows each sentiment label has. `x` is passed
# by keyword because seaborn >= 0.12 no longer accepts the data variable
# positionally (the first positional slot is now `data`).
sns.countplot(x="sentiment", data=train)


In [None]:
from collections import Counter
# Exact number of rows per sentiment label.
Counter(train["sentiment"])

In [None]:
# Oversample label 0: each such row is repeated 5 times, every other row once.
# NOTE(review): this cell is not idempotent (re-running it multiplies the rows
# again), and because the duplication happens before the later
# `split_by_rand_pct` calls, copies of one text can land in both the training
# and validation parts.
reps0 = [1 if label != 0 else 5 for label in train["sentiment"]]
repeated_index_0 = np.repeat(train.index.values, reps0)
train = train.loc[repeated_index_0]


In [None]:
# The repeated rows share index labels; renumber so every row has a unique
# label again before the next label-based selection.
train = train.reset_index(drop=True)

In [None]:
# Oversample label 1: each such row is repeated 4 times, every other row once.
# NOTE(review): like the label-0 step, re-running this cell repeats the rows
# again.
reps1 = [1 if label != 1 else 4 for label in train["sentiment"]]
repeated_index_1 = np.repeat(train.index.values, reps1)
train = train.loc[repeated_index_1]

In [None]:
# Renumber the rows of the oversampled frame and display it.
train = train.reset_index(drop=True)
train

In [None]:
# Label counts after the oversampling steps, as numbers and as a bar chart.
print(Counter(train.sentiment))

# `x` is passed by keyword because seaborn >= 0.12 no longer accepts the data
# variable positionally (the first positional slot is now `data`).
sns.countplot(x="sentiment", data=train)

In [None]:
# Load the unlabelled test set and clean it the same way as the training text.
test = pd.read_csv("../input/test.csv")

# Replace every non-letter character with a space. `regex=True` is passed
# explicitly: the pattern is a character class, and pandas 2.0 changed the
# default of `Series.str.replace` to literal matching, which would silently
# leave the text untouched.
test['text'] = test['text'].str.replace("[^a-zA-Z]", " ", regex=True)

# Drop stop words and single-character tokens.
test['text'] = test['text'].apply(remove_stopwords)

# Row identifiers, kept for the submission file.
test_id = test['unique_hash']

test.head()

In [None]:
# Build the language-model databunch from the training text, one step at a time:
# wrap the `text` column, hold out a random 20%, label for language modelling,
# then batch with batch size 48.
lm_items = TextList.from_df(train, cols='text')
lm_split = lm_items.split_by_rand_pct(0.2)
lm_labelled = lm_split.label_for_lm()
data = lm_labelled.databunch(bs=48)

data.show_batch()

**Fit the deep learning model with domain-specific data**

In [None]:
# Training schedule shared by both fitting stages below.
LM_EPOCHS = 5
LM_STAGE1_LR = 1e-2
LM_STAGE2_MAX_LR = 2e-3

# AWD-LSTM language-model learner on the text databunch.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.3)

# Learning-rate range test and its plot; the rates above are fixed by hand
# rather than read from the finder programmatically.
learn.lr_find()
learn.recorder.plot()

# Stage 1: fit before unfreezing.
learn.fit_one_cycle(LM_EPOCHS, LM_STAGE1_LR, moms=(0.8,0.7))

# Stage 2: unfreeze and fit again with a range of learning rates, from
# one hundredth of the maximum up to the maximum.
learn.unfreeze()
learn.fit_one_cycle(LM_EPOCHS, slice(LM_STAGE2_MAX_LR/100, LM_STAGE2_MAX_LR))

# Persist the encoder so the classifier can load it.
learn.save_encoder('fine_tuned_enc')

**Re-fit model with classification label**

In [None]:
# Column(s) holding the target label; reused when writing the submission.
label_cols = ['sentiment']

# Test texts, using the language model's vocabulary.
test_datalist = TextList.from_df(test, cols='text', vocab=data.vocab)

# Classifier databunch, built step by step: training texts with the same
# vocabulary, a random 20% hold-out, labels from `label_cols` restricted to
# classes 0/1/2, the test texts attached, batch size 32.
clas_items = TextList.from_df(train, cols='text', vocab=data.vocab)
clas_split = clas_items.split_by_rand_pct(0.2)
clas_labelled = clas_split.label_from_df(cols= label_cols, classes=[0, 1, 2])
clas_with_test = clas_labelled.add_test(test_datalist)
data_clas = clas_with_test.databunch(bs=32)

data_clas.show_batch()

In [None]:

# Training schedule shared by both fitting stages below.
CLF_EPOCHS = 5
CLF_STAGE1_LR = 1e-2
CLF_STAGE2_MAX_LR = 2e-3

# AWD-LSTM text classifier on the labelled databunch.
learn_classifier = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)

# Reuse the encoder saved after language-model fine-tuning, then freeze.
learn_classifier.load_encoder('fine_tuned_enc')
learn_classifier.freeze()

# Learning-rate range test and its plot; the rates above are fixed by hand
# rather than read from the finder programmatically.
learn_classifier.lr_find()
learn_classifier.recorder.plot()

# Stage 1: fit while frozen.
learn_classifier.fit_one_cycle(CLF_EPOCHS, CLF_STAGE1_LR, moms=(0.8,0.7))

# Stage 2: unfreeze and fit again with a range of learning rates, from
# one hundredth of the maximum up to the maximum.
learn_classifier.unfreeze()
learn_classifier.fit_one_cycle(CLF_EPOCHS, slice(CLF_STAGE2_MAX_LR/100, CLF_STAGE2_MAX_LR))

learn_classifier.show_results()

**Get predictions**

In [None]:

# Predictions for the test set.
# NOTE(review): the submission pairs row i of `preds` with row i of `test_id`,
# so it relies on `ordered=True` returning rows in the original test order.
preds, target = learn_classifier.get_preds(DatasetType.Test, ordered=True)

# `preds` is treated as a torch tensor below (`.numpy()`), so take the arg-max
# with the tensor's own method instead of routing it through `np.argmax`.
predictions = preds.argmax(dim=1)

# Build the frame in one step. Unlike concatenating two frames along axis 1,
# this raises if the number of predictions differs from the number of ids
# instead of silently padding with NaN.
submission = pd.DataFrame({
    'unique_hash': test_id,
    label_cols[0]: predictions.numpy(),
})

submission.to_csv('submission.csv', index=False)
submission.tail()

In [None]:
## To download the submission file without committing the kernel.
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "subm.csv"):
    """Return an HTML link that downloads `df` as a CSV file.

    The frame is serialised with `to_csv(index=False)`, base64-encoded and
    embedded directly in the link's `href` as a `data:text/csv` URI.

    Args:
        df: Object providing `to_csv(index=False)` that returns a string.
        title: Text shown for the link.
        filename: Value of the anchor's `download` attribute.

    Returns:
        An `HTML` display object wrapping the anchor element.
    """
    csv_text = df.to_csv(index=False)
    encoded_csv = base64.b64encode(csv_text.encode()).decode()
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    anchor = anchor_template.format(payload=encoded_csv, title=title, filename=filename)
    return HTML(anchor)

# Render the download link for the submission frame as this cell's output.
download_link = create_download_link(submission)
download_link