In [None]:
from fastai import *
from fastai.text import *
from pathlib import Path
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score


import torch
# Report whether CUDA is usable and which PyTorch build is installed.
# `torch.cuda.is_available()` already returns a bool, so it is tested directly
# rather than compared with `is True`.
print("Cuda available" if torch.cuda.is_available() else "CPU")
print("PyTorch version: ", torch.__version__)

In [None]:
# Read only the head of each CSV: 100 training rows and 10 test rows.
# The generator is consumed left to right, so train is read before test.
train_df, test_df = (
    pd.read_csv(csv_path, nrows=row_cap)
    for csv_path, row_cap in (('../input/train.csv', 100), ('../input/test.csv', 10))
)

In [None]:
def clean_text(text):
    """Normalise a Series of raw comments into lower-cased, single-spaced text.

    The substitutions below are applied in order after lower-casing; the
    result is whitespace-collapsed and stripped.

    Every substitution is applied with ``regex=True`` explicitly. Several of
    the patterns are regular expressions (``\\?+``, ``\\s+``, character
    classes, ...); without the explicit flag they are only interpreted as
    regexes on pandas versions whose ``Series.str.replace`` defaults to
    ``regex=True``, and are matched as literal text otherwise. The patterns
    are also written as raw strings so that backslashes reach the regex
    engine unchanged.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (anything exposing the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        The cleaned strings, aligned with the index of ``text``.
    """
    # Order matters: specific contractions ("won't", "can't", "it's") must be
    # expanded before the generic suffix rules ("'s", "n't", "'t"), the ":)"
    # emoticon must be rewritten before ':' and ')' are stripped as
    # punctuation, and "sh*tty" must be handled before '*' is removed.
    replacements = [
        (r'\?+', ' ?'),            # collapse runs of '?' into one spaced token
        (r'!+', ' !'),             # same for '!'
        ('#', '# '),
        ('@', '@ '),
        (r':\)', '>'),
        ("won't", 'will not'),
        ("can't", 'can not'),
        ("it's", 'it is'),
        ("that's", 'that is'),
        ("'s", ''),
        ("n't", ' not'),
        ("'re", ' are'),
        ("'d", ' would'),
        ("'ll", ' will'),
        ("'t", ' not'),
        ("'ve", ' have'),
        ("'m", ' am'),
        (r'sh\*tty', 'shit'),
        (r'[\'":();,.\-—/_]', ' '),  # remaining punctuation becomes a space
        (r'(ha|hha|hhha)+', 'ha'),   # "hahaha" -> "ha"
        (r'\bur\b', 'you are'),
        (r'f+u+', 'fu'),
        (r'\*', ''),
        (r'%', ' %'),
        (' iv ', ' 4 '),
        (' cc ', ' civil comments '),
        (' ww ', ' willamette week '),
        (r'\$+', '$ '),
        ('&', ' and '),
        (' os x ', ' osx '),
        (r'\s+', ' '),               # collapse whitespace last
    ]

    cleaned = text.str.lower()
    for pattern, replacement in replacements:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned.str.strip()

In [None]:
# Add a cleaned copy of the raw comment text to each frame, printing a progress
# message after each one (train first, then test).
for frame, done_msg in ((train_df, 'train is done'), (test_df, 'test is done')):
    frame['comment_text_clean'] = clean_text(frame['comment_text'])
    print(done_msg)

In [None]:
# Row counts of the training and test frames, shown as a (train, test) tuple.
train_df.shape[0], test_df.shape[0]

In [None]:
# Boolean label: True only when a row has at least two identity annotators
# AND a `target` score of at least 0.4.
has_two_annotators = train_df['identity_annotator_count'] >= 2
target_at_least_04 = train_df['target'] >= 0.4
train_df['target_round'] = has_two_annotators & target_at_least_04

In [None]:
# Hold out 10% of the rows for validation, stratified on the boolean label so
# both splits keep the same class balance.
# `random_state` is fixed so the split (and every metric computed on it) is
# reproducible across kernel restarts.
# NOTE: this cell rebinds `train_df`; re-running it without reloading the data
# splits the already-reduced training frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df['target_round'],
    random_state=42,
)

In [None]:
# Language-model data bunch built from the cleaned comment text of the
# train / validation / test frames.
data_lm = TextLMDataBunch.from_df(
    path='', min_freq=3,
    text_cols=['comment_text_clean'], label_cols=['target_round'],
    train_df=train_df, valid_df=val_df, test_df=test_df,
)

In [None]:
# AWD-LSTM language-model learner over the LM data bunch, with the dropout
# multiplier set to 0.8.
learn = language_model_learner(
    data_lm,
    arch=AWD_LSTM,
    drop_mult=0.8,
)

In [None]:
# Sweep learning rates from 1e-6 up to 1e2, then plot the recorded curve so a
# rate can be picked for the next cell.
lr_sweep = dict(start_lr=1e-6, end_lr=1e2)
learn.lr_find(**lr_sweep)
learn.recorder.plot()

In [None]:
# Three one-cycle epochs with a peak learning rate of 0.1, run before the
# `unfreeze()` call in the next cell.
learn.fit_one_cycle(
    cyc_len=3,
    max_lr=1e-01,
)

In [None]:
# Unfreeze, then run ten one-cycle epochs at a lower peak learning rate
# (1e-3) with momentum values (0.8, 0.7).
learn.unfreeze()
learn.fit_one_cycle(
    cyc_len=10,
    max_lr=1e-3,
    moms=(0.8, 0.7),
)

In [None]:
# Save the fine-tuned encoder under the name 'ft_enc'; the classifier loads it
# back by that name further down.
learn.save_encoder(
    'ft_enc'
)

In [None]:
# Classification data bunch over the same three frames. It reuses the
# language-model vocabulary (`data_lm.train_ds.vocab`).
data_class = TextClasDataBunch.from_df(
    path='', min_freq=3,
    vocab=data_lm.train_ds.vocab,
    text_cols=['comment_text_clean'], label_cols=['target_round'],
    train_df=train_df, valid_df=val_df, test_df=test_df,
)

In [None]:
# Build the AWD-LSTM classifier (rebinding `learn`), load the encoder saved as
# 'ft_enc', and call `freeze()` before the first training pass.
learn = text_classifier_learner(
    data_class,
    arch=AWD_LSTM,
    drop_mult=0.8,
)
learn.load_encoder('ft_enc')
learn.freeze()

In [None]:
# Learning-rate sweep for the classifier, from 1e-8 up to 1e2, followed by a
# plot of the recorded curve.
lr_sweep = dict(start_lr=1e-8, end_lr=1e2)
learn.lr_find(**lr_sweep)
learn.recorder.plot()

In [None]:
# Three one-cycle epochs at a peak learning rate of 1e-5, run straight after
# the `freeze()` call above.
learn.fit_one_cycle(
    cyc_len=3,
    max_lr=1e-005,
)

In [None]:
# Validation-set predictions and targets after the first classifier stage.
oof = learn.get_preds(ds_type=DatasetType.Valid)
o, l = oof[0], oof[1]

# (accuracy at a 0.5 cut-off, ROC AUC), both computed from column 1 of `o`.
(
    accuracy_score(l, o[:, 1] > 0.5),
    roc_auc_score(l, o[:, 1]),
)

In [None]:
# Call `freeze_to(-2)`, then run three one-cycle epochs with the learning
# rate given as the slice 1e-4 .. 1e-2.
learn.freeze_to(-2)
learn.fit_one_cycle(
    cyc_len=3,
    max_lr=slice(1e-4, 1e-2),
)

In [None]:
# Validation-set predictions and targets after the `freeze_to(-2)` stage.
oof = learn.get_preds(ds_type=DatasetType.Valid)
o, l = oof[0], oof[1]

# (accuracy at a 0.5 cut-off, ROC AUC), both computed from column 1 of `o`.
(
    accuracy_score(l, o[:, 1] > 0.5),
    roc_auc_score(l, o[:, 1]),
)

In [None]:
# Call `freeze_to(-3)`, then run three more one-cycle epochs with the
# learning rate given as the slice 1e-5 .. 5e-3.
learn.freeze_to(-3)
learn.fit_one_cycle(
    cyc_len=3,
    max_lr=slice(1e-5, 5e-3),
)

In [None]:
# Validation-set predictions and targets after the `freeze_to(-3)` stage.
oof = learn.get_preds(ds_type=DatasetType.Valid)
o, l = oof[0], oof[1]

# (accuracy at a 0.5 cut-off, ROC AUC), both computed from column 1 of `o`.
(
    accuracy_score(l, o[:, 1] > 0.5),
    roc_auc_score(l, o[:, 1]),
)

In [None]:
# Unfreeze, then run ten one-cycle epochs with the learning rate given as the
# slice 1e-5 .. 1e-3.
learn.unfreeze()
learn.fit_one_cycle(
    cyc_len=10,
    max_lr=slice(1e-5, 1e-3),
)

In [None]:
# Validation-set predictions and targets after the final `unfreeze()` stage.
oof = learn.get_preds(ds_type=DatasetType.Valid)
o, l = oof[0], oof[1]

# (accuracy at a 0.5 cut-off, ROC AUC), both computed from column 1 of `o`.
(
    accuracy_score(l, o[:, 1] > 0.5),
    roc_auc_score(l, o[:, 1]),
)

In [None]:
# Predictions for the test set.
# NOTE(review): `ordered=True` presumably returns rows in the original
# `test_df` order, which the column assignment below depends on; confirm
# against the fastai documentation.
preds = learn.get_preds(
    ds_type=DatasetType.Test,
    ordered=True,
)

In [None]:
# First element of `preds` holds the model outputs; keep only column 1 of it.
test_outputs = preds[0]
p = test_outputs[:, 1]

In [None]:
# Store the scores on the test frame. `p` is sliced from the learner's output,
# so it is converted to a NumPy array explicitly: the column then gets a plain
# numeric dtype instead of depending on how pandas coerces a tensor.
# (assumes `p` is a CPU tensor or array-like; confirm if predictions can live
# on the GPU)
test_df['prediction'] = np.asarray(p)

In [None]:
# Order the test rows by ascending prediction and renumber them 0..n-1.
# Written as one chained expression that rebinds `test_df`, instead of two
# `inplace=True` mutations of the existing frame.
test_df = (
    test_df
    .sort_values('prediction')
    .reset_index(drop=True)
)

In [None]:
# Look at one row of the test frame: its cleaned text and its score.
# The index is clamped to the last available row because `test_df` is loaded
# with `nrows=10` at the top of the notebook, so a fixed index of 9993 would
# raise a KeyError. `.iloc` makes the lookup positional.
ii = min(9993, len(test_df) - 1)
print(test_df['comment_text_clean'].iloc[ii])
print(test_df['prediction'].iloc[ii])

In [None]:
# Show the raw comment whose index label is 4595. That label is missing when
# only a sample of the CSV is loaded, or when the row was moved to `val_df` by
# the split, so `.get` returns a placeholder instead of raising a KeyError.
train_df['comment_text'].get(4595, '<row 4595 is not in train_df>')

In [None]:
# Training rows with `target` above 0.005, reduced to the comment and its
# score, listed by ascending `target`.
train_df.loc[
    train_df['target'] > 0.005,
    ['comment_text', 'target'],
].sort_values('target')