In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the labelled tweets; the `id` column is dropped from the training frame.
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
del train_data['id']

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment and does not update the
# parent frame when pandas Copy-on-Write is active.
train_data['keyword']=train_data['keyword'].fillna('none')
train_data['location']=train_data['location'].fillna('none')

# The test frame keeps its `id` column (needed for the submission file).
test_data=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_data['keyword']=test_data['keyword'].fillna('none')
test_data['location']=test_data['location'].fillna('none')

# Shuffle the rows; seeded so that a fresh "Restart & Run All" reproduces the
# same order (and therefore the same downstream split).
train_data=train_data.sample(frac=1,random_state=71)
train_data.head()

In [None]:
def refine_text(text):
    """Normalise a raw string for modelling.

    Steps, in order: remove every '#', lowercase, remove anything that
    starts with 'http' up to the next whitespace, then drop every character
    that is not a-z or whitespace.
    """
    no_hashes=re.sub(r'#','',text)
    lowered=no_hashes.lower()
    no_urls=re.sub(r'http\S+',r'',lowered)
    letters_only=re.sub(r'[^a-z\s]',r'',no_urls)
    return letters_only

# Run the same normaliser over every free-text column of both frames.
for col in ['keyword','location','text']:
    for frame in (train_data,test_data):
        frame[col]=frame[col].apply(refine_text)

train_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation (fixed seed).
# Note: this rebinds `train_data` to the remaining 90%.
train_data,val_data=train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    random_state=71,
)
(len(train_data),len(val_data))

In [None]:
import spacy
# Only `spacy_nlp.tokenizer` is used in the code visible here, so a blank
# English pipeline is sufficient. spacy.load('en') depends on the 'en'
# shortcut link, which no longer exists in spaCy v3; spacy.blank('en') needs
# no downloaded model package.
spacy_nlp=spacy.blank('en')

from nltk.corpus import stopwords
# English stop words; `tokenizer` filters these out of its result.
stopword_list=stopwords.words('english')

def tokenizer(text, MAX_LEN=20000):
    """Split `text` into tokens with the spaCy tokenizer.

    Listed punctuation characters are replaced by spaces, runs of spaces are
    collapsed, and the string is cut to at most MAX_LEN characters before
    tokenising. Single-space tokens and entries of `stopword_list` are
    dropped from the returned list of token strings.
    """
    spaced=re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]"," ",text)
    cleaned=re.sub(' +',' ',spaced)
    if len(cleaned)>MAX_LEN:
        cleaned=cleaned[:MAX_LEN]

    kept=[]
    for token in spacy_nlp.tokenizer(cleaned):
        word=token.text
        if word==' ' or word in stopword_list:
            continue
        kept.append(word)
    return kept

In [None]:
# https://nlp.stanford.edu/projects/glove/

from torchtext.vocab import Vectors,Vocab
from collections import Counter

# Pretrained 100-dimensional GloVe vectors.
gloveVectors=Vectors(name='/kaggle/input/glove6b/glove.6B.100d.txt')

# Token frequencies over text + keyword + location of every training row.
# The Counter has to be created here: nothing earlier in the notebook defines it.
counter=Counter()
for text,keyword,location in zip(train_data['text'],train_data['keyword'],train_data['location']):
    counter.update(tokenizer(text+' '+keyword+' '+location))

vocabulary=Vocab(counter,max_size=20000,min_freq=2,vectors=gloveVectors,specials=['<pad>','<unk>'])

print('Embedding vocab size: ', vocabulary.vectors.size(0))

In [None]:
import torchtext 

class ClassifyDataset(torchtext.data.Dataset):
    """torchtext dataset built from a DataFrame with a `text` column.

    When `train` is true each row must also provide `target`, which becomes
    the second value of the example; otherwise that value is None.
    """

    def __init__(self, df, fields, train=True, **kwargs):
        # One Example per row, in DataFrame order.
        examples=[
            torchtext.data.Example.fromlist(
                [row.text, row.target if train else None],
                fields,
            )
            for _, row in df.iterrows()
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(x):
        """Sort examples by the length of their `text` attribute."""
        return len(x.text)

    @classmethod
    def splits(cls, fields, train_df=None, val_df=None, test_df=None, **kwargs):
        """Build datasets for whichever frames are given.

        Returns a tuple holding, in train / val / test order, one dataset per
        frame that is not None. The test dataset is built with train=False.
        Each frame is copied before use.
        """
        built=[]

        if train_df is not None:
            built.append(cls(train_df.copy(), fields, **kwargs))

        if val_df is not None:
            built.append(cls(val_df.copy(), fields, **kwargs))

        if test_df is not None:
            built.append(cls(test_df.copy(), fields, train=False, **kwargs))

        return tuple(built)

In [None]:
import torch

# Pass the tokenizer by keyword: the first positional parameter of Field is
# `sequential`, so a positional callable would leave `tokenize` at its default
# and the datasets would not be tokenised the way `infer` tokenises its input.
Text=torchtext.data.Field(tokenize=tokenizer, include_lengths=True)
Label=torchtext.data.LabelField(dtype=torch.float)

fields= [('text', Text),('label', Label)]

train_ds, val_ds, test_ds= ClassifyDataset.splits(fields, train_df=train_data, val_df=val_data, test_df=test_data)

#sampling random example
print(vars(train_ds[61]), vars(val_ds[61]))

In [None]:
# Text vocabulary from the training split only (max_size=20000), with GloVe
# vectors attached; `unk_init` zero-fills vectors for tokens GloVe lacks.
Text.build_vocab(train_ds, max_size=20000, vectors=gloveVectors, unk_init=torch.Tensor.zero_)

# Label vocabulary from the training targets.
Label.build_vocab(train_ds)

In [None]:
batch_size=64

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device=torch.device('cuda')
else:
    device=torch.device('cpu')

# sort_within_batch=True orders each batch with the dataset's sort_key
# (text length); the model packs the padded sequences.
train_iterator, valid_iterator= torchtext.data.BucketIterator.splits(
    (train_ds, val_ds), batch_size=batch_size, sort_within_batch=True, device=device)

## Model - Simple LSTM

In [None]:
# --- optimisation ---
num_epochs = 20
lr = 0.001

# --- network shape ---
input_dims = len(Text.vocab)   # one embedding row per vocabulary entry
embedding_dims = 100           # matches the 100d GloVe file loaded above
hidden_dims = 256
output_dims = 1                # single logit for the binary target
n_layers = 2
bidirectional = True
drop = 0.2

# Index of the padding token, handed to the embedding layer.
pad_idx = Text.vocab.stoi[Text.pad_token]

In [None]:
def accuracy(preds, y):
    """Fraction of entries where the thresholded prediction equals `y`.

    `preds` are raw logits: they are passed through a sigmoid and rounded
    before comparison. Returns a tensor (sum of matches / len of matches).
    """
    hard_preds=torch.sigmoid(preds).round()
    hits=hard_preds.eq(y).float()
    return hits.sum()/len(hits)

class LSTMnn(torch.nn.Module):
    """LSTM text classifier that returns raw logits.

    Pipeline: embedding -> (bi)LSTM over packed sequences -> final hidden
    state of the last layer -> dropout -> fc1 -> dropout -> fc2.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dims: size of each embedding vector.
        hidden_dims: LSTM hidden size (also the width of fc1's output).
        output_dims: number of logits produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        pad_idx: vocabulary index of the padding token.
        dropout: dropout probability (between LSTM layers and in the head).
    """

    def __init__(self, vocab_size, embedding_dims, hidden_dims, output_dims, n_layers, bidirectional,pad_idx, dropout):
        super().__init__()
        self.bidirectional=bool(bidirectional)
        self.embeddings=torch.nn.Embedding(vocab_size, embedding_dims, padding_idx=pad_idx)
        self.lstm=torch.nn.LSTM(embedding_dims, hidden_dims,
                               num_layers=n_layers,
                               bidirectional=self.bidirectional,
                               # Inter-layer dropout has no effect with a single
                               # layer and only triggers a warning there.
                               dropout=dropout if n_layers>1 else 0.0)
        # fc1 consumes one final hidden state per direction.
        num_directions=2 if self.bidirectional else 1
        self.fc1=torch.nn.Linear(hidden_dims*num_directions, hidden_dims)
        self.fc2=torch.nn.Linear(hidden_dims, output_dims)
        self.dropout=torch.nn.Dropout(dropout)

    def forward(self, text, text_lengths, train=True):
        """Compute logits for a padded batch.

        Args:
            text: token ids, [seq_len, batch_size].
            text_lengths: true length of each sequence, [batch_size], sorted
                in decreasing order (pack_padded_sequence default).
            train: unused; kept so existing calls keep working. Dropout is
                controlled by model.train() / model.eval().

        Returns:
            Tensor of shape [batch_size, output_dims] with raw logits.
        """
        embedding=self.embeddings(text) #[seq_len, batch_size, emb_size]

        # pack_padded_sequence expects the lengths on the CPU even when the
        # batch itself lives on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths=text_lengths.cpu()
        packed_embeddings=torch.nn.utils.rnn.pack_padded_sequence(embedding, text_lengths)
        _,(hidden,_)=self.lstm(packed_embeddings)

        #hidden:[num_layers*num_dir, batch size, hidden dims]
        if self.bidirectional:
            # Last layer: forward state is hidden[-2], backward state is hidden[-1].
            hidden=torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden=hidden[-1,:,:]
        hidden=self.dropout(hidden)

        # Dropout regularises the hidden representation only; the logits from
        # fc2 are returned untouched so the loss never sees zeroed predictions.
        output=self.dropout(self.fc1(hidden))
        return self.fc2(output)
        
# Arguments in signature order: vocab_size, embedding_dims, hidden_dims,
# output_dims, n_layers, bidirectional, pad_idx, dropout.
model=LSTMnn(input_dims, embedding_dims, hidden_dims, output_dims,
             n_layers, bidirectional, pad_idx, drop)

# Initialise the embedding table from the vectors stored in the text vocabulary.
model.embeddings.weight.data.copy_(Text.vocab.vectors)

model.to(device)

# The model emits logits, so the sigmoid lives inside the loss.
criterion=torch.nn.BCEWithLogitsLoss()
optimizer=torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# for i in train_iterator:
#     model(i.text[0], i.text[1], train=True)
#     break
# torch.Size([1, 4]) tensor([4]) torch.Size([1])
# torch.Size([1, 4, 100]) torch.Size([1])

In [None]:
def train(model, iterator):
    """Run one optimisation pass over `iterator`.

    Uses the module-level `optimizer`, `criterion` and `accuracy`.
    Returns (mean batch loss, mean batch accuracy), each averaged over the
    number of batches.
    """
    model.train()

    running_loss=0
    running_acc=0

    for batch in iterator:
        tokens, lengths=batch.text

        optimizer.zero_grad()
        logits=model(tokens, lengths).squeeze(1)

        batch_loss=criterion(logits, batch.label)
        batch_acc=accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss+=batch_loss.item()
        running_acc+=batch_acc.item()

    n_batches=len(iterator)
    return (running_loss/n_batches, running_acc/n_batches)

def evaluate(model, iterator):
    """Return the mean batch accuracy of `model` over `iterator`.

    The model is switched to eval mode and gradients are disabled; uses the
    module-level `accuracy`.
    """
    model.eval()

    running_acc=0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths=batch.text
            logits=model(tokens, lengths).squeeze(1)
            running_acc+=accuracy(logits, batch.label).item()

    return running_acc/len(iterator)

In [None]:
import time
t = time.time()

# Per-epoch history, consumed by the plotting cell.
loss, acc, val_acc = [], [], []

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_iterator)
    valid_acc = evaluate(model, valid_iterator)

    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')

# Wall-clock time for the whole run, in seconds.
print(f'time:{time.time()-t:.3f}')

In [None]:
# Print the module's text representation (its layers and their settings).
print(model)

In [None]:
# torch.save(model.state_dict(),'model.pt')

In [None]:
import matplotlib.pyplot as plt

# One line per recorded series, x-axis = epoch index.
for series,colour,series_label in ((loss,'red','loss'),(acc,'blue','acc'),(val_acc,'green','val_acc')):
    plt.plot(range(len(series)), series,color=colour,label=series_label)
plt.legend()
plt.show();

In [None]:
def infer(text):
    """Predict the class (0 or 1) of one raw string with the module-level `model`.

    The string is tokenised with `tokenizer` and mapped to ids through
    `Text.vocab.stoi`. Returns 0 when tokenisation yields no tokens.
    """
    token_ids=[Text.vocab.stoi[token] for token in tokenizer(text)]
    if not token_ids:
        return 0

    # Build the input on whatever device the model's parameters live on.
    model_device=next(model.parameters()).device

    # Shape [seq_len, 1]: sequence-first layout, batch of one.
    batch=torch.LongTensor(token_ids).view(-1,1).to(model_device)
    # The sequence length is dim 0 of that tensor (dim 1 is the batch size and
    # is always 1). Lengths stay on the CPU for pack_padded_sequence.
    batch_len=torch.LongTensor([batch.shape[0]])

    model.eval()
    with torch.no_grad():
        logit=model(batch, batch_len).squeeze(1)
    return int(torch.round(torch.sigmoid(logit)).item())

# One prediction per test row, in frame order.
test_preds=[infer(row['text']) for _,row in test_data.iterrows()]

In [None]:
# Pair every test id with its predicted target and write the two columns to
# submission.csv (no index column).
my_submissions=pd.DataFrame({'id':test_data['id'].values,'target':test_preds})
my_submissions.to_csv('submission.csv', index=False)

# fasttext

In [146]:
import numpy as np
import pandas as pd
import re
import csv

import fasttext

In [147]:
# Reload the raw competition files for the fastText experiment; `submit` is
# the sample submission whose `target` column is overwritten further down.
train=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
submit=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [148]:
# train.isna().sum()

# Missing keyword / location values become the literal string 'none' so the
# columns can be concatenated with the tweet text later.
for frame in (train,test):
    for column in ('keyword','location'):
        frame[column]=frame[column].fillna('none')

In [149]:
from sklearn.model_selection import train_test_split

# Features and target as separate objects.
Train=train.drop(columns='target')
Target=train['target']

# 85/15 split, stratified on the target, fixed seed.
X_tr,X_val,y_tr,y_val=train_test_split(
    Train,
    Target,
    test_size=0.15,
    random_state=71,
    stratify=Target,
)

In [150]:
def _joined_fields(row):
    """Concatenate keyword, location and text of one row, space-separated."""
    return row['keyword']+' '+row['location']+' '+row['text']

# Training lines carry the fastText label prefix followed by the joined text.
tr_arr=[f'__label__{y_tr.loc[i]}'+' '+_joined_fields(row) for i,row in X_tr.iterrows()]

# Validation and test strings are unlabelled.
val_arr=[_joined_fields(row) for _,row in X_val.iterrows()]
test_arr=[_joined_fields(row) for _,row in test.iterrows()]

In [151]:
# One fastText training example per line: "__label__<target> <text>".
# The lines come from `tr_arr`. Embedded newlines are removed (the same
# substitution the validation/test strings get before prediction) so every
# example stays on a single line of train.txt.
train_df=pd.DataFrame([re.sub(r'\n','',line) for line in tr_arr])
train_df.to_csv('train.txt',index=False,sep=' ',header=False,quoting=csv.QUOTE_NONE,quotechar="",escapechar=" ")

In [152]:
# Train a supervised fastText classifier on train.txt for 10 epochs, with
# '__label__' as the label prefix.
# NOTE: this rebinds `model`, the name the earlier cells use for the LSTM network.
model=fasttext.train_supervised('train.txt',label_prefix='__label__',epoch=10)
print(model.labels,'are the labels or targets the model is predicting')

['__label__0', '__label__1'] are the labels or targets the model is predicting


In [153]:
from sklearn.metrics import accuracy_score

# Strip newline characters from the validation strings before prediction.
val_arr=[text.replace('\n','') for text in val_arr]

# First element of predict()'s result holds the labels; the class digit is
# the last character of each top label ('__label__0' / '__label__1').
val_labels=model.predict(val_arr)[0]
pred=[int(label[0][-1]) for label in val_labels]
print(f'val_acc : {accuracy_score(pred,y_val.values)}')

acc : 0.9807355516637478


In [154]:
# Same preparation and label decoding as for the validation strings.
test_arr=[text.replace('\n','') for text in test_arr]

test_labels=model.predict(test_arr)[0]
pred=[int(label[0][-1]) for label in test_labels]

In [155]:
# Overwrite the sample submission's target column with the fastText
# predictions for the test strings, then preview the first rows.
submit['target']=pred
submit.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [156]:
# Write the submission file without the index column. This uses the same
# file name as the LSTM section's submission, so it replaces that file.
submit.to_csv('submission.csv',index=False)