In [100]:
import numpy as np
import pandas as pd
import re, string
from sklearn import feature_extraction, model_selection
import torch
from torch import nn
from torch.functional import F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import torchmetrics as tm

# Load Datasets

In [101]:
path = './data/'
train_df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')

In [102]:
train_df[train_df['target'] == 0]['text'].values[0]

"What's up man?"

In [103]:
train_df[train_df['target'] == 1]['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

# Cleaning

Removing URLs:

In [104]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

# test
example = 'baidu: https://www.baidu.com'
remove_url(example)

'baidu: '

In [105]:
train_df['text'] = train_df['text'].apply(lambda x: remove_url(x))
test_df['text'] = test_df['text'].apply(lambda x: remove_url(x))

Removing HTML tags:

In [106]:
def remove_tags(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

# test
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""
print(remove_tags(example))


Real or Fake
Kaggle 
getting started



In [107]:
train_df['text'] = train_df['text'].apply(lambda x: remove_tags(x))
test_df['text'] = test_df['text'].apply(lambda x: remove_tags(x))

Removing Emojis:

In [108]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [109]:
train_df['text'] = train_df['text'].apply(lambda x: remove_emoji(x))
test_df['text'] = test_df['text'].apply(lambda x: remove_emoji(x))

Rmoving numbers:

In [110]:
train_df['text'] = train_df['text'].str.replace('\d+', '')
test_df['text'] = test_df['text'].str.replace('\d+', '')

  train_df['text'] = train_df['text'].str.replace('\d+', '')
  test_df['text'] = test_df['text'].str.replace('\d+', '')


Removing punctuations:

In [111]:
def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

example = 'I\'m a *king'
remove_punct(example)

'Im a king'

In [112]:
train_df['text'] = train_df['text'].apply(lambda x: remove_punct(x))
test_df['text'] = test_df['text'].apply(lambda x: remove_punct(x))

Removing multiple spaces:

In [113]:
train_df['text'] = train_df['text'].str.replace('   ', ' ')
train_df['text']=train_df['text'].str.replace('     ', ' ')
train_df['text']=train_df['text'].str.replace('\xa0 \xa0 \xa0', ' ')
train_df['text']=train_df['text'].str.replace('  ', ' ')
train_df['text']=train_df['text'].str.replace('—', ' ')
train_df['text']=train_df['text'].str.replace('–', ' ')

In [114]:
test_df['text'] = test_df['text'].str.replace('   ', ' ')
test_df['text']=test_df['text'].str.replace('     ', ' ')
test_df['text']=test_df['text'].str.replace('\xa0 \xa0 \xa0', ' ')
test_df['text']=test_df['text'].str.replace('  ', ' ')
test_df['text']=test_df['text'].str.replace('—', ' ')
test_df['text']=test_df['text'].str.replace('–', ' ')

# Dataset

In [115]:
print(train_df.shape, test_df.shape)

# split the data into train and validation
train, val = model_selection.train_test_split(train_df, test_size=0.2, random_state=42)
print(train.shape, val.shape)

(7613, 5) (3263, 4)
(6090, 5) (1523, 5)


In [116]:
count_vectorizer = feature_extraction.text.CountVectorizer()
eg_count_vectorizer = count_vectorizer.fit_transform(train_df['text'][0:5])

print(eg_count_vectorizer[0].todense().shape)
print(eg_count_vectorizer[0].todense())

(1, 52)
[[0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [117]:
train_data = count_vectorizer.fit_transform(train['text'])
val_data = count_vectorizer.fit_transform(val['text'])
test_data = count_vectorizer.transform(test_df['text'])

train_data.shape, val_data.shape, test_data.shape

((6090, 14740), (1523, 6047), (3263, 6047))

In [124]:
class MyDataset(Dataset):
    def __init__(self, x, y=None):
        self.x = x
        if y is not None:
            self.y = y
    def __len__(self):
        return self.x.shape[0]
    def __getitem__(self, index):
        X = self.x[index]
        if self.y is not None:
            Y = self.y[index]
        return X, Y

In [125]:
training_set = MyDataset(train_data, train['target'])
val_set = MyDataset(val_data, val['target'])
# test_set = MyDataset(test_data, test_df)

# Model

In [126]:
class Classifier(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()

        self.params = hparams
        self.lr = self.params['lr']
        self.batch_size = self.params['batch_size']
        self.weight_decay = self.params['weight_decay']
        self.train_acc = tm.Accuracy()
        self.val_acc = tm.Accuracy()
        
        # define the model
        self.l1 = nn.Linear(10, 96)
        self.l2 = nn.Linear(96, 64)
        self.l3 = nn.Linear(64, 48)
        self.l4 = nn.Linear(48, 24)
        self.l5 = nn.Linear(24, 2)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = F.relu(self.l4(x))
        x = self.l5(x)
        return x

    def configure_optimizers(self):
        optimzer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        return optimzer

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y.long().squeeze())
        
        self.train_acc(y_hat, y.long().squeeze())
        self.log('step', self.trainer.current_epoch)
        self.log('train_acc', self.train_acc)
        self.log('loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        self.val_acc(y_hat, y.long().squeeze())
        self.log('step', self.trainer.current_epoch)
        self.log('val_acc', self.val_acc)

    def train_dataloader(self):
        return DataLoader(training_set, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(val_set, batch_size=self.batch_size, shuffle=False, num_workers=4)

    def test_dataloader(self):
        return DataLoader(test_set, batch_size=self.batch_size, shuffle=False, num_workers=4)

In [None]:
hparams = {
    'lr': 1e-4,
    'batch_size': 256,
    'weight_decay': 1e-2
}
model = Classifier(hparams)
trainer = pl.Trainer(max_epochs=50)
trainer.fit(model)