In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Confirm the NLTK English stop-word corpus loads; show a short preview rather
# than dumping the whole list into the cell output.
stopwords.words('english')[:10]

In [None]:
# Read the training split; empty cells are replaced by empty strings so the
# later string concatenation never sees NaN.
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv', encoding='utf-8')
df = df.fillna('')
df.shape[0]

In [None]:
# Quick look at the raw training frame. A bare `df.columns` in the middle of a
# cell displays nothing, so it is printed explicitly. Only a small sample of the
# distinct `location` values is shown; printing all of them floods the output.
print(df.columns.tolist())
print(df.head())
print(df['location'].nunique(), 'distinct locations, e.g.', df['location'].unique()[:20].tolist())

In [None]:
def cleaning(df, keyword_col, text_col, input_col, location_col, stop_words=None):
    """Build a normalised model-input column from the text and keyword columns.

    The text and keyword are joined with a space, lower-cased, stripped of URLs,
    ``%20`` escapes, literal backslash-x escape tokens and punctuation, and
    finally filtered against a stop-word list.

    Args:
        df: DataFrame holding ``text_col`` and ``keyword_col``. It is modified
            in place: ``input_col`` is added or overwritten.
        keyword_col: Name of the keyword column.
        text_col: Name of the free-text column.
        input_col: Name of the column that receives the cleaned text.
        location_col: Accepted for call compatibility; currently not used.
        stop_words: Optional iterable of words to drop. When omitted,
            ``stopwords.words('english')`` is used.

    Returns:
        The same ``df`` object with ``input_col`` populated.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Apostrophes are stripped from the text before filtering, so strip them
    # from the stop words too; otherwise entries containing an apostrophe could
    # never match. A set gives constant-time membership checks.
    stop = {word.lower().replace("'", '') for word in stop_words}

    # fillna/astype guard against NaN cells: NaN + str would turn the whole
    # row into NaN and later into the literal string 'nan'.
    text = (
        df[text_col].fillna('').astype(str)
        + ' '
        + df[keyword_col].fillna('').astype(str)
    ).str.lower()

    # \S* stops at the first whitespace. The previous greedy `.*\s` ran to the
    # LAST whitespace and therefore deleted everything after a URL/escape.
    text = text.str.replace(r'http\S*', ' ', regex=True)
    text = text.str.replace(r'%20', ' ', regex=True)
    text = text.str.replace(r'\\x\S*', ' ', regex=True)
    # Dots and apostrophes are removed without a gap ("u.s." -> "us").
    text = text.str.replace(r"[.']", '', regex=True)
    # Remaining punctuation becomes a word separator.
    text = text.str.replace(r'[#\-!?;)(:@\[\]&|/]', ' ', regex=True)

    # split() also collapses newlines and repeated spaces.
    df[input_col] = text.apply(
        lambda sentence: ' '.join(tok for tok in sentence.split() if tok not in stop)
    )
    return df
    

In [None]:
# Derive the cleaned `input` column, then keep only the label and the model input.
df = cleaning(
    df,
    keyword_col='keyword',
    text_col='text',
    input_col='input',
    location_col='location',
)
df = df.loc[:, ['target', 'input']]

In [None]:
# TF-IDF over unigrams and bigrams that occur in at least two documents.
# ngram_range is documented as a tuple.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
vectorizer.fit(df['input'])
# `vocabulary_` (term -> column index) is set by fit() and its size equals the
# number of features; get_feature_names() is not available in newer scikit-learn.
vocab_length = len(vectorizer.vocabulary_)
vocab_length
# msk = np.random.rand(len(df)) < 0.8
# df_train = df[msk]
# df_test = df[~msk]

In [None]:
class textData(Dataset):
    """In-memory dataset pairing dense vectorised text rows with float targets."""

    def __init__(self, df, vectorizer):
        """Vectorise ``df['input']`` and cache features and targets as tensors.

        Args:
            df: DataFrame with an ``input`` column (text) and a ``target`` column.
            vectorizer: Object whose ``transform`` returns a sparse matrix with
                one row per input string.
        """
        features = vectorizer.transform(df['input'].values).astype(np.float32)
        # toarray() gives a plain ndarray; todense() would return np.matrix,
        # which NumPy discourages for new code.
        self.x = torch.from_numpy(features.toarray())
        self.y = torch.from_numpy(df['target'].values.astype(np.float32))
        self.m = self.x.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows held by the dataset."""
        return self.m
    
# Wrap the labelled frame and serve it in shuffled mini-batches of 32.
dataset_train = textData(df=df, vectorizer=vectorizer)
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)


In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
print('cuda' if use_cuda else 'cpu')

In [None]:
class Net(nn.Module):
    """Fully connected binary classifier over a fixed-width feature vector.

    Layer widths are vocab_length -> 16 -> 32 -> 64 -> 128 -> 1024 -> 1, with a
    ReLU after every hidden layer, dropout (p=0.7) after the 128- and 1024-wide
    activations, and a sigmoid on the single output unit.
    """

    def __init__(self, vocab_length):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # registration (and therefore seeded initialisation) is unchanged.
        self.fc1 = nn.Linear(vocab_length, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, 64)
        self.fc4 = nn.Linear(64, 128)
        self.dropout = nn.Dropout(0.7)
        self.fc5 = nn.Linear(128, 1024)
        self.fc8 = nn.Linear(1024, 1)

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid-activated value per row."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        hidden = self.dropout(hidden)
        hidden = self.dropout(F.relu(self.fc5(hidden)))
        return torch.sigmoid(self.fc8(hidden))
    

model = Net(vocab_length).to(device)
optimizer = optim.Adam(model.parameters())
# Net.forward already ends in torch.sigmoid, so the loss must consume
# probabilities. BCEWithLogitsLoss applies its own sigmoid and would squash the
# output a second time.
loss_function = nn.BCELoss()
EPOCHS = 8

In [None]:
def train(dataloader, model, loss_function, optimizer, EPOCHS, vocab_size, log_every=16):
    """Run the optimisation loop and collect loss values for plotting.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches.
        model: Network to optimise; its parameters are updated in place.
        loss_function: Callable ``(predictions, labels) -> scalar loss tensor``.
        optimizer: Optimiser bound to ``model``'s parameters.
        EPOCHS: Number of passes over ``dataloader``.
        vocab_size: Unused; kept so existing calls keep working.
        log_every: The loss of every ``log_every``-th batch of each epoch is
            appended to the returned history.

    Returns:
        ``(model, losses)`` where ``losses`` is a list of Python floats.
    """
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    model.train()  # make sure dropout is active while fitting
    losses = []
    for epoch in range(EPOCHS):
        epoch_losses = []
        for i, (inputs, labels) in enumerate(tqdm(dataloader)):
            inputs = inputs.to(device)
            labels = labels.view(-1, 1).to(device)
            optimizer.zero_grad()
            y_hat = model(inputs)
            loss = loss_function(y_hat, labels)
            loss.backward()
            optimizer.step()
            # .item() detaches the value; keeping the tensor would keep its
            # autograd graph alive and cannot be handled by NumPy/matplotlib.
            loss_value = loss.item()
            epoch_losses.append(loss_value)
            if i % log_every == 0:
                losses.append(loss_value)
        if epoch_losses:
            print(min(epoch_losses))
    return model, losses

In [None]:
def validate(model, dataloader, vocab_size):
    """Print and return the fraction of rows whose rounded output equals the label.

    Args:
        model: Network returning one value per row; outputs are rounded to get
            the predicted class.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        vocab_size: Width each input row is reshaped to before the forward pass.

    Returns:
        The accuracy as a float, or NaN when ``dataloader`` yields no rows.
    """
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()  # dropout must be disabled while measuring accuracy
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, labels in tqdm(dataloader):
                real_class = labels.view(-1, 1).to(device)
                net_out = model(inputs.view(-1, vocab_size).to(device))
                predicted_class = torch.round(net_out)
                correct += torch.sum(predicted_class == real_class).item()
                total += inputs.shape[0]
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    accuracy = correct / total if total else float('nan')
    print(accuracy)
    return accuracy


In [None]:
# Fit the network on the training loader and keep the recorded loss history.
model, losses = train(
    dataloader=dataloader_train,
    model=model,
    loss_function=loss_function,
    optimizer=optimizer,
    EPOCHS=EPOCHS,
    vocab_size=vocab_length,
)

In [None]:
# Convert every recorded loss to a plain float first: entries that are tensors
# still attached to the autograd graph (or stored on the GPU) cannot be turned
# into a NumPy array by matplotlib.
fig, ax = plt.subplots()
ax.plot([float(loss_value) for loss_value in losses])
ax.set(title='Training loss', xlabel='Recorded step', ylabel='Loss');

In [None]:
# Accuracy is measured on dataloader_train, i.e. on the same rows used for fitting.
validate(model=model, dataloader=dataloader_train, vocab_size=vocab_length)
# validate(model, dataloader_test, vocab_length)


In [None]:
# Fill empty cells exactly as for the training frame. Without this a missing
# keyword makes `text + ' ' + keyword` NaN, and the cleaned input for that row
# degenerates to the string 'nan'.
df_test_orig = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv', encoding='utf-8').fillna('')
print(df_test_orig.columns)
df_test = df_test_orig.drop(columns=['id'])

In [None]:
df_test = cleaning(df_test, 'keyword', 'text', 'input', 'location')
# textData reads a `target` column, so the unlabelled frame gets a placeholder.
# assign() builds a new frame instead of writing into a column slice, which
# avoids pandas' SettingWithCopyWarning.
df_test = df_test[['input']].assign(target=0.0)

In [None]:
# Vectorise the test frame with the already-fitted vectoriser; order is kept
# (no shuffling) so predictions line up with the rows.
dataset_test = textData(df=df_test, vectorizer=vectorizer)
dataloader_test = DataLoader(dataset_test, batch_size=32)

In [None]:
def test(model, dataloader, vocab_size):
    Y = []
    with torch.no_grad():
        for i, (inputs, labels) in tqdm(enumerate(dataloader)):
            net_out = model(inputs.view(-1,vocab_size).to(device))
            predicted_class = torch.round(net_out).view(-1).detach().cpu().numpy()
            Y += list(predicted_class)
    return Y        
# Predict a class for every row of the test loader.
Y = test(model=model, dataloader=dataloader_test, vocab_size=vocab_length)

In [None]:
# Peek at the sample submission to see the expected output layout.
df_submit = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
df_submit = df_submit.fillna('')
df_submit.head()

In [None]:
# Show the number of predictions and a short preview instead of the full list.
print(len(Y), Y[:20])

In [None]:
# Cast every prediction to a plain Python int.
Y = list(map(int, Y))
Y

In [None]:
# Show the columns as loaded, then attach the predictions as the `target` column.
print(df_test_orig.columns)
df_test_orig = df_test_orig.assign(target=Y)

In [None]:
# Inspect the first rows, then keep only the two columns being written out.
print(df_test_orig.head())
df_test_orig = df_test_orig.loc[:, ['id', 'target']]

In [None]:
# Write the id/target pairs to the Kaggle working directory without the index column.
df_test_orig.to_csv(path_or_buf='/kaggle/working/result.csv', index=False)