In [1]:
import re
from itertools import product
from collections import Counter

from tqdm.auto import tqdm

import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.utils.data import DataLoader


from models.WD_Dataset import WD_Dataset 
from models.Wide_Deep import Wide_Deep

In [2]:
# Directory the train/valid CSV files are read from.
DATA_PATH = "./data/"
# Directory the model checkpoints are written to.
CHECKPOINT_DIR = "./checkpoints/"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(DEVICE)

cuda


In [3]:
# Column names, grouped the same way as the original inline annotations.
# The order matters: the last four entries are sliced off as the targets below.
orig_features = [
    # ---- Tweet features ----
    'text_tokens',
    'hashtags',
    'tweet_id',
    'media',
    'links',
    'domains',
    'tweet_type',
    'language',
    'timestamp',
    # ---- Engaged-with user features ----
    'engaged_with_user_id',
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaged_with_user_is_verified',
    'engaged_with_user_account_creation',
    # ---- Engaging user features ----
    'engaging_user_id',
    'engaging_user_follower_count',
    'engaging_user_following_count',
    'engaging_user_is_verified',
    'engaging_user_account_creation',
    # ---- Engagement features ----
    'engagee_follows_engager',
    # ---- Targets ----
    'reply',            # target: reply
    'retweet',          # target: retweet
    'retweet_comment',  # target: retweet with comment
    'like',             # target: like
]

# The four target columns are the tail of the list above.
target_features = orig_features[-4:]

# Numeric input columns fed to the model as-is.
numerical_features = [
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaging_user_follower_count',
    'engaging_user_following_count',
    'url_cnt',
    'char_cnt',
    'hashtag_cnt',
    'Photo_cnt',
    'Video_cnt',
    'GIF_cnt',
]

# Categorical columns that are combined with every target below.
categorical_features = [
    'language',
    'engaged_with_user_id',
    'engaging_user_id',
    'tweet_type',
]

# One "<categorical>_<target>_TE" column name per (categorical, target) pair,
# categorical-major. "_TE" presumably stands for target encoding -- TODO confirm.
cat_target_prod = product(categorical_features, target_features)
features = [cat + "_" + target + "_TE" for cat, target in cat_target_prod]

# Not referenced by any of the visible cells; kept unchanged.
m = 20
MAX_LEN = 100

In [4]:
# Read the training and validation splits from DATA_PATH, in that order.
train_df, valid_df = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("train.csv", "valid.csv")
)

In [5]:
def create_dataset(df, numerical_features=numerical_features,
                   features=features, targets=target_features):
    """Wrap the model inputs and targets of ``df`` in a ``WD_Dataset``.

    Args:
        df: DataFrame containing every column named in ``numerical_features``,
            ``features`` and ``targets``.
        numerical_features: Names of the numeric input columns.
        features: Names of the remaining input columns; they are placed after
            the numeric ones.
        targets: Names of the label columns.

    Returns:
        A ``WD_Dataset`` built from the input-column array and the
        target-column array, in that order.
    """
    input_columns = numerical_features + features
    return WD_Dataset(
        df.loc[:, input_columns].values,
        df.loc[:, targets].values,
    )

In [6]:
# Turn both splits into datasets using create_dataset's default column lists.
train_dataset, valid_dataset = map(create_dataset, (train_df, valid_df))

In [7]:
# The input width is the number of numeric columns plus the "_TE" columns;
# dim_hidden is passed through to Wide_Deep unchanged.
model = Wide_Deep(
    dim_features=len(numerical_features) + len(features),
    dim_hidden=[768, 512, 256, 128, 64, 32],
).to(DEVICE)

In [8]:
# Training hyper-parameters; EPOCH_COUNT and BATCH_SIZE are the defaults of train_model.
EPOCH_COUNT = 3
BATCH_SIZE = 24
LEARNING_RATE = 7e-5

# Loss applied to the raw model logits.
loss_fn = nn.BCEWithLogitsLoss()
# Adam over every parameter of the model.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [9]:
def calc_valid_loss(model, valid_loss, batch_size, loss_fn):
    """Return the mean per-batch loss of ``model`` over a validation dataset.

    Args:
        model: Module mapping a batch of features to logits.
        valid_loss: The validation *dataset* to iterate over. The parameter
            name is historical and kept so existing callers keep working.
            Batches produced from it are indexed with 'features' and 'labels'.
        batch_size: Batch size used for the validation loader.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.

    Returns:
        The unweighted mean of the batch losses, or NaN when the dataset
        yields no batches.
    """
    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode back to training mode.
    was_training = model.training
    model.eval()

    # Use the dataset that was passed in rather than a notebook-level global.
    dataloader = DataLoader(valid_loss, batch_size=batch_size,
                            shuffle=False, drop_last=False)

    loss_list = []
    try:
        # No gradients are needed for validation; skipping autograd avoids
        # building a graph for every batch.
        with torch.no_grad():
            for data in tqdm(dataloader):
                features = data['features'].to(DEVICE)
                labels = data['labels'].to(DEVICE)

                logits = model(features)
                loss_list.append(loss_fn(logits, labels).item())
    finally:
        model.train(was_training)

    if not loss_list:
        return float("nan")
    return np.mean(loss_list)


def train_model(model, train_dataset, valid_datset, optimizer, loss_fn,
                batch_size=BATCH_SIZE, epochs=EPOCH_COUNT, log_every=75000):
    """Train ``model`` and periodically validate and checkpoint it.

    Args:
        model: Module mapping a batch of features to logits.
        train_dataset: Dataset whose batches are indexed with 'features' and
            'labels'.
        valid_datset: Validation dataset passed to ``calc_valid_loss``. The
            spelling is kept so existing keyword callers keep working.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        batch_size: Batch size for both training and validation loaders.
        epochs: Number of passes over ``train_dataset``.
        log_every: Validate, print and checkpoint every ``log_every`` steps
            (including step 0 of each epoch). A falsy value disables the
            in-epoch validation and checkpoints.

    Side effects:
        Writes ``WD_epoch_{epoch}_step_{step}`` and ``WD_epoch_{epoch}_end``
        state-dict files under ``CHECKPOINT_DIR``.
    """
    model.train()

    # Honour the ``epochs`` argument instead of a hard-coded count.
    for epoch in range(epochs):
        dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                shuffle=True, drop_last=False)
        # Running list of batch losses for this epoch; its mean is reported.
        avg_loss = []
        for step, data in enumerate(tqdm(dataloader)):
            features = data['features'].to(DEVICE)
            labels = data['labels'].to(DEVICE)

            # Clear gradients first so each update only sees the current batch.
            optimizer.zero_grad()
            logits = model(features)
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            avg_loss.append(loss.item())

            if log_every and step % log_every == 0:
                train_loss = np.mean(avg_loss)
                # Validate on the dataset passed in, not on a global.
                valid_loss = calc_valid_loss(model, valid_datset, batch_size, loss_fn)
                print("Epoch: {}, step: {}, train_loss: {}, valid_loss: {}"
                     .format(epoch, step, train_loss, valid_loss))
                torch.save(model.state_dict(),(CHECKPOINT_DIR+"WD_epoch_{}_step_{}").format(epoch, step))

        torch.save(model.state_dict(),(CHECKPOINT_DIR+"WD_epoch_{}_end").format(epoch))
# Start training with the notebook-level model, datasets, optimizer and loss;
# batch_size and epochs are left at train_model's defaults.
train_model(model, train_dataset, valid_dataset, optimizer, loss_fn)

  0%|          | 0/174091 [00:00<?, ?it/s]

  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 0, step: 0, train_loss: 0.6854370832443237, valid_loss: 0.6790282726287842


  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 0, step: 75000, train_loss: 0.2519526183605194, valid_loss: 0.24311146140098572


  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 0, step: 150000, train_loss: 0.23137380182743073, valid_loss: 0.23751363158226013


  0%|          | 0/174091 [00:00<?, ?it/s]

  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 1, step: 0, train_loss: 0.12331333011388779, valid_loss: 0.23804187774658203


  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 1, step: 75000, train_loss: 0.20077230036258698, valid_loss: 0.2376370131969452


  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 1, step: 150000, train_loss: 0.19822804629802704, valid_loss: 0.23784269392490387


  0%|          | 0/174091 [00:00<?, ?it/s]

  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 2, step: 0, train_loss: 0.18526583909988403, valid_loss: 0.24563662707805634


  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 2, step: 75000, train_loss: 0.1908913105726242, valid_loss: 0.24513551592826843


  0%|          | 0/58031 [00:00<?, ?it/s]

Epoch: 2, step: 150000, train_loss: 0.18958505988121033, valid_loss: 0.2504119277000427
