In [1]:
# Vocabulary: maps input tokens to numeric indices
# Vectorizer: converts the data into vectors
# Dataset: applies the vectorizer to the data
# Model: the PyTorch model

In [2]:
import collections
import json
import os
import pprint
import re
import urllib.request
from argparse import Namespace

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [3]:
def set_seeds(seed, cuda):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed (int): value handed to every generator.
        cuda (bool): when truthy, also seed the generators of all CUDA devices.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    if not cuda:
        return
    torch.cuda.manual_seed_all(seed)

In [4]:
def create_dir(path):
    """Create the directory `path` (and any missing parents) if it is absent.

    Calling it again for an existing directory is a no-op.

    Args:
        path (str): directory to create.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(path, exist_ok=True)

In [5]:
# Experiment configuration, collected in one namespace so later cells can read
# (and extend) it through attribute access.
args = Namespace(**{
    # reproducibility / hardware
    "seed": 999,
    "cuda": False,
    "shuffle": True,
    # file locations
    "data_file": "names.csv",
    "split_data_file": "split_names.csv",
    "vectorizer_file": "vectorizer.json",
    "model_state_file": "model.pth",
    "save_dir": "names",
    # split fractions
    "train_size": 0.7,
    "val_size": 0.15,
    "test_size": 0.15,
    # training hyperparameters
    "num_epochs": 20,
    "early_stopping_criteria": 5,
    "learning_rate": 1e-3,
    "batch_size": 64,
    "hidden_dim": 300,
    "dropout_p": 0.1,
})

print(" args is :")
pprint.pprint(args.__dict__, indent=4)

 args is :
{   'batch_size': 64,
    'cuda': False,
    'data_file': 'names.csv',
    'dropout_p': 0.1,
    'early_stopping_criteria': 5,
    'hidden_dim': 300,
    'learning_rate': 0.001,
    'model_state_file': 'model.pth',
    'num_epochs': 20,
    'save_dir': 'names',
    'seed': 999,
    'shuffle': True,
    'split_data_file': 'split_names.csv',
    'test_size': 0.15,
    'train_size': 0.7,
    'val_size': 0.15,
    'vectorizer_file': 'vectorizer.json'}


In [6]:
# Seed the RNGs and make sure the output directory exists.
set_seeds(args.seed, args.cuda)
create_dir(args.save_dir)

# Move the artifact paths under save_dir. The guard makes the cell idempotent:
# a path that already lives inside save_dir (e.g. after re-running this cell)
# is left alone instead of becoming "names/names/...".
_save_prefix = os.path.normpath(args.save_dir) + os.sep
for _attr in ("vectorizer_file", "model_state_file"):
    _current = getattr(args, _attr)
    if not os.path.normpath(_current).startswith(_save_prefix):
        setattr(args, _attr, os.path.join(args.save_dir, _current))

In [7]:
# Use the GPU exactly when one is available. Assigning the probe result (rather
# than only ever flipping the flag to True) also clears a configured cuda=True
# on a machine without CUDA, so torch.device("cuda") is never requested there.
args.cuda = torch.cuda.is_available()
if args.cuda:
    # set_seeds() ran while args.cuda still held its configured value, so its
    # CUDA branch may have been skipped; seed the CUDA generators explicitly.
    torch.cuda.manual_seed_all(args.seed)
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

Using CUDA: True


In [8]:
# process data 
import re
import urllib

In [9]:
# Download the surnames CSV once and cache it at args.data_file; re-running the
# cell reuses the local copy instead of hitting the network again.
url = "https://raw.githubusercontent.com/LisonEvf/practicalAI-cn/master/data/surnames.csv"
if not os.path.exists(args.data_file):
    # The timeout keeps a stalled connection from blocking the kernel
    # indefinitely, and the context manager closes the HTTP response even if
    # the read fails.
    with urllib.request.urlopen(url, timeout=30) as response:
        html = response.read()
    # Write only after the full body was read, so a failed download never
    # leaves a truncated file that the existence check would later accept.
    with open(args.data_file, 'wb') as fp:
        fp.write(html)

KeyboardInterrupt: 

In [10]:
# Load the surname CSV; header=0 takes the column names from the first row.
df = pd.read_csv(args.data_file,header=0)
# Preview the first 10 rows as the cell output.
df.head(10)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian
5,Obinata,Japanese
6,Rahal,Arabic
7,Zhuan,Chinese
8,Acconci,Italian
9,Mifsud,Arabic


In [11]:
# Bucket every row (as a plain dict) under its nationality, then report how
# many surnames each nationality has.
by_nation = collections.defaultdict(list)
for _, row in df.iterrows():
    record = row.to_dict()
    by_nation[record["nationality"]].append(record)
for nationality, records in by_nation.items():
    print("{0} : {1}".format(nationality, len(records)))

English : 2972
French : 229
Arabic : 1603
Russian : 2373
Japanese : 775
Chinese : 220
Italian : 600
Czech : 414
Irish : 183
German : 576
Greek : 156
Spanish : 258
Polish : 120
Dutch : 236
Vietnamese : 58
Korean : 77
Portuguese : 55
Scottish : 75


In [12]:
# Peek at the first three rows to see what df.iterrows() yields:
# an (index, Series) pair per row.
for i, (index, row) in enumerate(df.iterrows(), start=1):
    print("----------")
    print("Index:", index)
    print("Row data:\n", row)
    print("\n")

    if i > 2:
        break

----------
Index: 0
Row data:
 surname        Woodford
nationality     English
Name: 0, dtype: object


----------
Index: 1
Row data:
 surname          Coté
nationality    French
Name: 1, dtype: object


----------
Index: 2
Row data:
 surname           Kore
nationality    English
Name: 2, dtype: object




In [13]:
# Stratified split: each nationality is split separately so every class keeps
# roughly the configured train/val proportions.
final_list = []
for _, item_list in sorted(by_nation.items()):
    if args.shuffle:
        # In-place shuffle of this nationality's records (seeded via set_seeds).
        np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_size * n)
    n_val = int(args.val_size * n)
    # There is deliberately no test count: everything after the train and val
    # slices is labelled 'test', so the rounding remainder lands there and
    # args.test_size is not consulted.
    for i, item in enumerate(item_list):
        if i < n_train:
            item['split'] = 'train'
        elif i < n_train + n_val:
            item['split'] = 'val'
        else:
            item['split'] = 'test'
    final_list.extend(item_list)

split_df = pd.DataFrame(final_list)
split_df['split'].value_counts()

split
train    7680
test     1660
val      1640
Name: count, dtype: int64

In [14]:
def preprocess_text(text):
    """Normalize a piece of text for tokenization.

    Lower-cases the text, pads the punctuation marks . , ! ? with spaces, and
    collapses every run of other non-ASCII-letter characters into one space.

    Args:
        text (str): raw text.

    Returns:
        str: the normalized text.
    """
    lowered = " ".join(map(str.lower, text.split(" ")))
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)
# Normalize every surname in place before the vocabulary is built.
split_df["surname"] = split_df["surname"].apply(preprocess_text)


In [15]:
# Persist the labelled split; index=False keeps the DataFrame index out of the CSV.
split_df.to_csv(args.split_data_file,index=False)
# Preview the first 5 rows as the cell output.
split_df.head(5)

Unnamed: 0,surname,nationality,split
0,essa,Arabic,train
1,mustafa,Arabic,train
2,nazari,Arabic,train
3,shadid,Arabic,train
4,assaf,Arabic,train


In [22]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Args:
        token_to_idx (dict, optional): existing token -> index mapping to start
            from. The dict is stored as-is (not copied) and extended in place;
            a fresh empty dict is used when omitted.
        add_unk (bool): when True, `unk_token` is registered and its index is
            the fallback returned by `lookup_token` for unknown tokens.
        unk_token (str): token that stands in for unknown tokens.
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token=""):

        # Token -> index
        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx = token_to_idx

        # Index -> token (inverse of the mapping above)
        self.idx_to_token = {idx: token \
                             for token, idx in self.token_to_idx.items()}

        # Register the unknown token
        self.add_unk = add_unk
        self.unk_token = unk_token
        if self.add_unk:
            self.unk_index = self.add_token(self.unk_token)

    def to_serializable(self):
        """Return the constructor arguments as a dict (see `from_serializable`)."""
        return {'token_to_idx': self.token_to_idx,
                'add_unk': self.add_unk, 'unk_token': self.unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a Vocabulary from the dict produced by `to_serializable`."""
        return cls(**contents)

    def add_token(self, token):
        """Register `token` if it is new and return its index.

        A new token receives the next free index (the current vocabulary size).
        """
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

    def add_tokens(self, tokens):
        """Register every token in `tokens`; return their indices in order."""
        # add_token is a method and must be called; subscripting it raised TypeError.
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Return the index of `token`.

        Unknown tokens map to the unknown-token index when `add_unk` is set;
        otherwise they raise KeyError.
        """
        if self.add_unk:
            index = self.token_to_idx.get(token, self.unk_index)
        else:
            index =  self.token_to_idx[token]
        return index

    def lookup_index(self, index):
        """Return the token stored at `index`.

        Raises:
            KeyError: if `index` is not in the vocabulary.
        """
        if index not in self.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self.idx_to_token[index]

    def __str__(self):
        # The string form is just the vocabulary size.
        return str(len(self))

    def __len__(self):
        return len(self.token_to_idx)

In [23]:
# Build a nationality vocabulary (no unknown token) and exercise its API.
nationality_vocab = Vocabulary(add_unk=False)
for nationality in df["nationality"]:
    nationality_vocab.add_token(nationality)
print(nationality_vocab)  # __str__
print(len(nationality_vocab))  # __len__
index = nationality_vocab.lookup_token("English")
print(index)
print(nationality_vocab.lookup_index(index))

18
18
0
English


In [24]:
class SurnameVectorizer(object):
    """Converts surnames to vectors using a character vocabulary.

    Args:
        surname_vocab (Vocabulary): maps characters to indices.
        nationality_vocab (Vocabulary): maps nationalities to indices.
    """
    def __init__(self, surname_vocab, nationality_vocab):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname):
        """Encode `surname` as a float32 vector of vocabulary length.

        Position i is 1 when the character with index i occurs in the surname;
        character order and repetition are not preserved.
        """
        one_hot = np.zeros(len(self.surname_vocab), dtype=np.float32)
        for token in surname:
            one_hot[self.surname_vocab.lookup_token(token)] = 1
        return one_hot

    def unvectorize(self, one_hot):
        """Return the tokens whose positions are set to 1, in index order."""
        # Use this instance's vocabulary rather than a notebook-global
        # `vectorizer`, which may not exist or may be a different object.
        surname = [self.surname_vocab.lookup_index(index) \
            for index in np.where(one_hot==1)[0]]
        return surname

    @classmethod
    def from_dataframe(cls, df):
        """Build a vectorizer from the `surname` and `nationality` columns of `df`."""
        surname_vocab = Vocabulary(add_unk=True)
        nationality_vocab = Vocabulary(add_unk=False)

        # Build the vocabularies
        for index, row in df.iterrows():
            for letter in row.surname: # char-level tokenization
                surname_vocab.add_token(letter)
            nationality_vocab.add_token(row.nationality)
        return cls(surname_vocab, nationality_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a vectorizer from the dict produced by `to_serializable`."""
        surname_vocab = Vocabulary.from_serializable(contents['surname_vocab'])
        nationality_vocab =  Vocabulary.from_serializable(contents['nationality_vocab'])
        return cls(surname_vocab, nationality_vocab)

    def to_serializable(self):
        """Return both vocabularies in their serializable dict form."""
        return {'surname_vocab': self.surname_vocab.to_serializable(),
                'nationality_vocab': self.nationality_vocab.to_serializable()}

In [25]:
# Build both vocabularies from the preprocessed split data.
vectorizer = SurnameVectorizer.from_dataframe(split_df)
# Printing a Vocabulary shows its size.
print (vectorizer.surname_vocab)
print (vectorizer.nationality_vocab)
# Round trip: encode a sample surname, then recover the characters it contains.
one_hot = vectorizer.vectorize(preprocess_text("goku"))
print (one_hot)
print (vectorizer.unvectorize(one_hot))

28
18
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.]
['u', 'k', 'o', 'g']


In [26]:
from torch.utils.data import Dataset, DataLoader


In [31]:
class SurnameDataset(Dataset):
    """Dataset over the surname DataFrame that serves one split at a time.

    Args:
        df (pandas.DataFrame): rows with `surname`, `nationality` and `split`
            columns, where `split` is 'train', 'val' or 'test'.
        vectorizer (SurnameVectorizer): encodes surnames and provides the
            nationality vocabulary.
    """
    def __init__(self, df, vectorizer):
        self.df = df
        self.vectorizer = vectorizer

        # Data splits
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.df[self.df.split=='val']
        self.val_size = len(self.val_df)
        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)
        self.lookup_dict = {'train': (self.train_df, self.train_size), 
                            'val': (self.val_df, self.val_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights (for imbalances): inverse frequency of each nationality
        # over the whole frame, ordered by the nationality's vocabulary index.
        class_counts = df.nationality.value_counts().to_dict()
        def sort_key(item):
            return self.vectorizer.nationality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, split_data_file):
        """Load the split CSV and build a new vectorizer from its training rows."""
        df = pd.read_csv(split_data_file, header=0)
        train_df = df[df.split=='train']
        return cls(df, SurnameVectorizer.from_dataframe(train_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, split_data_file, vectorizer_filepath):
        """Load the split CSV and a previously saved vectorizer."""
        df = pd.read_csv(split_data_file, header=0)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(df, vectorizer)

    # Declared static so it works from the class and from an instance; without
    # the decorator an instance call would pass `self` as the file path.
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """Read a serialized vectorizer from the JSON file at `vectorizer_filepath`."""
        with open(vectorizer_filepath) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """Write the vectorizer to `vectorizer_filepath` as JSON."""
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self.vectorizer.to_serializable(), fp)

    def set_split(self, split="train"):
        """Select which split ('train', 'val' or 'test') the dataset serves."""
        self.target_split = split
        self.target_df, self.target_size = self.lookup_dict[split]

    def __str__(self):
        return "<Dataset(split={0}, size={1})".format(
            self.target_split, self.target_size)

    def __len__(self):
        return self.target_size

    def __getitem__(self, index):
        """Return the vectorized surname and nationality index of one row.

        `index` is positional within the currently selected split.
        """
        row = self.target_df.iloc[index]
        surname_vector = self.vectorizer.vectorize(row.surname)
        nationality_index = self.vectorizer.nationality_vocab.lookup_token(row.nationality)
        return {'surname': surname_vector, 'nationality': nationality_index}

    def get_num_batches(self, batch_size):
        """Return the number of full batches in the current split."""
        return len(self) // batch_size

    def generate_batches(self, batch_size, shuffle=True, drop_last=True, device="cpu"):
        """Yield batches of the current split with every tensor moved to `device`.

        Each batch is a dict with the same keys as `__getitem__`.
        """
        dataloader = DataLoader(dataset=self, batch_size=batch_size, 
                                shuffle=shuffle, drop_last=drop_last)
        for data_dict in dataloader:
            yield {name: tensor.to(device) for name, tensor in data_dict.items()}

In [32]:
# Dataset instance: load the split CSV and build the vectorizer from its training rows.
dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.split_data_file)
print (dataset) # __str__
print (dataset[5]) # __getitem__
# Per-class weights (inverse nationality frequency).
print (dataset.class_weights)

<Dataset(split=train, size=7680)
{'surname': array([0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), 'nationality': 0}
tensor([0.0006, 0.0045, 0.0024, 0.0042, 0.0003, 0.0044, 0.0017, 0.0064, 0.0055,
        0.0017, 0.0013, 0.0130, 0.0083, 0.0182, 0.0004, 0.0133, 0.0039, 0.0172])


In [36]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [34]:
class SurnameModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim (int): size of each input vector.
        hidden_dim (int): width of the hidden layer.
        output_dim (int): number of output units.
        dropout_p (float): dropout probability applied to the hidden activations.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_p):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.dropout = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x_in, apply_softmax=False):
        """Return the outputs for `x_in`, softmax-normalized over dim 1 on request."""
        hidden = self.dropout(F.relu(self.fc1(x_in)))
        logits = self.fc2(hidden)
        if not apply_softmax:
            return logits
        return F.softmax(logits, dim=1)