In [1]:
import torch
import numpy as np
import torchvision
import pandas as pd
from tqdm import tqdm

In [2]:
# Длина максимального слова (для нормализации), подробнее в 'Testing.ipynb'
MAX_WORD = 57

CONSONANTS = 'бвгджзйклмнпрстфхцчшщБВГДЖЗЙКЛМНПРСТФХЦЧШЩbcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXZ'
VOWELS = 'аиоуэыеёюяАИОУЭЫЕЁЮЯaeiouyAEIOUY'
ALPHABET = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюяaeiouyAEIOUYbcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXZ'

In [3]:
train_companies = open('../kontur_srs_internship_test_task/train.txt', 'r').readlines()
train_companies = list(map(lambda s: s.strip(), train_companies))

test_companies = open('../kontur_srs_internship_test_task/test.txt', 'r').readlines()
test_companies = list(map(lambda s: s.strip(), test_companies))

In [4]:
del train_companies[471970]

In [5]:
import re
               
def tokenize(company, spaces=False):
    with_spaces = list(filter(None, re.split('(\w+| )', company)))
    if(spaces):
        return with_spaces
    return [token for token in with_spaces if token != ' ']

def has_letters(string):
    return any(char in ALPHABET for char in string)

def has_not_letters(string):
    return any(char not in ALPHABET for char in string)

def count_vowels(string):
    return len([char for char in string if char in VOWELS])

def count_consonants(string):
    return len([char for char in string if char in CONSONANTS])

In [6]:
def extract_features(company):
    
    # Длина слова, количество гласных/согласных, положение в слове,
    # есть ли в слове что-то помимо букв и то же самое для предыдущего
    # и последующего слова
    num_of_features = 11 + MAX_WORD
    length = num_of_features * 3
    
    # 3 класса - полностью писменными, первая заглавная или полностью заглавными
    num_of_classes = 3

    size = 0
    for word in tokenize(company):
        if(has_letters(word)):
            size += 1
            
    features = np.zeros((size, length))
    labels = np.zeros((size, MAX_WORD))
    
    index = 0
    tokens = tokenize(company, spaces=True)
    for i, word in enumerate(tokens):
        if(has_letters(word)):
            if(index == 0):
                features[index][num_of_features] = index/size
                features[index][num_of_features + 1] = len(word)/MAX_WORD
                features[index][num_of_features + 2] = count_consonants(word)/len(word)
                features[index][num_of_features + 3] = count_vowels(word)/len(word)
                features[index][num_of_features + 4] = int(has_not_letters(word))
                if(i > 0):
                    features[index][num_of_features + 5] = int(tokens[i - 1] == '"')
                    features[index][num_of_features + 6] = int(tokens[i - 1] == '(')
                    features[index][num_of_features + 7] = int(tokens[i - 1] == ' ')
                if( i + 1 < len(tokens)):
                    features[index][num_of_features + 8] = int(tokens[i + 1] == ')')
                    features[index][num_of_features + 9] = int(tokens[i + 1] == '"')
                    features[index][num_of_features + 10] = int(tokens[i + 1] == ' ')
                    
                for j, char in enumerate(list(word)):
                    features[index][num_of_features + 11 + j] = encoding[char.upper()]
                    labels[index][j] = int(char.isupper())
                
            else:
                features[index - 1][2 * num_of_features] = index/size
                features[index - 1][2 * num_of_features + 1] = len(word)/MAX_WORD
                features[index - 1][2 * num_of_features + 2] = count_consonants(word)/len(word)
                features[index - 1][2 * num_of_features + 3] = count_vowels(word)/len(word)
                features[index - 1][2 * num_of_features + 4] = int(has_not_letters(word))
                features[index - 1][2 * num_of_features + 5] = int(tokens[i - 1] == '"')
                features[index - 1][2 * num_of_features + 6] = int(tokens[i - 1] == '(')
                features[index - 1][2 * num_of_features + 7] = int(tokens[i - 1] == ' ')
                if( i + 1 < len(tokens)):
                    features[index - 1][2 * num_of_features + 8] = int(tokens[i + 1] == ')')
                    features[index - 1][2 * num_of_features + 9] = int(tokens[i + 1] == '"')
                    features[index - 1][2 * num_of_features + 10] = int(tokens[i + 1] == ' ')
                
                for j, char in enumerate(list(word)):
                    features[index - 1][2 * num_of_features + 11 + j] = encoding[char.upper()]
                    labels[index][j] = int(char.isupper())
                    
                features[index][:2 * num_of_features] = features[index - 1][num_of_features:]
            
            index += 1
            
    return list(features), list(labels)

In [7]:
from collections import Counter
counter = Counter({'О': 35312904, ' ': 25422815, 'Н': 24537989, 'Е': 23969940, 'Т': 21420192, 'С': 19892386, 'А': 16809787, 'И': 14950389, 'Р': 13874547, 'В': 13280775, '"': 9695019, 'К': 7858626, 'Г': 5914467, 'Л': 5851872, 'Б': 5428314, 'Й': 5352151, '\n': 5232476, 'Ь': 4866345, 'П': 4642709, 'Д': 4606840, 'М': 4369257, 'Ч': 3898716, 'Я': 3613597, 'У': 3592997, 'Щ': 3302510, 'Ю': 3018424, 'З': 2625035, 'Ц': 2040518, 'Ы': 1500303, '-': 1443125, 'Ж': 1273203, 'Ф': 1136416, 'Х': 1122008, 'Ш': 674331, 'Э': 502713, '.': 406987, '№': 214440, '1': 194707, '2': 167236, '(': 124233, ')': 124117, '0': 102588, ',': 95619, '3': 92674, '4': 73813, '5': 68067, '7': 58772, '9': 58326, '6': 56049, 'Ъ': 55448, '8': 47595, 'Ё': 29626, 'I': 29581, '/': 28983, '+': 25175, 'X': 13675, "'": 12332, '`': 8071, 'N': 7230, 'T': 5980, 'V': 5571, 'O': 5074, 'A': 5020, 'E': 4842, 'R': 4648, 'L': 4577, 'S': 4524, 'C': 4156, 'P': 3837, 'D': 3460, 'M': 2589, ':': 1590, '!': 1578, '&': 1530, 'U': 1491, 'G': 1320, '<': 1152, 'H': 1102, '>': 1069, 'F': 1058, 'Y': 1054, 'B': 1005, 'K': 927, ';': 892, '\\': 634, 'W': 541, '°': 509, '«': 427, 'Z': 379, '»': 378, '*': 374, '%': 237, '_': 237, 'J': 226, '?': 120, 'Q': 58, '=': 48, '@': 45, '–': 41, '^': 29, '{': 24, '}': 23, '[': 16, '~': 16, ']': 14, '$': 12, '#': 12, '‹': 10, '›': 10, '·': 8, '|': 7, '“': 6, '”': 6, '©': 3, '—': 2, '§': 2, '’': 2, '™': 2, '®': 1, 'Є': 1, '\xa0': 1, '±': 1, '…': 1, '¶': 1})
print(len(counter))
encoding = {}
i = 0
for char in counter:
    encoding[char] = i/len(counter)
    i += 1

124


In [8]:
def generate_dataset(companies, test=False):
    features = np.array([])
    labels = np.array([])
    for company in companies:
        curr_features, curr_labels = extract_features(company)
        if(len(features)):
            features += curr_features
            labels += curr_labels
        else:
            features = curr_features
            labels = curr_labels
            
    return features, labels

In [9]:
def transform(company, labels):
    word_index = 0
    corrected = tokenize(company, spaces=True)
    for i, word in enumerate(corrected):
        if (has_letters(word)):
            if (labels[word_index] == 0):
                corrected[i] = word.lower()
            elif(labels[word_index] == 1):
                corrected[i] = word[0] + word[1:].lower()
            word_index += 1
    return ''.join(corrected)

In [10]:
from torch.utils.data import Dataset

class WordDataset(Dataset):

    def __init__(self, data):

        self.X, self.y = generate_dataset(data)
        self.X = torch.tensor(self.X)
        self.y = torch.tensor(self.y)
        
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):

        return self.X[idx], self.y[idx]

In [11]:
class MixedWordsDataset(Dataset):

    def __init__(self, data):

        
        mixed_words = []
        for company in data:
            for word in tokenize(company):
                if not word[1:].islower() and not word[1:].isupper() and len(word) > 5:
                    mixed_words.append(word)
        
        features = np.zeros((len(mixed_words), MAX_WORD))
        labels = np.zeros((len(mixed_words), MAX_WORD))
        for i in range(len(mixed_words)):
            for j in range(len(mixed_words[i])):
                features[i][j] = encoding[mixed_words[i][j].upper()]
                labels[i][j] = int(mixed_words[i][j].isupper())
        self.X = torch.tensor(features)
        self.y = torch.tensor(labels)
        
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):

        return self.X[idx], self.y[idx]

In [12]:
train_dataset = MixedWordsDataset(train_companies[:100000])
val_dataset = MixedWordsDataset(train_companies[100000:120000])

In [13]:
batch_size = 100

In [14]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=batch_size)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=batch_size)

In [33]:
def train_model(model, loss, optimizer, scheduler, num_epochs):
    for epoch in range(num_epochs):
        print('Epoch {}/{}:'.format(epoch, num_epochs - 1), flush=True)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                dataloader = train_dataloader
                model.train()  # Set model to training mode
            else:
                dataloader = val_dataloader
                model.eval()   # Set model to evaluate mode

            running_loss = 0.

            # Iterate over data.
            for inputs, labels in tqdm(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # forward and backward
                with torch.set_grad_enabled(phase == 'train'):
                    preds = model(inputs.float())
                    loss_value = loss(preds, labels.float())
                    preds_class = preds.argmax(dim=1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss_value.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss_value.item()

            epoch_loss = running_loss / len(dataloader)

            print('{} Loss: {:.4f}'.format(phase, epoch_loss), flush=True)

    return model

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class WordNet(nn.Module):

    def __init__(self, num_of_features):
        
        super(WordNet, self).__init__()
        self.fc1 = nn.Linear(num_of_features, 200) 
        self.fc2 = nn.Linear(200, 120)
        self.fc3 = nn.Linear(120, 84)
        self.fc4 = nn.Linear(84, MAX_WORD)

    def forward(self, x):
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x

In [79]:
model = WordNet(MAX_WORD)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

loss = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3*1.0e-3)

# Decay LR by a factor of 0.1 every 7 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [80]:
train_model(model, loss, optimizer, scheduler, num_epochs=10)

Epoch 0/9:


100%|██████████| 15/15 [00:04<00:00,  3.15it/s]

train Loss: 0.6310



100%|██████████| 3/3 [00:04<00:00,  1.51s/it]

val Loss: 0.5848
Epoch 1/9:



100%|██████████| 15/15 [00:04<00:00,  3.32it/s]

train Loss: 0.5810



100%|██████████| 3/3 [00:04<00:00,  1.47s/it]

val Loss: 0.5823
Epoch 2/9:



100%|██████████| 15/15 [00:04<00:00,  3.50it/s]

train Loss: 0.5802



100%|██████████| 3/3 [00:04<00:00,  1.40s/it]

val Loss: 0.5822
Epoch 3/9:



100%|██████████| 15/15 [00:03<00:00,  4.19it/s]


KeyboardInterrupt: 