In [None]:
%%bash
# Fetch the two raw word lists (-N: only re-download when the remote copy is newer).
wget -qN http://www.lexique.org/listes/liste_mots.txt
wget -qN https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt

# label_and_sample LABEL
#   Reads one word per line on stdin, keeps lines that contain a lowercase
#   letter and no underscore/space, prefixes each with LABEL, and emits
#   10000 of them in random order.
label_and_sample() {
    grep "[a-z]" | grep -v "[_ ]" | awk -v label="$1" '{print label, $0}' | shuf | head -10000
}

# Label 1: first column of liste_mots.txt; label 0: words_alpha.txt.
# The merged sample is transliterated to ASCII and shuffled into words.txt.
{
    cut -f1 liste_mots.txt | label_and_sample 1
    label_and_sample 0 < words_alpha.txt
} | iconv -f utf8 -t ascii//TRANSLIT | shuf > words.txt

# Show the first few "<label> <word>" lines as a sanity check.
head words.txt

In [None]:
# Load the labelled word list: every line of words.txt is "<label> <word>".
words, labels = [], []
with open('words.txt') as fp:
    for entry in fp:
        # split() with no argument also discards the trailing newline;
        # unpacking raises ValueError if a line does not have exactly two fields.
        tag, token = entry.split()
        words.append(token)
        labels.append(int(tag))

# Peek at the first ten words and their labels.
print(words[:10])
print(labels[:10])

In [None]:
import collections
# Symbol -> integer id mapping. Looking up an unseen symbol inserts it with
# the next free id (the current size of the mapping).
vocab = collections.defaultdict(lambda: len(vocab))

# Encode every word as the list of ids of its characters, in order.
features = [[vocab[symbol] for symbol in word] for word in words]

# First ten encoded words, then the number of distinct symbols seen.
print(features[:10])
print(len(vocab))

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Bag-of-characters matrix: one row per word, one column per vocabulary id;
# each cell counts how many times that character occurs in the word.
X = torch.zeros(len(words), len(vocab))

for i, x in enumerate(features):
    for feature in x:
        X[i, feature] += 1

import random  # no longer used in this cell; kept so any other cell relying on it still works

# Labels must stay in the same order as `words`/`features`, because row i of X
# is paired with Y[i]. They are therefore NOT shuffled here: shuffling them on
# their own would attach a random label to every word (and would change
# `labels` again on every re-run of this cell). The examples are already in
# random order in words.txt, and the DataLoader shuffles batches.
Y = torch.LongTensor(labels)

# Sanity check: one word next to its count vector.
print(words[33], X[33])

In [None]:
from torch.utils.data import TensorDataset, DataLoader
# Pair each feature row of X with its label in Y ...
train_set = TensorDataset(X, Y)
# ... and serve the pairs in freshly shuffled mini-batches of four examples.
train_loader = DataLoader(dataset=train_set, batch_size=4, shuffle=True)

In [None]:
class LinearModel(nn.Module):
    """Single linear layer mapping a feature vector to per-class scores.

    Args:
        input_size: number of input features. Defaults to the current size
            of the notebook-level `vocab`, read when the model is built.
        num_classes: number of output scores per example (default 2).
    """

    def __init__(self, input_size=None, num_classes=2):
        super().__init__()
        if input_size is None:
            # Default: one input per vocabulary entry, as in the original cell.
            input_size = len(vocab)
        self.l1 = nn.Linear(input_size, num_classes)

    def forward(self, x):
        """Return the raw (unnormalised) class scores for the batch `x`."""
        return self.l1(x)

# Instantiate the classifier; its input width is len(vocab) at this point.
model = LinearModel()
# Bare expression so the notebook displays the module's repr.
model

In [None]:
def fit(model, epochs, loader=None):
    """Train `model` with Adam and cross-entropy, printing the loss per epoch.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        epochs: number of full passes over the data.
        loader: iterable yielding (inputs, targets) batches. Defaults to the
            notebook-level `train_loader`.

    Prints one line per epoch: the epoch index followed by the average
    per-example loss over that epoch (`nan` if the loader yielded nothing).
    """
    if loader is None:
        loader = train_loader
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    for epoch in range(epochs):
        total_loss = 0.0
        num = 0
        for x, y in loader:
            optimizer.zero_grad()
            y_scores = model(x)
            loss = criterion(y_scores, y)
            loss.backward()
            optimizer.step()
            # The criterion returns the mean over the batch, so weight it by
            # the batch size before accumulating; dividing by `num` below then
            # gives a true per-example average even if the last batch is short.
            total_loss += loss.item() * len(y)
            num += len(y)
        print(epoch, total_loss / num if num else float('nan'))
# Train the model for 10 passes over train_loader; fit prints one loss line per epoch.
fit(model, 10)

In [None]:
def predict(word):
    """Return the predicted class index (a Python int) for a single word.

    The word is encoded the same way as the training data: a count vector
    over the characters of the word itself, with no boundary markers added.
    Characters that are not in `vocab` are ignored.
    """
    x = torch.zeros(1, len(vocab))
    for feature in word:
        # Membership test first: indexing the defaultdict with an unknown
        # character would silently add a new vocabulary entry.
        if feature in vocab:
            x[0, vocab[feature]] += 1
    # Inference only, so no gradient bookkeeping is needed.
    with torch.no_grad():
        y_scores = model(x)
    # Index of the highest score along the class dimension.
    y_pred = torch.max(y_scores, 1)[1]
    return y_pred.item()

# Try the classifier on two sample words. In the data-preparation cell,
# label 1 is given to words from liste_mots.txt and label 0 to words from
# words_alpha.txt.
print(predict('coming'))
print(predict('venu'))