In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the place/suffix table. Rows without a suffix carry no label, so they are
# dropped explicitly instead of casting to str and comparing against the literal
# 'nan' (which depends on how astype(str) renders missing values).
df = pd.read_csv('data.csv').dropna(subset=['Suffix'])
# assign() builds a new frame, so re-running this cell is idempotent and never
# writes into a filtered view.
df = df.assign(
    Place=df['Place'].astype(str),
    Suffix=df['Suffix'].astype(str),
)

# Inputs and labels are both lower-cased before splitting.
X = df['Place'].str.lower().tolist()
Y = df['Suffix'].str.lower().tolist()
# Fixed random_state keeps the 80/20 split stable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [2]:
# Row count after the rows without a suffix were filtered out above.
len(df)

967

In [3]:
# blocksize, embeddingsize, lr, epochs = 3, 4, 0.01, 1000
import torch.nn.functional as F
import torch
class embeddnetwork():
    """Character-level MLP that predicts a suffix label from the tail of a word.

    The last ``blocksize`` characters of a word are looked up in an embedding
    table ``C``, concatenated, passed through one tanh hidden layer
    (``W1``/``b1``) and projected to one logit per known suffix (``W2``/``b2``).

    Attributes:
        characters: the fixed character vocabulary.
        stoi: character -> row index in ``C``.
        suffixtoi / itosuffix: label <-> class-index maps, always kept in sync.
        parameters: list of the trainable tensors (a plain attribute, not a method).
    """

    def __init__(self, embeddingdim, blocksize, suffixcorpus, hiddendim=100):
        """Create the vocabulary, label maps and randomly initialised weights.

        Args:
            embeddingdim: width of each character embedding.
            blocksize: number of trailing characters used as context.
            suffixcorpus: iterable of suffix labels known up front (may be
                empty or None; labels first seen in ``fit`` are added there).
            hiddendim: width of the hidden layer (default 100).
        """
        self.embeddingdim = embeddingdim
        self.blocksize = blocksize
        self.hiddendim = hiddendim
        if suffixcorpus is None:
            suffixcorpus = []
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label -> index mapping is identical on every run; iterating a set of
        # strings gives no such guarantee.
        self.suffixtoi = {s: i for i, s in enumerate(dict.fromkeys(suffixcorpus))}
        self.itosuffix = {i: s for s, i in self.suffixtoi.items()}
        # Raw string: the vocabulary contains a literal backslash followed by
        # ':', which is an invalid escape sequence in a normal string literal.
        self.characters = r"""abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890`~!@#$%^&*()_-+={[}]|\:;"'<,>.?/ """
        self.C = torch.randn((len(self.characters), self.embeddingdim))
        self.W1 = torch.randn((self.blocksize*self.embeddingdim, hiddendim))
        self.b1 = torch.rand(hiddendim)
        self.W2 = torch.randn((hiddendim, len(self.suffixtoi)))
        self.b2 = torch.randn(len(self.suffixtoi))
        self.stoi = {s: i for i, s in enumerate(self.characters)}
        self.parameters = [self.C, self.W1, self.b1, self.W2, self.b2]
        for p in self.parameters:
            p.requires_grad = True

    def countrytoindx(self, country):
        """Encode the last ``blocksize`` characters of ``country`` as indices.

        Words shorter than ``blocksize`` are left-padded with the index of
        '_'. Characters outside the vocabulary are mapped to that same index
        instead of raising ``KeyError``, so arbitrary text can be encoded.

        Returns:
            A list of exactly ``blocksize`` ints.
        """
        pad = self.stoi['_']
        context = [self.stoi.get(char, pad) for char in country[-self.blocksize:]]
        return [pad]*(self.blocksize - len(context)) + context

    def _encode(self, words):
        """Return a (len(words), blocksize) long tensor of character indices."""
        rows = [self.countrytoindx(word) for word in words]
        return torch.tensor(rows, dtype=torch.long).view(-1, self.blocksize)

    def _forward(self, idx):
        """Compute class logits for a batch of encoded contexts."""
        emb = self.C[idx]
        h = torch.tanh(emb.view(-1, self.blocksize*self.embeddingdim)@self.W1 + self.b1)
        return h@self.W2 + self.b2

    def _register_suffixes(self, suffixes):
        """Add unseen labels to both label maps and widen the output layer to match."""
        for suffix in suffixes:
            if suffix not in self.suffixtoi:
                ix = len(self.suffixtoi)
                self.suffixtoi[suffix] = ix
                self.itosuffix[ix] = suffix
        grow = len(self.suffixtoi) - self.W2.shape[1]
        if grow > 0:
            # Keep the existing columns and append freshly initialised ones
            # for the new classes.
            W2 = torch.cat([self.W2.detach(), torch.randn((self.W2.shape[0], grow))], dim=1)
            b2 = torch.cat([self.b2.detach(), torch.randn(grow)])
            self.W2 = W2.requires_grad_(True)
            self.b2 = b2.requires_grad_(True)
            # The list must point at the new tensors or they would never be updated.
            self.parameters = [self.C, self.W1, self.b1, self.W2, self.b2]

    def fit(self, X, Y, lr, epochs, log_every=1):
        """Train with plain gradient descent on cross-entropy loss.

        Each epoch draws ``len(X)`` sample indices uniformly with replacement
        and takes one gradient step on that batch.

        Args:
            X: iterable of input words.
            Y: iterable of suffix labels, same length as ``X``.
            lr: learning rate.
            epochs: number of gradient steps.
            log_every: print the loss every this many epochs (default 1, i.e.
                every epoch); 0 or None disables printing.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``Y`` differ in length.
        """
        X = list(X)
        Y = list(Y)
        if len(X) != len(Y):
            raise ValueError(f"X and Y must have the same length, got {len(X)} and {len(Y)}")
        if not X:
            raise ValueError("cannot fit on an empty dataset")

        self._register_suffixes(Y)
        Xnew = self._encode(X)
        Ynew = torch.tensor([self.suffixtoi[suffix] for suffix in Y])
        n = Xnew.shape[0]

        for epoch in range(epochs):
            ix = torch.randint(0, n, (n,))
            logits = self._forward(Xnew[ix])
            loss = F.cross_entropy(logits, Ynew[ix])

            for p in self.parameters:
                p.grad = None
            loss.backward()
            # The update itself must not be recorded by autograd.
            with torch.no_grad():
                for p in self.parameters:
                    p -= lr*p.grad
            if log_every and epoch % log_every == 0:
                print('loss', loss.item())

    def predict(self, X):
        """Return the most likely suffix label for every word in ``X``.

        The argmax is taken over the raw logits: softmax is monotonic, so this
        picks the same class as normalised probabilities without the overflow
        risk of an explicit ``exp``.
        """
        X = list(X)
        if not X:
            return []
        with torch.no_grad():
            logits = self._forward(self._encode(X))
        return [self.itosuffix[ix] for ix in logits.argmax(dim=1).tolist()]



In [4]:
# import random

# # Define the range of hyperparameters to search over
# param_grid = {
#     'lr': [0.001, 0.01, 0.1],
#     'blocksize': [2, 4, 8],
#     'embeddingdim': [50, 100, 200],
#     'epochs': [500, 1000, 1500]
# }

# # Number of random configurations to try
# num_configs = 10

# best_accuracy = 0
# best_params = None

# for _ in range(num_configs):
#     # Randomly sample hyperparameters from the parameter grid
#     params = {
#         'lr': random.choice(param_grid['lr']),
#         'blocksize': random.choice(param_grid['blocksize']),
#         'embeddingdim': random.choice(param_grid['embeddingdim']),
#         'epochs': random.choice(param_grid['epochs'])
#     }

#     # print("Hyperparameters:", params)

#     # Create a new instance of the embeddnetwork with the sampled hyperparameters
#     network = embeddnetwork(params['embeddingdim'], params['blocksize'], Y_train)

#     # Train the network
#     network.fit(X_train, Y_train, params['lr'], params['epochs'])

#     # Evaluate on the held-out split using exact-match accuracy
#     # (swap in a different metric here if needed)
#     predictions = network.predict(X_test)
#     accuracy = sum(1 for true, pred in zip(Y_test, predictions) if true == pred) / len(Y_test)
#     # print("Accuracy:", accuracy)

#     if accuracy > best_accuracy:
#         best_accuracy = accuracy
#         best_params = params

# # print("Best hyperparameters:", best_params)
# # print("Best accuracy:", best_accuracy)


In [5]:
# Seed torch first so the random weight initialisation and the random index
# sampling done on every training step do not vary between runs.
torch.manual_seed(42)
embed = embeddnetwork(blocksize=4, embeddingdim=9, suffixcorpus = Y_train)
embed.fit(X_train, Y_train, lr=0.1, epochs=1050)

loss 19.983306884765625
loss 17.98912811279297
loss 16.45530891418457
loss 15.863031387329102
loss 15.073690414428711
loss 13.886831283569336
loss 13.339851379394531
loss 12.221077919006348
loss 12.258014678955078
loss 12.004724502563477
loss 11.380587577819824
loss 11.538366317749023
loss 10.513252258300781
loss 9.74443531036377
loss 9.756309509277344
loss 9.211165428161621
loss 9.780948638916016
loss 9.442468643188477
loss 8.819121360778809
loss 8.581266403198242
loss 9.424721717834473
loss 8.64840316772461
loss 8.262535095214844
loss 7.856505393981934
loss 7.769970417022705
loss 7.912354469299316
loss 7.64956521987915
loss 8.01409912109375
loss 7.150549411773682
loss 7.446536540985107
loss 7.2242841720581055
loss 6.870298385620117
loss 7.371103286743164
loss 7.188279628753662
loss 6.724444389343262
loss 6.35106897354126
loss 6.9265923500061035
loss 6.483570098876953
loss 6.587934970855713
loss 7.003581523895264
loss 6.398962020874023
loss 6.243101596832275
loss 5.94365119934082
loss

In [6]:
from sklearn.metrics import accuracy_score
# Predict a suffix for every held-out place and score the predictions against
# the true test labels.
Y_pred = embed.predict(X_test)
accuracy = accuracy_score(Y_test,Y_pred)
accuracy
# Earlier hand-rolled accuracy check, left disabled.
# NOTE(review): as written, its loop variable rebinds Y_pred to a single
# predicted string, so len(Y_pred) on the last line would be that string's
# length rather than the number of test samples.
# correct = 0
# for i, Y_pred in enumerate(embed.predict(X_test)):
#     if Y_pred == Y_test[i]:
#         correct += 1
# print(correct/len(Y_pred))

0.5773195876288659

In [7]:
# The training inputs were lower-cased before fitting, so normalise the query
# the same way; otherwise capital letters would hit embeddings never trained.
embed.predict([name.lower() for name in ['Pakistan']])

['i']