Goal: Find possible translation candidates for "The Great Gatsby" by F. Scott Fitzgerald.
1. Merge Hathitrust dataset with true translations dataset
2. Translate each source into English using google translate API*
3. For each NLP method, take the top n-similarity results. We would expect a good NLP method to have low similarity
to the random dataset and high similarity to the true dataset
4. Proportion of results that intersect with actual translations dataset can be considered training accuracy

Other considerations:
* T/N weights don't need to be considered for the elementary similarity methods since no actual training is occurring
* Only issue is introducing too high dimensionality

In [148]:
import pandas as pd
import numpy as np
# Random sample of catalogue records. The "Unnamed: 0" column is dropped
# (presumably a leftover index column from when the CSV was written).
random_data = pd.read_csv("random_data.csv").drop(columns=["Unnamed: 0"])

# Known translated titles of the novel, each paired with a 3-letter language code.
data = [
    ["Velký Gatsby", "cze"],
    ["El gran Gatsby", "spa"],
    ["O Grande Gatsby", "por"],
    ["Veliki Gatsby", "hrv"],
    ["Den store Gatsby", "dan"],
    ["Suur Gatsby", "est"],
    ["The Great Gatsby - Kultahattu", "fin"],
    ["Didi getsbi", "geo"],
    ["Đại Gia Gatsby", "vie"],
]
true_data = pd.DataFrame(data, columns=["title", "language"])

# Number of known translations; reused later as the top-n cut-off.
true_data_size = true_data.shape[0]


In [13]:
# Preview the random-title sample loaded from random_data.csv.
random_data

Unnamed: 0,record number,title,language
0,7672958,Caravan days,eng
1,6496982,First steps in organizing playgrounds by Lee F...,eng
2,1940965,Un missionnaire catholique en Angleterre sous ...,fre
3,364121,"Our struggle for the fourteenth colony, Canada...",eng
4,9212680,Optimum distribution of material in sandwich p...,eng
...,...,...,...
105,1495944,The earth before history; man's origin and the...,eng
106,324233,"Unfairly won, a novel",eng
107,8643242,Fragments upon the balance of power in Europe,eng
108,7422530,When a patient speaks-- : patient representati...,eng


In [10]:
# Preview the hand-entered list of known translated titles.
true_data

Unnamed: 0,title,language
0,Velký Gatsby,cze
1,El gran Gatsby,spa
2,O Grande Gatsby,por
3,Veliki Gatsby,hrv
4,Den store Gatsby,dan
5,Suur Gatsby,est
6,The Great Gatsby - Kultahattu,fin
7,Didi getsbi,geo
8,Đại Gia Gatsby,vie


In [59]:
# Stack the random sample and the known translations into one frame and label
# each row's origin explicitly: valid=True marks a known translation.
# The label is set from the source frame rather than inferred from a missing
# "record number", so a random record that lacks a number is not mistaken for
# a translation, and identical (title, language) pairs are never collapsed.
# Rows from true_data have no "record number", so that column is NaN for them.
full_data = pd.concat(
    [random_data.assign(valid=False), true_data.assign(valid=True)],
    ignore_index=True,
    sort=False,
)

## Translate to common language

In [52]:
# Google Translate uses 2-letter language codes, so the 3-letter code in
# `language` has to be mapped to its associated 2-letter code first.
full_data

Unnamed: 0,record number,title,language,valid
0,7672958.0,Caravan days,eng,False
1,6496982.0,First steps in organizing playgrounds by Lee F...,eng,False
2,1940965.0,Un missionnaire catholique en Angleterre sous ...,fre,False
3,364121.0,"Our struggle for the fourteenth colony, Canada...",eng,False
4,9212680.0,Optimum distribution of material in sandwich p...,eng,False
...,...,...,...,...
113,,Den store Gatsby,dan,True
114,,Suur Gatsby,est,True
115,,The Great Gatsby - Kultahattu,fin,True
116,,Didi getsbi,geo,True


In [56]:
# Parallel lookup lists read from the same CSV: l_code_3[i] (column "alpha3-b")
# and l_code_2[i] (column "alpha2") come from the same row.
# The file is parsed once and both columns are taken from that single frame.
language_codes = pd.read_csv("language-codes-3b2.csv")
l_code_3 = list(language_codes["alpha3-b"])
l_code_2 = list(language_codes["alpha2"])

In [60]:
# Convert each 3-letter code to the 2-letter code at the same position in the
# lookup lists. list.index raises ValueError for a code missing from l_code_3.
full_data["language2"] = full_data["language"].apply(lambda x: l_code_2[l_code_3.index(x)])

In [63]:
import translators as ts

def translate(title, lang_from):
    """Translate `title` into English with the `translators` Google backend.

    Parameters
    ----------
    title : text to translate.
    lang_from : language code of `title`, forwarded as `from_language`.

    Returns
    -------
    Whatever `ts.google` returns for the request (target language 'en').
    """
    english_text = ts.google(title, from_language=lang_from, to_language='en')
    return english_text
    

Using United States server backend.


In [70]:
# Translate every title to English; translate() is called once per row with
# the row's title and its 2-letter source-language code.
full_data["translation"] = [
    translate(row_title, row_language)
    for row_title, row_language in zip(full_data["title"], full_data["language2"])
]

In [74]:
# Lower-case the translations so token comparison is case-insensitive.
full_data["translation"] = full_data["translation"].map(str.lower)

## Tokenize

In [83]:
import nltk
import pickle

# Split each lower-cased translation into a list of word tokens.
full_data["tokens"] = full_data["translation"].map(nltk.word_tokenize)


In [84]:
# Inspect the frame with the added language2, translation and tokens columns.
full_data

Unnamed: 0,record number,title,language,valid,language2,translation,tokens
0,7672958.0,Caravan days,eng,False,en,caravan days,"[caravan, days]"
1,6496982.0,First steps in organizing playgrounds by Lee F...,eng,False,en,first steps in organizing playgrounds by lee f...,"[first, steps, in, organizing, playgrounds, by..."
2,1940965.0,Un missionnaire catholique en Angleterre sous ...,fre,False,fr,a catholic missionary in england under the rei...,"[a, catholic, missionary, in, england, under, ..."
3,364121.0,"Our struggle for the fourteenth colony, Canada...",eng,False,en,"our struggle for the fourteenth colony, canada...","[our, struggle, for, the, fourteenth, colony, ..."
4,9212680.0,Optimum distribution of material in sandwich p...,eng,False,en,optimum distribution of material in sandwich p...,"[optimum, distribution, of, material, in, sand..."
...,...,...,...,...,...,...,...
113,,Den store Gatsby,dan,True,da,the great gatsby,"[the, great, gatsby]"
114,,Suur Gatsby,est,True,et,great gatsby,"[great, gatsby]"
115,,The Great Gatsby - Kultahattu,fin,True,fi,the great gatsby - gold hat,"[the, great, gatsby, -, gold, hat]"
116,,Didi getsbi,geo,True,ka,didi getsbi,"[didi, getsbi]"


# Model 1: Bag of Words

In [116]:
# Vocabulary: sorted array of the distinct tokens across all translated titles.
# np.unique over the flattened token lists sorts and de-duplicates in one
# pass, instead of re-sorting the growing vocabulary once per title.
wordset = np.unique([token for tokens in full_data["tokens"] for token in tokens])

# Persist the vocabulary; the context manager guarantees the file is closed.
with open("wordset.pickle", "wb") as wordset_file:
    pickle.dump(wordset, wordset_file)

In [115]:
def BoW(title):
    """Return the bag-of-words count vector of a tokenized title.

    Parameters
    ----------
    title : iterable of str
        Tokens of one title.

    Returns
    -------
    list of int
        One count per entry of the module-level `wordset`, in `wordset` order.
        Tokens that are not in `wordset` have no dimension in the vector and
        are ignored instead of raising KeyError.
    """
    freq_dict = dict.fromkeys(wordset, 0)
    for word in title:
        # Only count words that belong to the vocabulary.
        if word in freq_dict:
            freq_dict[word] += 1
    return list(freq_dict.values())

In [158]:
# Reference vector for the English title; the similarity functions below
# compare every candidate vector against it. Built with BoW so it shares the
# same vocabulary ordering as the candidates.
ORIGINAL_BOW = BoW(["the","great","gatsby"])


def cosine_similarity(BoW):
    """Cosine similarity between `BoW` and the reference vector ORIGINAL_BOW.

    Parameters
    ----------
    BoW : sequence of numbers
        Count vector with the same length as ORIGINAL_BOW.

    Returns
    -------
    float
        dot(ORIGINAL_BOW, BoW) / (|ORIGINAL_BOW| * |BoW|), or 0.0 when either
        vector has zero length (the cosine is undefined there and the plain
        formula would yield nan with a runtime warning).
    """
    norm_product = np.linalg.norm(ORIGINAL_BOW) * np.linalg.norm(BoW)
    if norm_product == 0:
        # An all-zero vector shares nothing with the reference.
        return 0.0
    return np.dot(ORIGINAL_BOW, BoW) / norm_product

def euclidian_distance(BoW):
    """Euclidean (L2) distance between `BoW` and the reference ORIGINAL_BOW.

    `BoW` must have the same length as ORIGINAL_BOW. Smaller means closer.
    """
    offset = np.subtract(ORIGINAL_BOW, BoW)
    return np.linalg.norm(offset)

def dot_product(BoW):
    """Inner product of `BoW` with the reference vector ORIGINAL_BOW.

    `BoW` must have the same length as ORIGINAL_BOW.
    """
    reference = np.asarray(ORIGINAL_BOW)
    candidate = np.asarray(BoW)
    return reference.dot(candidate)

def jaccard_similarity(BoW):
    """Jaccard index of the word sets behind `BoW` and ORIGINAL_BOW.

    A vocabulary position counts as shared when the element-wise product is
    non-zero, and as present in either vector when the element-wise sum is
    non-zero; the result is shared / present.
    """
    candidate = np.array(BoW)
    reference = np.array(ORIGINAL_BOW)
    shared = np.count_nonzero(candidate * reference)
    present = np.count_nonzero(candidate + reference)
    return shared / present

In [159]:
# Vectorise every title once and reuse the vectors for all four metrics,
# rather than rebuilding the bag-of-words vector separately for each metric.
bow_vectors = full_data["tokens"].map(BoW)

# Score each title's vector against the reference title vector.
full_data["cosine similarity"] = bow_vectors.map(cosine_similarity)
full_data["euclidian distance"] = bow_vectors.map(euclidian_distance)
full_data["dot product"] = bow_vectors.map(dot_product)
full_data["jaccard similarity"] = bow_vectors.map(jaccard_similarity)

In [160]:
# Inspect the frame with the four similarity/distance columns added.
full_data

Unnamed: 0,record number,title,language,valid,language2,translation,tokens,cosine similarity,euclidian distance,dot product,jaccard similarity
0,7672958.0,Caravan days,eng,False,en,caravan days,"[caravan, days]",0.000000,2.236068,0,0.000000
1,6496982.0,First steps in organizing playgrounds by Lee F...,eng,False,en,first steps in organizing playgrounds by lee f...,"[first, steps, in, organizing, playgrounds, by...",0.000000,3.464102,0,0.000000
2,1940965.0,Un missionnaire catholique en Angleterre sous ...,fre,False,fr,a catholic missionary in england under the rei...,"[a, catholic, missionary, in, england, under, ...",0.182574,3.316625,1,0.083333
3,364121.0,"Our struggle for the fourteenth colony, Canada...",eng,False,en,"our struggle for the fourteenth colony, canada...","[our, struggle, for, the, fourteenth, colony, ...",0.308607,3.605551,2,0.076923
4,9212680.0,Optimum distribution of material in sandwich p...,eng,False,en,optimum distribution of material in sandwich p...,"[optimum, distribution, of, material, in, sand...",0.000000,5.291503,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
113,,Den store Gatsby,dan,True,da,the great gatsby,"[the, great, gatsby]",1.000000,0.000000,3,1.000000
114,,Suur Gatsby,est,True,et,great gatsby,"[great, gatsby]",0.816497,1.000000,2,0.666667
115,,The Great Gatsby - Kultahattu,fin,True,fi,the great gatsby - gold hat,"[the, great, gatsby, -, gold, hat]",0.707107,1.732051,3,0.500000
116,,Didi getsbi,geo,True,ka,didi getsbi,"[didi, getsbi]",0.000000,2.236068,0,0.000000


In [155]:
# Snapshot the current frame to disk; the context manager guarantees the
# file handle is flushed and closed.
with open("bow_matrix.pickle", "wb") as matrix_file:
    pickle.dump(full_data, matrix_file)

### Evaluating Performance

In [161]:
# Percentage of true translations each method found: rank all titles by the
# metric, keep the best `true_data_size` rows, and print the share of them
# that are known translations (mean of the boolean `valid` column).

# Each entry pairs a score column with the sort direction that puts the best
# matches first (distance: smallest first; similarities: largest first).
# Printed in this order: cosine, euclidian, dot product, jaccard.
ranking_rules = [
    ("cosine similarity", False),
    ("euclidian distance", True),
    ("dot product", False),
    ("jaccard similarity", False),
]

for score_column, smallest_first in ranking_rules:
    ranked = full_data.sort_values(score_column, ascending=smallest_first)
    top_matches = ranked.iloc[:true_data_size]
    print(top_matches["valid"].mean())

0.7777777777777778
0.8888888888888888
0.2222222222222222
0.8888888888888888
