This file creates the list of migrant words. That is, the list of words that appeared in at least two of the languages.

In [2]:
import pandas as pd
import numpy as np
import os
import tqdm 

We set the start year for considering the first appearance of a word in any of the languages and take the unique words of each language from that year till the last year.

In [4]:
start_year = 1800

unique_english = pd.read_csv("Top5000/top_english_exc_all.csv",dtype="str",header=None,
                       names = ["word","year",'count'])[(start_year-1740)*5000:].iloc[:,0].unique()
unique_spanish = pd.read_csv("Top5000/top_spanish_exc_all.csv",dtype="str",header=None,
                       names = ["word","year",'count'])[(start_year-1740)*5000:].iloc[:,0].unique()
unique_italian =  pd.read_csv("Top5000/top_italian_exc_all.csv",dtype="str",header=None,
                       names = ["word","year",'count'])[(start_year-1740)*5000:].iloc[:,0].unique()
unique_german =  pd.read_csv("Top5000/top_german_exc_all.csv",dtype="str",header=None,
                       names = ["word","year",'count'])[(start_year-1740)*5000:].iloc[:,0].unique()
unique_french =  pd.read_csv("Top5000/top_french_exc_all.csv",dtype="str",header=None,
                       names = ["word","year",'count'])[(start_year-1740)*5000:].iloc[:,0].unique()
unique_english

array(['great', 'time', 'king', ..., 'folks', 'sighed', 'leaning'],
      dtype=object)

In [5]:
# We put together all the unique words
all_words = np.concatenate((unique_english,unique_spanish,unique_italian
                            ,unique_german,unique_french ))

# And select the words that appear in at least 2 languages 
element_counts = pd.value_counts(all_words)
migrants = element_counts[element_counts.values >= 2].index
migrants

Index(['voltaire', 'michael', 'pierre', 'ct', 'max', 'pitt', 'henry',
       'roosevelt', 'pa', 'william',
       ...
       'serve', 'learning', 'nice', 'voyage', 'treaty', 'incident', 'excuse',
       'babylon', 'rest', 'complexion'],
      dtype='object', length=6058)

In [6]:
df_english = pd.read_csv("Top5000/top_english_exc_all.csv",dtype={"0":str, "1":np.int32, "2":np.int32},
                         names = ["word","year","count"], header=None,index_col=False)[(start_year-1740)*5000:]
df_spanish = pd.read_csv("Top5000/top_spanish_exc_all.csv",dtype={"0":str, "1":np.int32, "2":np.int32},
                         names = ["word","year","count"], header=None,index_col=False)[(start_year-1740)*5000:]
df_italian = pd.read_csv("Top5000/top_italian_exc_all.csv",dtype={"0":str, "1":np.int32, "2":np.int32},
                         names = ["word","year","count"], header=None,index_col=False)[(start_year-1740)*5000:]
df_german  = pd.read_csv("Top5000/top_german_exc_all.csv",dtype={"0":str, "1":np.int32, "2":np.int32},
                         names = ["word","year","count"], header=None,index_col=False)[(start_year-1740)*5000:]
df_french  = pd.read_csv("Top5000/top_french_exc_all.csv",dtype={"0":str, "1":np.int32, "2":np.int32},
                         names = ["word","year","count"], header=None,index_col=False)[(start_year-1740)*5000:]



In [5]:
df_english['Rank'] = df_english.groupby("year")["count"].rank(ascending=False, method='first')
df_english = df_english.sort_values(by=["year", "Rank"])

df_spanish['Rank'] = df_spanish.groupby("year")["count"].rank(ascending=False, method='first')
df_spanish = df_spanish.sort_values(by=["year", "Rank"])

df_italian['Rank'] = df_italian.groupby("year")["count"].rank(ascending=False, method='first')
df_italian = df_italian.sort_values(by=["year", "Rank"])

df_german['Rank'] = df_german.groupby("year")["count"].rank(ascending=False, method='first')
df_german = df_german.sort_values(by=["year", "Rank"])

df_french['Rank'] = df_french.groupby("year")["count"].rank(ascending=False, method='first')
df_french = df_french.sort_values(by=["year", "Rank"])



In [6]:
df_french

Unnamed: 0,word,year,count,Rank
300000,point,1800,257124,1.0
300001,faire,1800,245328,2.0
300002,eau,1800,167858,3.0
300003,partie,1800,163174,4.0
300004,corps,1800,161718,5.0
...,...,...,...,...
1349995,implications,2009,21691,4996.0
1349996,trajectoire,2009,21691,4997.0
1349997,lot,2009,21684,4998.0
1349998,républicains,2009,21678,4999.0


In [7]:
df_french[df_french["year"]==1801]

Unnamed: 0,word,year,count,Rank
305000,faire,1801,370542,1.0
305001,point,1801,365589,2.0
305002,fut,1801,226358,3.0
305003,roi,1801,212330,4.0
305004,état,1801,207680,5.0
...,...,...,...,...
309995,miroir,1801,3348,4996.0
309996,remonte,1801,3346,4997.0
309997,épaisses,1801,3344,4998.0
309998,mêlés,1801,3344,4999.0


In [18]:
# We will create the migrant data frame, with the words that are in at least two languages

migrant_df = pd.DataFrame(columns=["word", "year", "count", "language"])

for word in tqdm.tqdm(migrants):
    if word in unique_english:
        row = df_english[df_english.iloc[:, 0] == word].iloc[0].to_dict()
        row["language"] = "english"
        migrant_df = migrant_df._append(row, ignore_index=True)
    
    if word in unique_spanish:
        row = df_spanish[df_spanish.iloc[:, 0] == word].iloc[0].to_dict()
        row["language"] = "spanish"
        migrant_df = migrant_df._append(row, ignore_index=True)
    
    if word in unique_italian:
        row = df_italian[df_italian.iloc[:, 0] == word].iloc[0].to_dict()
        row["language"] = "italian"
        migrant_df = migrant_df._append(row, ignore_index=True)
    
    if word in unique_german:
        row = df_german[df_german.iloc[:, 0] == word].iloc[0].to_dict()
        row["language"] = "german"
        migrant_df = migrant_df._append(row, ignore_index=True)
    
    if word in unique_french:
        row = df_french[df_french.iloc[:, 0] == word].iloc[0].to_dict()
        row["language"] = "french"
        migrant_df = migrant_df._append(row, ignore_index=True)

100%|██████████████████████████████████████████████████████████████████████████████| 4999/4999 [10:41<00:00,  7.79it/s]


migrant_df has all the words that appeared in at least two languages, along with the first year of appearance, count and rank in each language.

In [19]:
migrant_df

Unnamed: 0,word,year,count,language,Rank
0,george,1820,51423,english,455.0
1,george,1918,4984,spanish,4743.0
2,george,1918,2028,italian,4028.0
3,george,1916,5122,german,4680.0
4,george,1828,8038,french,4899.0
...,...,...,...,...,...
11839,dict,1824,29993,french,1420.0
11840,decis,1828,1874,spanish,1913.0
11841,decis,1822,2157,italian,2308.0
11842,abd,1822,3908,italian,1192.0


In [20]:
migrant_df = migrant_df.sort_values(by=["word","year","Rank"],ascending=True)
migrant_df

Unnamed: 0,word,year,count,language,Rank
3002,aa,1820,798,spanish,4548.0
3003,aa,1821,878,italian,4650.0
3004,aa,1942,3169,german,4923.0
11007,abandon,1820,8026,french,3584.0
11006,abandon,1820,8672,english,3616.0
...,...,...,...,...,...
7630,élite,1991,34612,spanish,4752.0
9409,état,1820,317680,french,10.0
9408,état,1865,4865,italian,2681.0
9967,états,1820,78298,french,227.0


In [22]:
migrant_df.to_csv("Migrant/migrant_1800.csv")

# Reading the results

Here we read the resulting file of migrant words

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
migrant = pd.read_csv("Migrant/migrant_1800.csv")
migrant.drop(migrant.columns[0],inplace=True,axis=1)

For example, this shows the result for the word "plasma", which first appeared in the top 5000 words of german in the year 1886 (with a rank of 4796). Then it appeared in italian in 1942, then english and then french.

In [9]:
migrant[migrant["word"]=="plasma"]

Unnamed: 0,word,year,count,language,Rank
10008,plasma,1886,4850,german,4796.0
10009,plasma,1942,3588,italian,3834.0
10010,plasma,1951,46242,english,4708.0
10011,plasma,1955,9394,french,4846.0


The "migrants" function let's us find all the migrant words that originated from language_out and got into "language_in" inside a set of years

In [12]:
def migrants(language_out,language_in,years):
    #Take only the first appearances of words
    first_appearance = migrant.drop_duplicates(subset='word', keep='first')
    
    #take the words created by the out language.
    out_words = first_appearance[first_appearance["language"] == language_out ]
    
    #Take the words that
    migrated_from_out = migrant[(migrant["word"].isin(out_words["word"])) & 
                               (migrant["year"].isin(years)) &
                                (migrant["language"] == language_in)
                               ]
    return(migrated_from_out)



Words from english to spanish in the year range from 1850 to 1910, along with their data in spanish.

In [13]:
migrants("english","spanish",range(1850,1910))

Unnamed: 0,word,year,count,language,Rank
142,adela,1864,2992,spanish,4774.0
407,amanda,1868,5696,spanish,2033.0
949,aud,1857,2494,spanish,4215.0
990,australia,1884,5304,spanish,2832.0
1522,brothers,1895,3568,spanish,3795.0
1612,byron,1873,3254,spanish,2861.0
2086,chap,1900,9014,spanish,1997.0
2503,company,1902,3980,spanish,4651.0
2906,control,1879,3296,spanish,3563.0
3061,cost,1878,2872,spanish,4181.0
