In [2]:
#Dependencies

import pandas as pd
import re

In [3]:
#Input CSV locations, defined once so the loading cells never hard-code a path

endangered_data, structure_data, time_data, UNESCO_data = (
    'endangered_status_data.csv',
    'language_structure_data.csv',
    'deaths_data.csv',
    'language_status_UNESCO.csv',
)

In [4]:
#Read in DF number 1: Endangerment of languages.
#The column names are supplied by hand, based on researching the endangered languages
#database. The trailing comment on each entry is the field name noted during that research;
#'Notes' and 'Region' were marked as uncertain at the time (TODO confirm against the source).

endangered_columns = [
    'Location Code',        #1) Location_code
    'Code Value',           #2) code_val
    'Language',             #3) Primary Name
    'Alternate Names',      #4) alternate_names
    'Endangerment Level',   #5) endangerment_level
    'Number of Speakers',   #6) num_speakers
    'Classification',       #7) classification
    'Dialect Varieties',    #8) dialect_varieties
    'Notes',                #9) Notes?
    'Public Comment',       #10) Public_comment
    'Location Places',      #11) locations_places
    'Region',               #12) Region?
    'Coordinates',          #13) Coordinates
]

#keep_default_na=False together with na_values=[""] means only empty fields become NaN;
#text such as "NA" or "nan" is kept as a string (presumably so that language names/codes
#that look like missing-value markers survive -- TODO confirm).
endangered_df = pd.read_csv(
    endangered_data,
    names=endangered_columns,
    keep_default_na=False,
    na_values=[""],
)



In [5]:
#Drop the columns that are not carried into the exported table

unused_columns = ['Location Code', 'Alternate Names', 'Classification', 'Dialect Varieties',
                  'Notes', 'Public Comment', 'Location Places']
final_endangered_df = endangered_df.drop(columns=unused_columns)


In [6]:
#A language can list several coordinate pairs separated by ";" --
#expand them so that every pair lands in its own (integer-labelled) column
coordinates_df = endangered_df['Coordinates'].str.split(pat=";", expand=True)


In [7]:
#There are up to 7 different coordinate locations per language.
#Label the split-out columns: 0 and 1 become "coord_primary" and "coord_secondary",
#2-6 become "Coord 3" - "Coord 7". index=str also turns the row labels into strings.

coord_labels = {0: "coord_primary", 1: "coord_secondary"}
coord_labels.update({pos: "Coord {}".format(pos + 1) for pos in range(2, 7)})

coords_renamed_df = coordinates_df.rename(index=str, columns=coord_labels)

In [8]:
#Pull out the primary coordinates for each language and separate them into
#latitude (first half) and longitude (second half).
#The source writes pairs both as "lat,lon" and as "lat, lon", so each half is stripped;
#without that the longitude keeps a leading space (e.g. " -149.37").
#The values stay strings here -- no numeric conversion is done.

primary_coords_df = (
    coords_renamed_df['coord_primary']
    .str.split(",", expand=True)
    .rename(columns={0: "Primary Latitude", 1: "Primary Longitude"})
)

#Plain lists, so the later column assignment matches rows by position rather than by index label
lat_col = primary_coords_df['Primary Latitude'].str.strip().tolist()
lon_col = primary_coords_df['Primary Longitude'].str.strip().tolist()

In [9]:
#Attach the primary latitude/longitude lists as new columns (matched to rows by position)
final_endangered_df = final_endangered_df.assign(Latitude=lat_col, Longitude=lon_col)

#final_en_df is a second name for the same frame, not a copy
final_en_df = final_endangered_df


In [10]:
#Remove the "endangerment level" commentary: keep only the text before the first "(".
#.str[0] picks that first piece as a single Series. Assigning the expand=True DataFrame
#to one column raises "Columns must be same length as key" on current pandas whenever
#the split produces more than one column.

final_en_df['Endangerment Level'] = (
    final_en_df['Endangerment Level']
    .str.split("(", n=1)
    .str[0]
    .str.strip()   #drop any whitespace left in front of the parenthesis
)

final_en_df.head()

Unnamed: 0,Code Value,Language,Endangerment Level,Number of Speakers,Region,Coordinates,Latitude,Longitude
0,knw,!Xun,Vulnerable,"14,000-18,000",Africa,"-28.74358,23.983154; -17.560247, 18.050537; -1...",-28.74358,23.983154
1,bpk,'Ôrôê,Endangered,590,Pacific,"-21.4223,165.4678",-21.4223,165.4678
2,taa,(Lower) Tanana,Critically Endangered,25,North America,"65.157778, -149.37;64.521111, -146.980556;64.5...",65.157778,-149.37
3,aas,Aasáx,Dormant,0,Africa,"-5.1948,37.738",-5.1948,37.738
4,abg,Abaga,Critically Endangered,5,Pacific,"-6.1883,145.5976",-6.1883,145.5976


In [11]:
#Clean up the Number of Speakers column: for a range such as "14,000-18,000" keep only
#the part before the first "-" (the lower bound when the range is written low-high).
#.str[0] yields a single Series; assigning the multi-column expand=True result to one
#column raises "Columns must be same length as key" on current pandas.
final_en_df['Number of Speakers'] = final_en_df['Number of Speakers'].str.split("-", n=1).str[0]
final_en_df.head()

Unnamed: 0,Code Value,Language,Endangerment Level,Number of Speakers,Region,Coordinates,Latitude,Longitude
0,knw,!Xun,Vulnerable,14000,Africa,"-28.74358,23.983154; -17.560247, 18.050537; -1...",-28.74358,23.983154
1,bpk,'Ôrôê,Endangered,590,Pacific,"-21.4223,165.4678",-21.4223,165.4678
2,taa,(Lower) Tanana,Critically Endangered,25,North America,"65.157778, -149.37;64.521111, -146.980556;64.5...",65.157778,-149.37
3,aas,Aasáx,Dormant,0,Africa,"-5.1948,37.738",-5.1948,37.738
4,abg,Abaga,Critically Endangered,5,Pacific,"-6.1883,145.5976",-6.1883,145.5976


In [12]:
#Same clean-up for ranges written with a tilde: keep only the part before the first "~".
#.str[0] yields a single Series (a multi-column expand=True result cannot be assigned to one column).
#NOTE(review): a value that *starts* with "~" (e.g. an approximate count) becomes empty here --
#confirm against the data whether that case occurs.
final_en_df['Number of Speakers'] = final_en_df['Number of Speakers'].str.split("~", n=1).str[0]

#Delete every non-word character (e.g. the thousands separator in "14,000") from Number of Speakers.
#The pattern is a raw string and regex=True is explicit: with a literal (non-regex) replace
#the text "\W" never occurs and nothing would be removed.
final_en_df['Number of Speakers'] = final_en_df['Number of Speakers'].str.replace(r'\W', "", regex=True)

#Drop the Coordinates column, which holds multiple lat/lon pairs per row;
#the primary pair is already available in Latitude / Longitude
final_en_df = final_en_df.drop(columns=['Coordinates'])

final_en_df.head()

Unnamed: 0,Code Value,Language,Endangerment Level,Number of Speakers,Region,Latitude,Longitude
0,knw,!Xun,Vulnerable,14000,Africa,-28.74358,23.983154
1,bpk,'Ôrôê,Endangered,590,Pacific,-21.4223,165.4678
2,taa,(Lower) Tanana,Critically Endangered,25,North America,65.157778,-149.37
3,aas,Aasáx,Dormant,0,Africa,-5.1948,37.738
4,abg,Abaga,Critically Endangered,5,Pacific,-6.1883,145.5976


In [13]:
#Write the cleaned endangered-languages table to disk as CSV, leaving out the row index
final_en_df.to_csv(path_or_buf='raw data', index=False)

In [16]:
#Write the cleaned endangered-languages table to disk as JSON
final_en_df.to_json(path_or_buf='raw data.json')

In [14]:
#Load the language structure data, keep only the columns we want, and rename them so the
#frame can be joined on language / code value and its lats/lons can be told apart from
#the ones in the endangered dataset
structure_columns = ['Name', 'iso_code', 'latitude', 'longitude', 'genus',
                     'family', 'countrycodes']
structure_renames = {
    'Name': 'Language',
    'iso_code': 'Code Value',
    'latitude': 'lat_str',
    'longitude': 'lon_str',
}

#Only empty fields are read as NaN; pandas' default missing-value markers are switched off
structure_raw = pd.read_csv(structure_data, keep_default_na=False, na_values=[""])
structure_df = structure_raw[structure_columns].rename(columns=structure_renames)

structure_df.head()



Unnamed: 0,Language,Code Value,lat_str,lon_str,genus,family,countrycodes
0,Arapesh (Abu),,-3.45,142.95,Kombio-Arapesh,Torricelli,PG
1,Aari,aiw,6.0,36.583333,South Omotic,Afro-Asiatic,ET
2,Abau,aau,-4.0,141.25,Upper Sepik,Sepik,PG
3,Arabic (Chadian),shu,13.833333,20.833333,Semitic,Afro-Asiatic,TD
4,Abidji,abi,5.666667,-4.583333,Kwa,Niger-Congo,CI


In [None]:
#Export the language structure dataframe as a separate csv file.
#It gets its own file name (matching language_structure.json): the previous target,
#'raw data', is the file the endangered-languages CSV export writes to, so running
#both cells overwrote that export.
structure_df.to_csv(r'language_structure.csv', index=False)

In [18]:
#Write the language structure dataframe to disk as JSON
structure_df.to_json(path_or_buf='language_structure.json')

In [21]:
#Load the UNESCO language status CSV (pandas default parsing options)
UNESCO_status = pd.read_csv(filepath_or_buffer=UNESCO_data)

In [22]:
#Write the UNESCO dataframe to disk as JSON
UNESCO_status.to_json(path_or_buf='language_status_UNESCO.json')

In [23]:
#Create json file out of deaths dataframe
time_data = pd.read_csv(time_data)

In [24]:
time_data.to_json(r'deaths.json')

In [None]:
#Export the endangered status dataframe as a separate csv file

In [None]:
##We will start with visualizing each data set independently. If we have time, we will consider merges##

#INNER MERGE on Language --> 668 records were able to merge on 
#Language from the structures dataset and the endangered dataset
#struc_en_df = pd.merge(structure_df, final_en_df, how='outer', on="Code Value")
#struc_en_df = structure_df.merge(final_en_df.drop_duplicates(subset=['Code Value']), how='inner')

#inner join gives us 594 records when joining on code value
#left join gives us the exact length of the structure DF but it's got a lot of missing data
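#outer join gives us 5433 records: it keeps every unmatched row from both frames, and
#duplicated or missing Code Values can multiply rows further -- check before relying on it.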
#len(struc_en_df)


#Export combined structure and endangered datasets to csv file 
#struc_en_df.to_csv(r'raw data', index=False)

##We will start with visualizing each data set independently. If we have time, we will consider merges##
#Read in time data we want (century and language so we can merge on language)
#time_df = pd.read_csv(time_data)
#time_df = time_df[['Language', 'Century of Death']]

#Merge time data with structure_data ONLY
#In theory, endangered status data and dead language data should be mutually exclusive
#death_structure_df = pd.merge(structure_df, time_df, how="inner", on="Language")

#Export to csv file --> There are only 75 records that match
#death_structure_df.to_csv(r'raw data', index=False)