In [1]:
#Dependencies

import pandas as pd

In [2]:
#Store filepaths in variables

endangered_data = 'endangered_status_data.csv'
structure_data = 'language_structure_data.csv'
time_data = 'deaths_data.csv'

In [3]:
#Based on researching the endangered languages database, the CSV columns are as follows: 
#1) Location_code
#2) code_val 
#3) Primary Name 
#4) alternate_names
#5) endangerment_level 
#6) num_speakers
#7) classification
#8) dialect_varieties
#9) Notes?
#10) Public_comment
#11) locations_places
#12) Region?
#13) Coordinates

#Read in DF number 1: Endangerment of languages.

endangered_df = pd.read_csv(endangered_data,
                names=['Location Code', 'Code Value', 'Language', 'Alternate Names', 'Endangerment Level',
                       'Number of Speakers', 'Classification', 'Dialect Varieties', 'Notes', 'Public Comment', 
                       'Location Places', 'Region', 'Coordinates'])



In [4]:
#Drop unnecessary columns

final_endangered_df = endangered_df.drop(['Location Code', 'Code Value', 'Alternate Names', 'Classification', 'Dialect Varieties',
                   'Notes', 'Public Comment', 'Location Places'], axis = 1)

final_endangered_df.head()

Unnamed: 0,Language,Endangerment Level,Number of Speakers,Region,Coordinates
0,!Xun,"Vulnerable (20 percent certain, based on the e...","14,000-18,000",Africa,"-28.74358,23.983154; -17.560247, 18.050537; -1..."
1,'Ôrôê,"Endangered (20 percent certain, based on the e...",590,Pacific,"-21.4223,165.4678"
2,(Lower) Tanana,"Critically Endangered (80 percent certain, bas...",25,North America,"65.157778, -149.37;64.521111, -146.980556;64.5..."
3,Aasáx,Dormant,0,Africa,"-5.1948,37.738"
4,Abaga,"Critically Endangered (20 percent certain, bas...",5,Pacific,"-6.1883,145.5976"


In [5]:
#Get every set of coordinates in a separate column for each language 


coordinates_df = endangered_df.Coordinates.str.split(";", expand=True)
coordinates_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,"-28.74358,23.983154","-17.560247, 18.050537","-19.621892, 20.253296","-21.749296, 19.896240",,,
1,"-21.4223,165.4678",,,,,,
2,"65.157778, -149.37","64.521111, -146.980556","64.558056,-149.090556",,,,
3,"-5.1948,37.738",,,,,,
4,"-6.1883,145.5976",,,,,,


In [6]:
#There are up to 7 different coordinate locations per language. 
#Rename coordinates columns from 0-6 to "Coord 1 - Coord 7"

coords_renamed_df = coordinates_df.rename(index=str, columns=
                      {0: "coord_primary",
                      1: "coord_secondary",
                      2: "Coord 3",
                      3: "Coord 4",
                      4: "Coord 5",
                      5: "Coord 6",
                      6: "Coord 7"})

In [7]:
#Pull out the primary coordinates for each language and separate them into different columns

primary_coords_df = coords_renamed_df.coord_primary.str.split(",", n= -1, expand=True)
primary_coords_df = primary_coords_df.rename(columns={
                        0: "Primary Latitude",
                        1: "Primary Longitude"})
primary_coords_df.head()

Unnamed: 0,Primary Latitude,Primary Longitude
0,-28.74358,23.983154
1,-21.4223,165.4678
2,65.157778,-149.37
3,-5.1948,37.738
4,-6.1883,145.5976


In [17]:
#Drop dirty data that has rogue " marks at the following indexes:

drop_df = primary_coords_df.drop(primary_coords_df.index[1087])
drop_df_2 = drop_df.drop(drop_df.index[2192])
drop_df_3 = drop_df_2.drop(drop_df_2.index[2614])
drop_df_4 = drop_df_3.drop(drop_df_3.index[2767])


In [43]:
#Turn data in the latitude and longitude columns into numbers

clean_lat = drop_df_4['Primary Latitude'] = pd.to_numeric(drop_df_4['Primary Latitude'])
clean_long = drop_df_4['Primary Longitude'] = pd.to_numeric(drop_df_4['Primary Longitude'])



In [44]:
#Concatenate the endangered_df with the coordinates_df for the final product
#Remove the "endangerement level" commentary by splitting the string at the parentheses

final_en_df = pd.concat([final_endangered_df, drop_df_4,], sort=False)
final_en_df['Endangerment Level'] = final_en_df['Endangerment Level'].str.split("(",
        n = -1, expand=True)

#Drop the coordinates column that has multiple and confusing lon/lats 
final_en_df = final_en_df.drop(['Coordinates'], axis=1)

In [45]:
#Figure out why the dang lat and long aren't showing as values
final_en_df.head()

Unnamed: 0,Language,Endangerment Level,Number of Speakers,Region,Primary Latitude,Primary Longitude
0,!Xun,Vulnerable,"14,000-18,000",Africa,,
1,'Ôrôê,Endangered,590,Pacific,,
2,(Lower) Tanana,Critically Endangered,25,North America,,
3,Aasáx,Dormant,0,Africa,,
4,Abaga,Critically Endangered,5,Pacific,,


In [46]:
#THINGS TO DO:

#Fix the NaN values in the Primary Latitude and Primary Longitude columns of final_en_df

#Import structure and time csv files

#Clean structure data if necessary 

#Join structure and time data to the final_en_df on 'Language'

#Export all data into a giant CSV file

#JSONify all data into a giant JSON file