In [1]:
import numpy as np 
import pandas as pd
import pdb
import requests

from Classifications import *

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("Meteorite_Landings.csv")
df.columns = ['name', 'id', 'nametype', 'recclass', 'mass', 'fall', 'year',
       'reclat', 'reclong', 'GeoLocation']

## Drop mass NaNs & 0.0

In [3]:
df = df[(df.mass > 0.0) & (df.mass.isna() == False)]
df.shape

(45566, 10)

## Format Year column

In [4]:
def get_year(x):
    try:
        return int(str(x[6:10]))
    except:
        return x
df.year = df.year.apply(get_year)

## Classifications

In [5]:
df = Classifications.classify_subclasses(df)

In [6]:
classification_df = df.copy()

save to csv

In [7]:
classification_df.to_csv("CSV_MASTERS/classifications.csv", index=False)

## Country Coordinates (removes 0.0 0.0 coordinates)

Excerpt those entries that are not the primary ones from antarctica (handle those below)

In [8]:
geo_df = df[(df.reclat.isna() == False) 
            & (df.reclong.isna() == False) 
            & (df.GeoLocation != '(0.0, 0.0)') # ocean placeholder
            & (df.GeoLocation != '(-71.5, 35.66667)') # antarctica 1
            & (df.GeoLocation != '(-84.0, 168.0)') # antarctice 2
           ].copy()


Handle main antarctic locations (add country/continent name and abreviation without API calls)

In [9]:
antarctic_df = df.loc[(df.GeoLocation == '(-71.5, 35.66667)') | (df.GeoLocation == '(-84.0, 168.0)')].copy()

country = np.array(["Antarctica" for x in range(antarctic_df.shape[0])])
country_abrv = np.array(["AQ" for x in range(antarctic_df.shape[0])])

antarctic_df['country'] = country
antarctic_df['country_abrv'] = country_abrv


Pull in the countries information for the coordinates and format for merge

In [10]:
countries_df = pd.read_csv("countries_master.csv")
countries_df.columns = ['id', 'reclat', 'reclong', 'country', 'country_abrv']
countries_df.drop(['reclat', 'reclong'], axis=1, inplace=True)

Combine country information with geo_df

In [11]:
geo_df = geo_df.merge(countries_df, on="id").copy()

add in our antarctic data

In [12]:
geo_df_final = pd.concat([geo_df, antarctic_df]).copy()


save to csv

In [13]:
geo_df_final.to_csv("CSV_MASTERS/geo.csv", index=False)

## Population Density By Country

read in from the website below

In [14]:
import requests
resp = requests.get("http://worldpopulationreview.com/countries/countries-by-density/")

In [15]:
pop_df = pd.read_html(resp.content)[0].drop(['Rank'], axis=1)

In [16]:
pop_df.columns = ['country', 'density_km_squared', 'density_mi_squared']

In [17]:
def convert_to_int(x):
    return int(x.split("/")[0].replace(",", ""))

pop_df.density_km_squared = pop_df.density_km_squared.apply(convert_to_int)
pop_df.density_mi_squared = pop_df.density_mi_squared.apply(convert_to_int)

In [18]:
pop_df = geo_df_final.merge(pop_df, on="country", how="left", )

In [19]:
pop_df.density_km_squared.fillna(0.0, inplace=True)
pop_df.density_mi_squared.fillna(0.0, inplace=True)

save to csv

In [20]:
pop_df.to_csv("CSV_MASTERS/population.csv", index=False)

## Population by country year

#### Population at 1900

In [26]:
resp = requests.get("https://en.wikipedia.org/wiki/List_of_countries_by_population_in_1900")
pop_1900 = pd.read_html(resp.content)[2]

In [27]:
pop_1900.reset_index(inplace=True)


In [28]:
pop_1900.columns=["country", "population_estimate", "population_percentage"]

ValueError: Length mismatch: Expected axis has 5 elements, new values have 3 elements

In [29]:
def parse_name(name):
    name = name.split("(")[0]
    name = name.split("[")[0]
    return name
pop_1900.country = pop_1900.country.apply(parse_name)
pop_1900.head()

AttributeError: 'DataFrame' object has no attribute 'country'

### Population at 1950, 2000, 2015

Get data about population at certain points in the 20th century and format dataframe

In [101]:
resp = requests.get("https://photius.com/rankings/world2050_rank.html")
pop_year_df = pd.read_html(resp.content)[0]
pop_year_df.drop(list(range(6)), inplace=True)
pop_year_df.drop([0], axis=1, inplace=True)
pop_year_df.reset_index(inplace=True)
pop_year_df.drop(["index", 5, 6], axis=1, inplace=True)
pop_year_df.columns = ['country', '1950_pop_thousands', '2000_pop_thousands', '2015_pop_thousands']
pop_year_df.drop(list(range(227, 237)), inplace=True)
pop_year_df.country = pop_year_df.country.apply(parse_name) # above for original parse_name function definition

Get the general name for each country given in our pop_df

In [102]:
country_list = list(pop_df.country.unique())

# Some country names will not match between datasets based on specific types of spellings. 
# There are not many, so they have been hard coded below.

# The name from the population year dataset as the key, what it should be to match as the value
special_names = {
    "Dem. Rep. of the Congo" : "Democratic Republic of the Congo",
    "Nigeria": "Nigeria", # otherwise matches to Niger
    "Czech Republic": "Czechia",
    "Lao Peoples's Dem. Republic": "Laos",
    "Myanmar": "Myanmar (Burma)",
}

def match_country_name(name):
    if name in special_names:
        return special_names[name]
    
    for country in country_list:
        if country in name:   
            return country
    return name + "*" # indicates that the country does not exist in the meteorite dataset
    
pop_year_df.country = pop_year_df.country.apply(match_country_name)

Some countries have repeats because name was found in terratory (or something similar). Drop later repeats of the country name and keep the first occurance (as the data set is sorted in such a way that this is likely to be the actual country). 

In [103]:
pop_year_df.drop_duplicates("country", keep='first', inplace=True)

Merge the new data about population at certain years with our pop_df

In [104]:
pop_merged_df = pop_df.merge(pop_year_df, on='country').copy()

Make sure all naming is clear between different population statistics

In [110]:
pop_merged_df.rename({
    "density_km_squared": "pop_density_km_squared", 
    "density_mi_squared": "pop_density_mi_squared"
    }, axis="columns", inplace=True)

Save the df to csv

In [112]:
pop_merged_df.to_csv("CSV_MASTERS/population_by_years.csv", index=False)