In [7]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.proportion import proportions_ztest

from scipy.stats import ttest_ind

import matplotlib.pyplot as plt
import seaborn as sns




## T Test comparsion of major_classifications

In [10]:
def test_classifications(classification_names=("Chondrite", "Achondrite", "Iron", "Stony-Iron")):
    """
    Accepts the names of the classifications (or a subset of classifications)
    
    For each classification, runs a t test against all others to determine if their masses are from the same population
    
    Returns a dataframe with the results. 
    (For the aplha columns, True and False refers to the rejection of the null hypothesis) 
    """
    
    
    classification_mass = []
    # append a list of masses to the classification_mass list for each name
    for name in classification_names:
        classification_mass.append(df[df.major_classification == name].mass)
    
    test_results = [] # where we will be appending all our results
    alpha_list = (0.1, 0.05, 0.01) # the different alpha levels to test
    for i in range(len(classification_mass)):
        for j in range(i + 1, len(classification_mass)):
            samp1, samp2 = (classification_names[i],classification_names[j]) 
            # test each classification against the others. 
            t_stat, p_value = ttest_ind(classification_mass[i], classification_mass[j], equal_var=False)
            # given the p values, check against list of alphas
            alpha_1, alpha_05, alpha_01 = (p_value < alpha_list[0],p_value < alpha_list[1], p_value < alpha_list[2])
            #  append everything to the rest results as a tuple
            test_results.append((samp1, samp2, t_stat, p_value, alpha_1, alpha_05, alpha_01))
    # create and return a dataframe of the results        
    return pd.DataFrame(test_results, columns=("sample_1", "sample_2", "t_stat", "p_value", "alpha 10%", "alpha 5%", "alpha 1%"))

In [11]:

# test_classifications().to_csv("tables/classification_test.csv", index=False)
test_classifications()

Unnamed: 0,sample_1,sample_2,t_stat,p_value,alpha 10%,alpha 5%,alpha 1%
0,Chondrite,Achondrite,-1.509633,0.131428,False,False,False
1,Chondrite,Iron,-4.238587,2.5e-05,True,True,True
2,Chondrite,Stony-Iron,-3.092844,0.002233,True,True,True
3,Achondrite,Iron,-4.187946,3.1e-05,True,True,True
4,Achondrite,Stony-Iron,-2.85104,0.004747,True,True,True
5,Iron,Stony-Iron,3.448491,0.000586,True,True,True


## T Test Comparison of Continents to Number Meteorites Observed (done in kate's branch)

Null Hypothesis = For any comparison, the number of meteorites comes from the same population

## Strikes Per Global Quadrant

In [13]:
def test_quadrants(df):
    ne = df[(df.reclat >= 0) & (df.reclong >= 0)].mass
    nw = df[(df.reclat >= 0) & (df.reclong <= 0)].mass
    sw = df[(df.reclat <= 0) & (df.reclong <= 0)].mass
    se = df[(df.reclat <= 0) & (df.reclong >= 0)].mass 
    
    quadrant_data = (ne, nw, sw, se)
    quadrant_names = ("NE", "NW", "SW", "SE")
    
    test_results = [] # where we will be appending all our results
    alpha_list = (0.1, 0.05, 0.01) # the different alpha levels to test
    
    for i in range(4):
        for j in range(i + 1, 4):
            samp1, samp2 = quadrant_names[i], quadrant_names[j]
            t_stat, p_value = ttest_ind(quadrant_data[i], quadrant_data[j], equal_var=False)
            alpha_1, alpha_05, alpha_01 = (p_value < alpha_list[0],p_value < alpha_list[1], p_value < alpha_list[2])
            
            test_results.append((samp1, samp2, t_stat, p_value, alpha_1, alpha_05, alpha_01))
    
    return pd.DataFrame(test_results, columns=["quadrant_1", "quadrant_2", "t_stat", "p_value", "alpha 10%", "alpha 5%", "alpha 1%"])
            

In [14]:
test_quadrants(df).to_csv("tables/landmass.csv", index=False)
test_quadrants(df)

Unnamed: 0,quadrant_1,quadrant_2,t_stat,p_value,alpha 10%,alpha 5%,alpha 1%
0,NE,NW,-2.541272,0.011108,True,True,False
1,NE,SW,-0.919997,0.357687,False,False,False
2,NE,SE,1.629069,0.103324,False,False,False
3,NW,SW,1.378149,0.168233,False,False,False
4,NW,SE,2.870973,0.00413,True,True,True
5,SW,SE,1.300079,0.193734,False,False,False


## Strikes per hemisphere

In [15]:
def test_hemisphere(df):
    north = df[df.reclat > 0].mass
    south = df[df.reclat < 0].mass
    
    t_stat, p_value = ttest_ind(north, south, equal_var=False)
    alpha_list = (0.1, 0.05, 0.01) # the different alpha levels to test
    
    alpha_1, alpha_05, alpha_01 = (p_value < alpha_list[0],p_value < alpha_list[1], p_value < alpha_list[2])
    
    
#     plt.bar(["North", "South"], [north.sum(), south.sum()])
#     sns.distplot(north)
#     sns.distplot(south)
    return pd.DataFrame([(t_stat, p_value, alpha_1, alpha_05, alpha_01)], columns=["t_stat", "p_value", "alpha_1", "alpha_05", "alpha_01"])
    
test_hemisphere(df)

Unnamed: 0,t_stat,p_value,alpha_1,alpha_05,alpha_01
0,2.923305,0.00347,True,True,True


Based on the p value, we can conclude that the mass of meteorites that land in the northern hemisphere vs those that land in the south ar different. 

## Proportion test of found to fell

In [16]:
def test_fell_sightings_proportions(df, country1, country2):
    # Excerpt the two countries from the dataframe
    ctry1 = df[df.country == country1]
    ctry2 = df[df.country == country2]
    
    # get the total number of sightings/findings in a country
    ctry1_total = len(ctry1.fall)
    ctry2_total = len(ctry2.fall)
    
    #get the total number of sightings while falling in each country
    ctry1_sightings = len(ctry1[ctry1.fall == 'Fell'])
    ctry2_sightings = len(ctry2[ctry2.fall == 'Fell'])
    
    # run the proportion test
    z_stat, p_value = proportions_ztest([ctry1_sightings, ctry2_sightings], [ctry1_total, ctry2_total])
    
    
    return pd.DataFrame([(country1, country2, z_stat, p_value)], columns=("Country_1", "Country_2", "z_stat", "p_value"))

    
test_fell_sightings_proportions(df, "United States", "China")
    

Unnamed: 0,Country_1,Country_2,z_stat,p_value
0,United States,China,-16.022224,8.939509e-58


## Proportion Test of sightings by range of years

In [18]:
def test_fall_sightings_dates(df, start_date, split_date, end_date):
    
    range_1 = df[(df.year >= start_date) & (df.year < split_date)]
    range_2 = df[(df.year >= split_date) & (df.year <= end_date )]
    
    range_1_name = f"{start_date} - {split_date - 1}"
    range_2_name = f"{split_date} - {end_date}"
    
    range_1_total = len(range_1.fall)
    range_2_total = len(range_2.fall)
    
    range_1_sightings = len(range_1[range_1.fall == "Fell"])
    range_2_sightings = len(range_2[range_2.fall == "Fell"])
    
    z_stat, p_value = proportions_ztest((range_1_sightings, range_2_sightings), (range_1_total, range_2_total))
                                        
    return pd.DataFrame(
        [(range_1_name, range_2_name, z_stat, p_value)],
        columns=("Date Range 1", "Date Range 2", "z-stat", "p-value")
)

In [19]:
test_fall_sightings_dates(df, 1950, 1990, 2010).to_csv('tables/proportion_fell_by_year.csv', index=False)
test_fall_sightings_dates(df, 1950, 1990, 2010)

Unnamed: 0,Date Range 1,Date Range 2,z-stat,p-value
0,1950 - 1989,1990 - 2010,9.6358,5.645074e-22


In [20]:
earth_strikes = pd.read_csv("CSV_MASTERS/GeoEarth_with_num_strikes")
earth_strikes.loc[121, "subregion"] = "Antarctica"
earth_strikes.loc[121, "region_wb"] = "Antarctica"

In [21]:
region = list(earth_strikes.region_wb.unique())

samples = []

for r in region:
    samples.append(earth_strikes[earth_strikes.region_wb == r].num_strikes)


    
test_results = []
alpha_list = (0.1, 0.05, 0.01) # the different alpha levels to test
for i in range(len(samples)):
    for j in range(i + 1, len(samples)):
        samp1, samp2 = samples[i], samples[j]
        t_stat, p_value = ttest_ind(samp1, samp2, equal_var=False)
        alpha_1, alpha_05, alpha_01 = (p_value < alpha_list[0],p_value < alpha_list[1], p_value < alpha_list[2])
        
        test_results.append((region[i], region[j], t_stat, p_value, alpha_1, alpha_05, alpha_01))
        
region_strikes = pd.DataFrame(test_results, columns=["Region 1", "Region 2", "t-stat", "p-value", "alpha 10%", "alpha 5%", "alpha 1%"])

region_strikes.dropna(inplace=True)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [22]:
region_strikes.to_csv("tables/strikes_per_region.csv", index=False)
region_strikes

Unnamed: 0,Region 1,Region 2,t-stat,p-value,alpha 10%,alpha 5%,alpha 1%
0,Latin America & Caribbean,North America,-0.886507,0.534286,False,False,False
1,Latin America & Caribbean,Europe & Central Asia,1.238742,0.234426,False,False,False
2,Latin America & Caribbean,Middle East & North Africa,-0.960486,0.347235,False,False,False
3,Latin America & Caribbean,South Asia,1.056172,0.306036,False,False,False
4,Latin America & Caribbean,East Asia & Pacific,0.791141,0.438777,False,False,False
5,Latin America & Caribbean,Sub-Saharan Africa,1.220304,0.240944,False,False,False
8,North America,Europe & Central Asia,1.055162,0.482911,False,False,False
9,North America,Middle East & North Africa,0.594272,0.648478,False,False,False
10,North America,South Asia,1.034743,0.488864,False,False,False
11,North America,East Asia & Pacific,1.002038,0.498793,False,False,False


## The effect of population size on meteor sightings

Pull in population data and and subset the year 2000 +- 10.

In [162]:
population_df = pd.read_csv("CSV_MASTERS/population_by_years.csv")
population_df = population_df[(population_df.year >= 1990) & (population_df.year <= 2010)]

get the meteorite sightings per country within the time period

In [163]:
columns = [    
    "country",
    "2000_pop_thousands", 
]

strikes_per_country = pd.DataFrame(population_df.country.value_counts()).reset_index()
strikes_per_country.columns = ["country", "strikes"]
strikes_per_country
country_populations = population_df[columns].drop_duplicates(subset="country")

strikes_around_2000 = country_populations.merge(strikes_per_country, on="country").copy()[["country", "2000_pop_thousands", "strikes"]]

Divide the countries into two groups based on population. The division point is the median of the set populations

In [164]:
median_pop = strikes_around_2000['2000_pop_thousands'].median()
print(f"Smaller countries: < {median_pop * 1000} | Larger Countries: > {median_pop * 1000}")

Smaller countries: < 19742000.0 | Larger Countries: > 19742000.0


In [165]:
sm_countries_strikes = list(strikes_around_2000[strikes_around_2000["2000_pop_thousands"] >= median_pop].strikes)
large_countries_strikes = list(strikes_around_2000[strikes_around_2000["2000_pop_thousands"] < median_pop].strikes)


Compare the samples of meteorite sightings recorded by "small" countries to that of "larger" countries. 

In [166]:
ttest_ind(sm_countries_strikes, large_countries_strikes, equal_var=False)

Ttest_indResult(statistic=-0.9699220336147, pvalue=0.338249483094495)

based on the results of the t-test, it appears that the population of a country does not influence how many meteor strikes are recorded. 