In [3]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import ttest_ind

## ANOVA (mass as the response variable)

In [4]:
# Load the population CSV into `df` and preview its first five rows.
df = pd.read_csv("CSV_MASTERS/population.csv")
df.head(n=5)

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,major_classification,subclass_category,country,country_abrv,density_km_squared,density_mi_squared
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775, 6.08333)",Chondrite,L,Germany,DE,233.0,90.0
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.18333, 10.23333)",Chondrite,H,Denmark,DK,133.0,52.0
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.21667, -113.0)",Chondrite,E,Canada,CA,4.0,1.0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.88333, -99.9)",Achondrite,Acapulcoite,Mexico,MX,64.0,25.0
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.16667, -64.95)",Chondrite,L,Argentina,AR,16.0,6.0


In [5]:
# Model mass with two categorical factors (wrapped in C(...)) and three
# numeric covariates, then summarise the fit with a Type II ANOVA table.
formula = 'mass ~ C(country) + C(major_classification) + density_km_squared + reclat + reclong'
lm = ols(formula, data=df).fit()
table = sm.stats.anova_lm(lm, typ=2)
# Leave the table as the last expression so the notebook renders it as a
# rich DataFrame rather than as printed plain text.
table

                               sum_sq       df          F        PR(>F)
C(country)               1.290254e+15    121.0  25.976511  0.000000e+00
C(major_classification)  1.573244e+14      4.0  95.813538  3.449987e-81
density_km_squared       3.958162e+10      1.0   0.096424  7.561661e-01
reclat                   4.140434e+10      1.0   0.100864  7.507971e-01
reclong                  1.850500e+11      1.0   0.450796  5.019619e-01
Residual                 1.351435e+16  32922.0        NaN           NaN


## T Test Comparison of major_classifications

In [6]:
# Mass of every row whose major classification is "Chondrite".
df.loc[df["major_classification"] == "Chondrite", "mass"]

0             21.00
1            720.00
2         107000.00
4            780.00
5           4239.00
6            910.00
7          30000.00
8           1620.00
9           1440.00
11         24000.00
12           779.00
13          1800.00
14          3000.00
16           160.00
17           700.00
18          6000.00
19          2000.00
20           625.00
22           700.00
23          3200.00
24           908.00
25          9251.00
26        228000.00
27         32000.00
28       2000000.00
30          6000.00
31          6400.00
33          3200.00
34           600.00
35         17900.00
            ...    
33012         66.91
33013         13.50
33014          1.58
33015         11.80
33016          6.37
33017          3.63
33018        595.24
33019         21.24
33020          3.99
33021        197.26
33022         10.27
33023          7.51
33024        136.31
33025         11.04
33026         26.93
33027          3.45
33028          9.21
33029          7.17
33030          5.57


In [7]:
def test_classifications(classification_names=("Chondrite", "Achondrite", "Iron", "Stony-Iron")):
    """
    Accepts the names of the classifications (or a subset of classifications)
    
    For each classification, runs a t test against all others to determine if their masses are from the same population
    
    Returns a dataframe with the results. 
    (For the aplha columns, True and False refers to the rejection of the null hypothesis) 
    """
    
    
    classification_mass = []
    # append a list of masses to the classification_mass list for each name
    for name in classification_names:
        classification_mass.append(df[df.major_classification == name].mass)
    
    test_results = [] # where we will be appending all our results
    alpha_list = (0.1, 0.05, 0.01) # the different alpha levels to test
    for i in range(len(classification_mass)):
        for j in range(i + 1, len(classification_mass)):
            samp1, samp2 = (classification_names[i],classification_names[j]) 
            # test each classification against the others. 
            t_stat, p_value = ttest_ind(classification_mass[i], classification_mass[j], equal_var=False)
            # given the p values, check against list of alphas
            alpha_1, alpha_05, alpha_01 = (p_value < alpha_list[0],p_value < alpha_list[1], p_value < alpha_list[2])
            #  append everything to the rest results as a tuple
            test_results.append((samp1, samp2, t_stat, p_value, alpha_1, alpha_05, alpha_01))
    # create and return a dataframe of the results 
    print(test_results)
    return pd.DataFrame(test_results, columns=("sample_1", "sample_2", "t_stat", "p_value", "alpha_1", "aplha_05", "alpha_01"))

In [8]:
# Compare every pair of the four default classifications with a Welch t test
# on mass; the returned DataFrame is shown as the cell output.
test_classifications()

[('Chondrite', 'Achondrite', -1.5096329478856143, 0.13142787132952657, False, False, False), ('Chondrite', 'Iron', -4.2385868125953055, 2.466572917744776e-05, True, True, True), ('Chondrite', 'Stony-Iron', -3.0928439033035806, 0.0022332329223249533, True, True, True), ('Achondrite', 'Iron', -4.18794634829089, 3.073628476892621e-05, True, True, True), ('Achondrite', 'Stony-Iron', -2.8510400413529067, 0.004746793088997029, True, True, True), ('Iron', 'Stony-Iron', 3.448491037165745, 0.0005861271971294914, True, True, True)]


Unnamed: 0,sample_1,sample_2,t_stat,p_value,alpha_1,aplha_05,alpha_01
0,Chondrite,Achondrite,-1.509633,0.131428,False,False,False
1,Chondrite,Iron,-4.238587,2.5e-05,True,True,True
2,Chondrite,Stony-Iron,-3.092844,0.002233,True,True,True
3,Achondrite,Iron,-4.187946,3.1e-05,True,True,True
4,Achondrite,Stony-Iron,-2.85104,0.004747,True,True,True
5,Iron,Stony-Iron,3.448491,0.000586,True,True,True


## T Test Comparison of Continents to Number of Meteorites Observed

Null hypothesis: for any pairwise comparison, the number of meteorites observed in each group comes from the same population.

## T Test Comparison of Country Populations to Number of Meteorites Observed

In [89]:
# Count the rows per country once and reuse the result for both lists
# (value_counts orders the countries from most to fewest rows).
strikes_per_country = df["country"].value_counts()
country_names = list(strikes_per_country.index)
hit_counts = list(strikes_per_country)

# Save the (country, num_strikes) table without the index column.
pd.DataFrame({"country": country_names, "num_strikes": hit_counts}).to_csv(
    "CSV_MASTERS/strikes_per_country.csv", index=False
)