In [1]:
import pandas as pd

df = pd.read_csv('../data/raw/Meteorite_Landings_20241105.csv')
df.head()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.16667, -64.95)"


In [2]:
print(df.dtypes)

name            object
id               int64
nametype        object
recclass        object
mass (g)       float64
fall            object
year           float64
reclat         float64
reclong        float64
GeoLocation     object
dtype: object


In [3]:
# Check for missing values
df.isna().sum()

name              0
id                0
nametype          0
recclass          0
mass (g)        131
fall              0
year            291
reclat         7315
reclong        7315
GeoLocation    7315
dtype: int64

In [4]:
# Drop rows with missing values and unnecessary columns
df = df.dropna()
df = df.drop(columns='GeoLocation')
df = df.rename(columns={'reclat': 'latitude', 'reclong': 'longitude', 'recclass': 'class'})

In [5]:
# Convert 'year' to integer
df['year'] = df['year'].astype(int)

In [6]:
df.describe()

Unnamed: 0,id,mass (g),year,latitude,longitude
count,38115.0,38115.0,38115.0,38115.0,38115.0
mean,25343.139,15600.71,1989.993913,-39.596529,61.309359
std,17395.360205,628681.7,25.469892,46.17583,80.777583
min,1.0,0.0,860.0,-87.36667,-165.43333
25%,10831.5,6.63,1986.0,-76.71667,0.0
50%,21732.0,29.09,1996.0,-71.5,35.66667
75%,39887.5,187.29,2002.0,0.0,157.16667
max,57458.0,60000000.0,2101.0,81.16667,178.2


In [7]:
df = df[df['year'] <= 2024]

In [8]:
df.describe()

Unnamed: 0,id,mass (g),year,latitude,longitude
count,38114.0,38114.0,38114.0,38114.0,38114.0
mean,25342.304481,15601.11,1989.991001,-39.597567,61.310968
std,17394.825419,628690.0,25.463878,46.175991,80.778032
min,1.0,0.0,860.0,-87.36667,-165.43333
25%,10831.25,6.63,1986.0,-76.71667,0.0
50%,21731.5,29.085,1996.0,-71.5,35.66667
75%,39886.5,187.335,2002.0,0.0,157.16667
max,57458.0,60000000.0,2013.0,81.16667,178.2


In [9]:
df['class'].sort_values().unique()

array(['Acapulcoite', 'Acapulcoite/Lodranite', 'Acapulcoite/lodranite',
       'Achondrite-prim', 'Achondrite-ung', 'Angrite', 'Aubrite',
       'Aubrite-an', 'Brachinite', 'C', 'C1/2-ung', 'C2', 'C2-ung',
       'C3-ung', 'C3.0-ung', 'C4', 'C4-ung', 'C4/5', 'C5/6-ung', 'C6',
       'CB', 'CBa', 'CBb', 'CH/CBb', 'CH3', 'CI1', 'CK', 'CK3', 'CK3-an',
       'CK4', 'CK4-an', 'CK4/5', 'CK5', 'CK5/6', 'CK6', 'CM', 'CM-an',
       'CM1', 'CM1/2', 'CM2', 'CO3', 'CO3.0', 'CO3.1', 'CO3.2', 'CO3.3',
       'CO3.4', 'CO3.5', 'CO3.6', 'CO3.7', 'CO3.8', 'CR', 'CR-an', 'CR1',
       'CR2', 'CR2-an', 'CR7', 'CV2', 'CV3', 'CV3-an',
       'Chondrite-fusion crust', 'Chondrite-ung', 'Diogenite',
       'Diogenite-an', 'Diogenite-olivine', 'Diogenite-pm', 'E', 'E-an',
       'E3', 'E3-an', 'E4', 'E5', 'E5-an', 'E6', 'EH', 'EH-imp melt',
       'EH3', 'EH3/4-an', 'EH4', 'EH4/5', 'EH5', 'EH6', 'EH6-an', 'EH7',
       'EH7-an', 'EL-melt rock', 'EL3', 'EL4', 'EL4/5', 'EL5', 'EL6',
       'EL6/7', 'EL7', 'Ens

In [10]:
# Define the classification function
def classify_meteorites(meteorite_class):
    meteorite_class = meteorite_class.lower()  # Normalize to lowercase for case-insensitive matching

    # Achondrite category (specific matches first for accuracy)
    if any(keyword in meteorite_class for keyword in [
        'achondrite', 'acapulcoite', 'lodranite', 'eucrite', 'diogenite', 
        'howardite', 'angrite', 'aubrite', 'ureilite', 'brachinite', 
        'martian', 'lunar', 'winonaite']):
        return 'Achondrite'
    
    # Stony-iron category
    elif any(keyword in meteorite_class for keyword in ['stony-iron', 'mesosiderite', 'pallasite']):
        return 'Stony-iron'
    
    # Iron category
    elif 'iron' in meteorite_class:
        return 'Iron'
    
    # Chondrite category
    elif any(keyword in meteorite_class for keyword in [
        'chondrite', 'c', 'cb', 'ck', 'cm', 'co', 'cr', 'cv', 
        'h', 'l', 'll', 'r', 'k', 'e']) and not 'uncl' in meteorite_class:
        return 'Chondrite'
    
    # Uncategorized (if none of the above matches)
    else:
        return 'Uncategorized'

# Apply the classification function to your dataset
df['category'] = df['class'].apply(classify_meteorites)

In [11]:
# Check the unique values in the 'category' column again
df[df['category'] == 'Unknown']['class'].sort_values().unique()

array([], dtype=object)

In [12]:
df.head(10)

Unnamed: 0,name,id,nametype,class,mass (g),fall,year,latitude,longitude,category
0,Aachen,1,Valid,L5,21.0,Fell,1880,50.775,6.08333,Chondrite
1,Aarhus,2,Valid,H6,720.0,Fell,1951,56.18333,10.23333,Chondrite
2,Abee,6,Valid,EH4,107000.0,Fell,1952,54.21667,-113.0,Chondrite
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976,16.88333,-99.9,Achondrite
4,Achiras,370,Valid,L6,780.0,Fell,1902,-33.16667,-64.95,Chondrite
5,Adhi Kot,379,Valid,EH4,4239.0,Fell,1919,32.1,71.8,Chondrite
6,Adzhi-Bogdo (stone),390,Valid,LL3-6,910.0,Fell,1949,44.83333,95.16667,Chondrite
7,Agen,392,Valid,H5,30000.0,Fell,1814,44.21667,0.61667,Chondrite
8,Aguada,398,Valid,L6,1620.0,Fell,1930,-31.6,-65.23333,Chondrite
9,Aguila Blanca,417,Valid,L,1440.0,Fell,1920,-30.86667,-64.55,Chondrite


In [13]:
df_test = df[['class', 'category']]

In [14]:
df.to_csv('../data/interim/Meteorite_Landings_cleaned.csv', index=False)