# ___Imports___

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
# import plotly.express as px
# from geopy.geocoders import Nominatim
# import geopy as gp
# from datetime import datetime
# from glob import glob

# ___NOTES___

- Nasa data ranges from index 0:45715
- IMO data ranges from index 45716:end

## ___NASA data___

In [14]:
# load data for nasa
nasa_data = pd.read_csv('../data/meteorite-landings.csv')
print(nasa_data.columns)

nasa_data.head()
np.shape(nasa_data)

Index(['name', 'id', 'nametype', 'recclass', 'mass', 'fall', 'year', 'reclat',
       'reclong', 'GeoLocation'],
      dtype='object')


(45716, 10)

## ___IMO data___

In [12]:
# load data for imo
imo_data = pd.read_csv('../data/IMO/data/IMO9920.csv')
print(imo_data.columns)

imo_data.head()
np.shape(imo_data)

Index(['Session ID', 'Start Date', 'Observer ID', 'Submitter ID',
       'Actual Observer Name', 'Submitted by', 'City', 'Country', 'Latitude',
       'Longitude', 'Elevation'],
      dtype='object')


(42989, 11)

## ___Combining data___

In [28]:
# Combine the 2 datasets into 1
all_data = pd.concat([nasa_data, imo_data], sort=False, keys=['NASA', 'IMO'])
# Show columns, first 5 rows, and shape to verify combination
print(all_data.columns)
print(all_data.head)
np.shape(all_data)


Index(['name', 'id', 'nametype', 'recclass', 'mass', 'fall', 'year', 'reclat',
       'reclong', 'GeoLocation', 'Session ID', 'Start Date', 'Observer ID',
       'Submitter ID', 'Actual Observer Name', 'Submitted by', 'City',
       'Country', 'Latitude', 'Longitude', 'Elevation'],
      dtype='object')
<bound method NDFrame.head of                 name     id nametype     recclass      mass  fall    year  \
NASA 0        Aachen    1.0    Valid           L5      21.0  Fell  1880.0   
     1        Aarhus    2.0    Valid           H6     720.0  Fell  1951.0   
     2          Abee    6.0    Valid          EH4  107000.0  Fell  1952.0   
     3      Acapulco   10.0    Valid  Acapulcoite    1914.0  Fell  1976.0   
     4       Achiras  370.0    Valid           L6     780.0  Fell  1902.0   
...              ...    ...      ...          ...       ...   ...     ...   
IMO  42984       NaN    NaN      NaN          NaN       NaN   NaN     NaN   
     42985       NaN    NaN      NaN          NaN

(88705, 21)

# ___NOTES___

- Nasa data ranges from index 0:45715
- IMO data ranges from index 45716:end

## ___Cleaning___

In [29]:
# Drop unnecessary columns

all_data = all_data.drop(['name', 'nametype', 'GeoLocation', 'Observer ID', 'Submitter ID', 'Actual Observer Name', 'Submitted by'], axis=1)

In [31]:
# Confirmation of dropped columns
print(all_data.head)
print(np.shape(all_data))

<bound method NDFrame.head of                id     recclass      mass  fall    year    reclat    reclong  \
NASA 0        1.0           L5      21.0  Fell  1880.0  50.77500    6.08333   
     1        2.0           H6     720.0  Fell  1951.0  56.18333   10.23333   
     2        6.0          EH4  107000.0  Fell  1952.0  54.21667 -113.00000   
     3       10.0  Acapulcoite    1914.0  Fell  1976.0  16.88333  -99.90000   
     4      370.0           L6     780.0  Fell  1902.0 -33.16667  -64.95000   
...           ...          ...       ...   ...     ...       ...        ...   
IMO  42984    NaN          NaN       NaN   NaN     NaN       NaN        NaN   
     42985    NaN          NaN       NaN   NaN     NaN       NaN        NaN   
     42986    NaN          NaN       NaN   NaN     NaN       NaN        NaN   
     42987    NaN          NaN       NaN   NaN     NaN       NaN        NaN   
     42988    NaN          NaN       NaN   NaN     NaN       NaN        NaN   

            Session I

In [39]:
# Check for Null Values

all_data.isnull().sum()


id            42989
recclass      42989
mass          43120
fall          42989
year          43277
reclat        50304
reclong       50304
Session ID    45716
Start Date    45716
City          45728
Country       45728
Latitude      45728
Longitude     45728
Elevation     45728
dtype: int64

In [49]:
# merge IMO ids to NASA NaN ids
all_data['id'] = all_data['id'].replace(np.NaN, all_data['Session ID'][45716:])

In [52]:
# merge IMO latitudess to NASA NaN latitudes
all_data['reclat'] = all_data['reclat'].replace(np.NaN, all_data['Latitude'][45716:])

In [53]:
# merge IMO longitudes to NASA NaN longitudes
all_data['reclong'] = all_data['reclong'].replace(np.NaN, all_data['Longitude'][45716:])

In [138]:
# Convert the IMO Start Date to NASA year
for i in range(45716, np.shape(all_data)[0]):
    all_data['Start Date'][i] = all_data['Start Date'][i][:4]

all_data['Start Date'].tail

<bound method NDFrame.tail of NASA  0         NaN
      1         NaN
      2         NaN
      3         NaN
      4         NaN
               ... 
IMO   42984    2005
      42985    2005
      42986    2005
      42987    2005
      42988    2005
Name: Start Date, Length: 88705, dtype: object>

In [128]:
# merge IMO start dates to NASA NaN year
all_data['year'] = all_data['Start Date'].replace(np.NaN, all_data['year'][45716:])

KeyboardInterrupt: 

In [None]:
print(all_data.isnull().sum())

In [78]:
test = all_data['Start Date'].iloc[45716]
test[:4]

'2008'

In [33]:
# Rename columns

In [None]:
# Check for Catergorical Data (Encoding)

In [None]:
# Dimensionality Reduction (PCA)

In [None]:
# Standardization / Normalization

In [None]:
# Sampling (10%) for better time

In [36]:
all_data.iloc[45715]

id              30414
recclass         L3.7
mass              200
fall            Found
year             1976
reclat        33.9833
reclong      -115.683
Session ID        NaN
Start Date        NaN
City              NaN
Country           NaN
Latitude          NaN
Longitude         NaN
Elevation         NaN
Name: (NASA, 45715), dtype: object