In [56]:
# Reference https://www.kaggle.com/pylablanche/meteorite-landings-simple-analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Seed NumPy's global random number generator so reruns are repeatable
# (presumably for the data.sample() previews below; confirm).
np.random.seed(0)

### 1. Examine dataset

In [84]:
# Read the raw meteorite-landings CSV, then replace its header with short
# snake_case column names (one name per column, in file order).
data = pd.read_csv('../input/meteorite-landings.csv')
data.columns = [
    'name',
    'id',
    'nametype',
    'class',
    'mass',
    'fall',
    'year',
    'latitude',
    'longitude',
    'geo_location',
]
# Print row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
name            45716 non-null object
id              45716 non-null int64
nametype        45716 non-null object
class           45716 non-null object
mass            45585 non-null float64
fall            45716 non-null object
year            45428 non-null float64
latitude        38401 non-null float64
longitude       38401 non-null float64
geo_location    38401 non-null object
dtypes: float64(4), int64(1), object(5)
memory usage: 3.5+ MB


In [85]:
# Preview ten randomly chosen rows of the freshly loaded frame.
data.sample(n=10)

Unnamed: 0,name,id,nametype,class,mass,fall,year,latitude,longitude,geo_location
12445,Frontier Mountain 97010,10743,Valid,L4,23.1,Found,1997.0,-72.95306,160.4825,"(-72.953060, 160.482500)"
28764,Northwest Africa 4568,45518,Valid,Ureilite,123.0,Found,2005.0,,,
6125,Dar al Gani 360,5908,Valid,L6,87.0,Found,1997.0,27.9275,15.83117,"(27.927500, 15.831170)"
28431,Northwest Africa 4188,34460,Valid,L3-4,120.0,Found,2002.0,,,
7565,Dhofar 194,6978,Valid,H4,134.0,Found,2000.0,18.83667,54.27,"(18.836670, 54.270000)"
23431,Meteorite Hills 00738,15972,Valid,H5,36.37,Found,2000.0,-79.68333,155.75,"(-79.683330, 155.750000)"
11628,Elyria,10030,Valid,"Iron, IIIAB",10900.0,Found,1971.0,38.28,-97.365,"(38.280000, -97.365000)"
33434,Queen Alexandra Range 94648,20279,Valid,L5,23.7,Found,1994.0,-84.0,168.0,"(-84.000000, 168.000000)"
34063,Queen Alexandra Range 97452,20908,Valid,LL5,7.3,Found,1997.0,-84.0,168.0,"(-84.000000, 168.000000)"
42789,Yamato 86035,29541,Valid,L6,2.41,Found,1986.0,-71.5,35.66667,"(-71.500000, 35.666670)"


### **2. Handle missing values**

In [86]:
# Discard every row whose latitude OR longitude is exactly 0.0.
# Rows with missing (NaN) coordinates are kept here, because NaN == 0 is False.
data = data[~((data.latitude == 0) | (data.longitude == 0))]

In [87]:
# Summarise the frame again (row count, non-null counts, dtypes) after the
# zero-coordinate filter.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39278 entries, 0 to 45715
Data columns (total 10 columns):
name            39278 non-null object
id              39278 non-null int64
nametype        39278 non-null object
class           39278 non-null object
mass            39147 non-null float64
fall            39278 non-null object
year            39018 non-null float64
latitude        31963 non-null float64
longitude       31963 non-null float64
geo_location    31963 non-null object
dtypes: float64(4), int64(1), object(5)
memory usage: 3.3+ MB


In [88]:
# Preview ten randomly chosen rows of the coordinate-filtered frame.
data.sample(n=10)

Unnamed: 0,name,id,nametype,class,mass,fall,year,latitude,longitude,geo_location
13472,Grove Mountains 020035,46634,Valid,L3,6.32,Found,2002.0,-73.10806,75.20417,"(-73.108060, 75.204170)"
7919,Dhofar 559,7320,Valid,H5,113.0,Found,2001.0,18.71848,54.22,"(18.718480, 54.220000)"
9474,Elephant Moraine 83236,7877,Valid,Eucrite-pmict,6.4,Found,1983.0,-76.32806,157.20194,"(-76.328060, 157.201940)"
14346,Grove Mountains 022318,49981,Valid,L6,5.79,Found,2003.0,-72.77333,75.33917,"(-72.773330, 75.339170)"
18501,LaPaz Icefield 03948,34765,Valid,LL6,12.5,Found,2003.0,,,
42721,Yamato 8425,29472,Valid,H6,7.4,Found,1984.0,-71.5,35.66667,"(-71.500000, 35.666670)"
20440,Lewis Cliff 86540,13468,Valid,"Iron, IAB-sLH",21.1,Found,1986.0,-84.27121,161.35133,"(-84.271210, 161.351330)"
7600,Dhofar 229,7013,Valid,H6,716.0,Found,2000.0,19.05667,54.575,"(19.056670, 54.575000)"
38123,Ultuna,24109,Valid,H,1900.0,Found,1944.0,59.81667,17.66667,"(59.816670, 17.666670)"
12581,Geologists Range 99119,10893,Valid,H6,11.4,Found,1999.0,-82.5,155.5,"(-82.500000, 155.500000)"


In [89]:
# Tally, per column, how many entries are missing (NaN/None).
missing_values_count = data.isna().sum()
missing_values_count

name               0
id                 0
nametype           0
class              0
mass             131
fall               0
year             260
latitude        7315
longitude       7315
geo_location    7315
dtype: int64

In [90]:
# Count rows per nametype value; anything other than 'Valid' is treated as
# invalid by the next step.
data.groupby('nametype')['id'].count()

nametype
Relict       75
Valid     39203
Name: id, dtype: int64

In [97]:
# Keep only the rows whose nametype is exactly 'Valid', then re-count per
# nametype to confirm nothing else is left.
data = data[data['nametype'].eq('Valid')]
data.groupby('nametype')['id'].count()

nametype
Valid    39203
Name: id, dtype: int64

In [98]:
# Summarise the frame once more now that only 'Valid' nametype rows remain.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39203 entries, 0 to 45715
Data columns (total 10 columns):
name            39203 non-null object
id              39203 non-null int64
nametype        39203 non-null object
class           39203 non-null object
mass            39121 non-null float64
fall            39203 non-null object
year            38944 non-null float64
latitude        31893 non-null float64
longitude       31893 non-null float64
geo_location    31893 non-null object
dtypes: float64(4), int64(1), object(5)
memory usage: 3.3+ MB


In [99]:
# Drop every row that still has a missing value in any column.
# The result is rebound instead of calling dropna(inplace=True): `data` was
# produced by boolean-mask slicing, and mutating such a frame in place raises
# pandas' SettingWithCopyWarning. Rebinding yields the same rows without the
# warning and keeps the cell free of in-place mutation.
data = data.dropna()
# Confirm that every column now has the same non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31685 entries, 0 to 45715
Data columns (total 10 columns):
name            31685 non-null object
id              31685 non-null int64
nametype        31685 non-null object
class           31685 non-null object
mass            31685 non-null float64
fall            31685 non-null object
year            31685 non-null float64
latitude        31685 non-null float64
longitude       31685 non-null float64
geo_location    31685 non-null object
dtypes: float64(4), int64(1), object(5)
memory usage: 2.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
