# **Welcome to my IBM Data Science Capstone Project**

##### This project is based on the NASA Meteorite Landings dataset, which you can find here: https://data.nasa.gov/Space-Science/Meteorite-Landings/gh4g-9sfh

# **Import Dependencies**

In [26]:
import mysql.connector
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None 

import folium
# Import folium MarkerCluster plugin
from folium.plugins import MarkerCluster
# Import folium MousePosition plugin
from folium.plugins import MousePosition
# Import folium DivIcon plugin
from folium.features import DivIcon
from folium import plugins


from geopy.geocoders import Nominatim
import pycountry_convert as pc

from pprint import pprint
from typing import Tuple 

import dash_html_components as html

import dash                                     # pip install dash
from dash import dcc, Output, Input
import plotly.express as px
import dash_bootstrap_components as dbc
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# **Import data**
##### Here we import the data from a MySQL database running on localhost. If you would like to follow my steps, remember to change the user, password and database name in the code below.

In [4]:
# Load the whole meteorite_landings table from the local MySQL server into `df`.
query = "SELECT * FROM meteorite_landings;"

mydb = None  # stays None when connect() itself fails, so the cleanup below can tell
try:
    mydb = mysql.connector.connect(
        host="localhost",  ## local host name
        user="root",  ## root name
        passwd="",  ## password if you set one
        database="meteorite_landings",  ## database name
    )
    df = pd.read_sql(query, mydb)
except Exception as e:
    # Show the cause, then re-raise: without `df` none of the following cells can run,
    # and the original error is more useful than a later NameError.
    print(str(e))
    raise
finally:
    # Close the connection on success and on failure alike, but only if it was opened.
    if mydb is not None:
        mydb.close()

df.head()

  df = pd.read_sql(query,mydb)


Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,0,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951,56.1833,10.2333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952,54.2167,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976,16.8833,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1902,-33.1667,-64.95,"(-33.16667, -64.95)"


# **Process Data**

### In this segment we will process our database

In [5]:
# Clean the raw table in one chain:
#   1. discard rows whose GeoLocation is '(0.0, 0.0)' or whose year is 0;
#   2. convert the mass from grams to kilograms, rounded to two decimals
#      (the column is still named 'mass (g)' at that point);
#   3. switch every column to its upper-case name ('mass (g)' becomes 'MASS [kg]').
df = (
    df.loc[(df.GeoLocation != '(0.0, 0.0)') & (df.year != 0)]
    .assign(**{'mass (g)': lambda frame: (frame['mass (g)'] / 1000).round(2)})
    .rename(
        columns={
            'name': 'NAME',
            'id': 'ID',
            'nametype': 'NAME_TYPE',
            'recclass': 'CLASS',
            'mass (g)': 'MASS [kg]',
            'fall': 'FALL',
            'year': 'YEAR',
            'reclat': 'LAT',
            'reclong': 'LONG',
            'GeoLocation': 'COORDINATE',
        }
    )
)

### Converting coordinates into continents using geopy

##### In order to convert coordinates to continents we use geopy, a Python client for geocoding services; here it queries Nominatim, the OpenStreetMap geocoder. This API allows us to send one request per second, limited to about 2,500 requests, which is why we keep only the rows with df4stat['MASS [kg]']>=2.

In [6]:
# Restrict the geocoding set to meteorites of at least 2 kg.
df4stat = df[df['MASS [kg]'] >= 2]
# 'Nan' is a text placeholder; the reverse-geocoding loop fills both columns in row by row.
df4stat = df4stat.assign(COUNTRY='Nan', CONTINENT='Nan')
df4stat.shape

(2268, 12)

In [7]:
from geopy.extra.rate_limiter import RateLimiter

continent_dict = {  # maps a continent code to the full continent name
    "NA": "North America",
    "SA": "South America",
    "AS": "Asia",
    "AF": "Africa",
    "OC": "Oceania",
    "EU": "Europe",
    "AQ": "Antarctica",
}

geolocator = Nominatim(user_agent='IBMDataScienceCapstone')  # project name

# Space the requests at least one second apart, the limit this notebook works with.
# swallow_exceptions=False keeps the original meaning of a None result ("nothing found
# at these coordinates"); a request that still fails after the retries raises instead
# of being silently recorded as Antarctica.
reverse_geocode = RateLimiter(
    geolocator.reverse,
    min_delay_seconds=1,
    swallow_exceptions=False,
)


def _continent_name(country_code):
    """Return the full continent name for an alpha-2 country code, or None if it cannot be mapped."""
    try:
        # pycountry_convert expects the code in upper case.
        return continent_dict[pc.country_alpha2_to_continent_code(country_code.upper())]
    except KeyError:
        # Empty code (no country in the address) or a code without a continent mapping.
        return None


# One request per row: every lookup is a separate network call, so the loop cannot be
# vectorised. Expect roughly one second per row.
for idx, row in df4stat.iterrows():
    location = reverse_geocode([row['LAT'], row['LONG']], exactly_one=True, language='en')

    if location is None:  # sometimes geopy has trouble with Antarctica coordinates
        df4stat.loc[idx, 'COUNTRY'] = "Antarctica"  # not a country, but the best label available
        df4stat.loc[idx, 'CONTINENT'] = "Antarctica"
        continue

    # location.raw holds a lot of detail; only the country and its code are needed here.
    address = location.raw.get('address', {})
    df4stat.loc[idx, 'COUNTRY'] = address.get('country', '')

    continent = _continent_name(address.get('country_code', ''))
    if continent is not None:
        df4stat.loc[idx, 'CONTINENT'] = continent
    # Otherwise the row keeps its existing CONTINENT value rather than aborting the whole run.

df4stat.head()

Unnamed: 0,NAME,ID,NAME_TYPE,CLASS,MASS [kg],FALL,YEAR,LAT,LONG,COORDINATE,COUNTRY,CONTINENT
2,Abee,6,Valid,EH4,107.0,Fell,1952,54.2167,-113.0,"(54.21667, -113.0)",Canada,North America
5,Adhi Kot,379,Valid,EH4,4.24,Fell,1919,32.1,71.8,"(32.1, 71.8)",Pakistan,Asia
11,Aïr,424,Valid,L6,24.0,Fell,1925,19.0833,8.38333,"(19.08333, 8.38333)",Niger,Africa
15,Akwanga,432,Valid,H,3.0,Fell,1959,8.91667,8.43333,"(8.91667, 8.43333)",Nigeria,Africa
16,Akyumak,433,Valid,"Iron, IVA",50.0,Fell,1981,39.9167,42.8167,"(39.91667, 42.81667)",Turkey,Asia


##### If you want to process the whole dataset, you can try to be sneaky by chopping up the data, randomizing the Nominatim user agent and sleeping between requests. I leave my attempt below.

In [8]:
# NOTE(review): disabled sketch, kept for reference only. Problems visible in the code below
# that must be fixed before it can run:
#   - `time.sleep` is called, but only `sleep` is imported from `time`;
#   - `wait_time_batch` is never defined;
#   - the country is written (upper-cased) to the 'CONTINENT' column instead of 'COUNTRY',
#     and is then overwritten by the continent name;
#   - `len(df4stat.index)%2000` is a remainder, presumably not the intended chunk size
#     (it is 0 whenever the row count is a multiple of 2000) - confirm;
#   - rotating the user agent to get around request limits presumably conflicts with the
#     geocoding service's usage policy - confirm before using.
# import random
# from time import sleep

# def chunker(seq, size):
#     return (seq[pos:pos + size] for pos in range(0, len(seq), size))
#
# wait_time = 1
# time.sleep(random.randint(1*100,wait_time_batch*100)/100) 
# batch_number = len(df4stat.index)%2000
# rV = []
# for i in chunker(df4stat, batch_number):
#     geolocator = Nominatim(user_agent='trial'+str(random.randint(0,1000))) 
#     for idx, row in i.iterrows():   
#         location = geolocator.reverse([row['LAT'], row['LONG']], exactly_one=True)
#         if location is None:
#             i.loc[idx, 'COUNTRY'] = "Antarctica"
#             i.loc[idx, 'CONTINENT'] = "Antarctica"
#         else:
#             address = location.raw['address']
#             i.loc[idx, 'CONTINENT'] = address.get('country', '').upper()
#
#             continent_code = address.get('country_code', '').upper()
#             i.loc[idx, 'CONTINENT']  = continent_dict[pc.country_alpha2_to_continent_code(continent_code)] 
#             time.sleep(random.randint(1*100,wait_time*100)/100)     
#     rV.append(i)
# new_df = pd.DataFrame(np.reshape(rV,df4stat.shape))
# new_df.head()

##### Good practice is to save the processed dataset (case study: I got banned for calling the geocoding API too many times).

In [12]:
# Write the geocoded frame to disk (index included) so the lookups do not have to be repeated.
df4stat.to_csv(path_or_buf='processed_meteorite_landings.csv')

# **Data visualization**

##### Now let's create a folium map.

In [21]:
# World map with one red dot per meteorite; the dot grows with the mass and its popup
# shows name, year and ID.
# The map is called `meteorite_map` so the builtin `map` stays usable.
meteorite_map = folium.Map(location=[40, 30], zoom_start=2)

for _, row in df4stat.iterrows():

    # Popup layout. Named `popup_html`, not `html`: `html` is the alias of the Dash
    # HTML components module imported at the top of the file, and rebinding it to a
    # string here would break `html.Div(...)` in the dashboard layout.
    popup_html = f'''
            <html>
                <body>
                    <div>
                        <p> Name:{row["NAME"]}<br> Year:{row["YEAR"]}<br> ID:{row["ID"]}</p>
                    </div>
                </body>
            </html>
            '''

    circle = folium.Circle(
        [row['LAT'], row['LONG']],
        radius=row['MASS [kg]'] * 10,  # heavier meteorite -> larger dot
        color='red',
        stroke=False,
        fill=True,
        fill_opacity=0.75,
    )
    circle.add_child(folium.Popup(html=popup_html, max_width=800))
    meteorite_map.add_child(circle)

# Small overview map that can be toggled on and off.
minimap = plugins.MiniMap(toggle_display=True, width=300, height=100)
meteorite_map.add_child(minimap)
meteorite_map

##### The next step is to create an interactive data visualization using Dash.

In [22]:
# Dashboard application, styled with the LUX theme.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.LUX])

# Helper frame for the dashboard: number of meteorites per (continent, year) pair.
df4dash = (
    df4stat
    .groupby(['CONTINENT', 'YEAR'])['ID']
    .size()
    .reset_index(name='COUNT')
)

# First and last year present in the data; they bound the year slider.
ds_min = df4dash['YEAR'].min()
ds_max = df4dash['YEAR'].max()

# Slider marks: one label every ten years, counted from the first year
# (range() stops before ds_max, so the last year gets no mark of its own).
marks = {
    mark_year: {
        "label": f"{mark_year}",
        "style": {"transform": "rotate(0deg)", "white-space": "nowrap"},
    }
    for mark_year in range(ds_min, ds_max, 10)
}

df4dash.head()

Unnamed: 0,CONTINENT,YEAR,COUNT
0,Africa,1903,4
1,Africa,1907,1
2,Africa,1909,1
3,Africa,1910,1
4,Africa,1911,1


In [35]:
# The page is assembled from named pieces, top to bottom:
# headline, filter row, overview charts, continent dropdown, detail charts.

# Headline.
page_title = html.H1(
    'Meteorite Landings from NASA dataset',
    style={'textAlign': 'center', 'color': '#503D36', 'font-size': 75},
)

# Continent checklist (input of the bar chart); every continent starts ticked.
continent_checklist = dcc.Checklist(
    id='my-checklist',
    value=df4dash['CONTINENT'].unique(),
    options=[
        {'label': continent, 'value': continent}
        for continent in sorted(df4stat['CONTINENT'].unique())
    ],
    style={'font-size': '20px'},
    labelClassName='mr-3',
    inline=True,
)

# Year range slider (input of the heat map); starts with the full range selected.
year_label = html.Label(
    "Select Year",
    htmlFor="years",
    style={'font-size': '20px', 'width': '50%'},
)
year_slider = dcc.RangeSlider(
    id='my-slider',
    min=ds_min,
    max=ds_max,
    step=10,
    value=[df4stat['YEAR'].min(), df4stat['YEAR'].max()],  # initially selected range
    updatemode='drag',
    tooltip={"placement": 'bottom', "always_visible": True},
    marks=marks,  # slider marks prepared beforehand
)

# Checklist and slider side by side, half of the width each.
filter_row = html.Div(
    [
        html.Div([continent_checklist], style={'width': '50%', "margin": "15px"}),
        html.Div([year_label, year_slider], style={'width': '50%'}),
    ],
    style={'display': 'flex'},
)

# Overview charts: bar chart and heat map; the figures are filled in by callbacks.
overview_row = html.Div(
    [
        dcc.Graph(id='my-hist', figure={}, style={'width': '50%'}),
        dcc.Graph(id='my-heat', figure={}, style={'width': '50%'}),
    ],
    style={'display': 'flex'},
)

# Continent dropdown (input of the scatter plot and the pie chart).
continent_dropdown = dcc.Dropdown(
    id='my-dropdown',
    value=['North America', 'Europe', 'Asia'],  # initial selection
    multi=True,  # several continents can be picked at once
    options=[
        {'label': continent, 'value': continent}
        for continent in df4stat['CONTINENT'].unique()
    ],
)

# Behaviour switches of the scatter plot.
scatter_config = {
    'staticPlot': False,      # False keeps the graph interactive
    'scrollZoom': True,       # zoom with the mouse wheel
    'doubleClick': 'reset',   # double click resets the zoom
    'showTips': False,        # no usage tips for the user
    'displayModeBar': True,   # show the mode bar
    'watermark': True,        # plotly watermark
    'modeBarButtonsToRemove': ['select2d'],  # mode bar buttons to hide
}

# Detail charts: pie chart and scatter plot.
detail_row = html.Div(
    [
        dcc.Graph(id='my-pie', figure={}, style={'width': '50%'}, className='six columns'),
        dcc.Graph(
            id='my-scatter',
            figure={},
            clickData=None,
            hoverData=None,
            config=scatter_config,
            style={'width': '50%'},
            className='six columns',
        ),
    ],
    style={'display': 'flex'},
)

app.layout = html.Div([
    page_title,
    filter_row,
    overview_row,
    continent_dropdown,
    detail_row,
])

In [30]:
@app.callback(
    Output(component_id='my-hist', component_property='figure'),
    [Input(component_id='my-checklist', component_property='value')]
)
def update_hist(continents):
    """Draw the bar chart of meteorite counts for the ticked continents.

    Args:
        continents: continent names currently ticked in 'my-checklist';
            None is treated as "nothing ticked".

    Returns:
        A plotly bar figure with one bar per ticked continent, showing how
        many rows of df4stat belong to it.
    """
    if continents is None:
        continents = []

    # Filter first, then count: only the requested continents are aggregated.
    selected = df4stat[df4stat['CONTINENT'].isin(continents)]
    df_hist = selected.groupby(['CONTINENT'])['ID'].count().reset_index()

    fig = px.bar(x=df_hist['CONTINENT'], y=df_hist['ID'], labels={'x':'Continent', 'y':'Count'})
    return fig

In [31]:
@app.callback(
    Output(component_id='my-heat', component_property='figure'),
    [Input(component_id='my-slider', component_property='value')]
)
def update_graph(years):
    """Draw the continent-by-year heat map for the selected year range.

    NOTE: the scatter callback further down reuses the name `update_graph`;
    each function is handed to `app.callback` by its own decorator.

    Args:
        years: two-item sequence from 'my-slider', [first year, last year],
            both inclusive.

    Returns:
        A plotly heat map of meteorite counts (continents as rows, years as
        columns), or an empty figure dict when no meteorite falls in the range.
    """
    dff = df4stat[df4stat['YEAR'].between(years[0], years[1])]
    if dff.empty:
        # Nothing to pivot: leave the graph blank, like the `figure={}` it starts with.
        return {}

    counts = dff.groupby(['CONTINENT', 'YEAR'])['ID'].count().reset_index()

    # Continents as rows, years as columns; pairs without any record count as 0.
    grid = counts.pivot(index='CONTINENT', columns='YEAR', values='ID').fillna(0)
    fig = px.imshow(grid, x=grid.columns, y=grid.index)
    fig.update_layout(title=f"Heat map for years: {years}", xaxis_title='year', yaxis_title='cont')
    return fig

In [32]:
@app.callback(
    Output(component_id='my-scatter', component_property='figure'),
    Input(component_id='my-dropdown', component_property='value'),
)
def update_graph(continent_chosen):
    """Plot the yearly meteorite counts of the continents picked in the dropdown.

    Args:
        continent_chosen: continent names selected in 'my-dropdown'.

    Returns:
        A plotly scatter figure (YEAR against COUNT), coloured by continent.
    """
    is_chosen = df4dash['CONTINENT'].isin(continent_chosen)
    return px.scatter(
        data_frame=df4dash[is_chosen],
        x='YEAR',
        y='COUNT',
        color='CONTINENT',
    )

In [33]:
# Dash version 1.16.0 or higher
@app.callback(
    Output(component_id='my-pie', component_property='figure'),
    Input(component_id='my-scatter', component_property='hoverData'),
    Input(component_id='my-scatter', component_property='clickData'),
    Input(component_id='my-scatter', component_property='selectedData'),
    Input(component_id='my-dropdown', component_property='value')
)
def update_side_graph(hov_data, clk_data, slct_data, continent_chosen):
    """Draw the pie chart of continent shares for a single year.

    The year is the one under the mouse pointer in the scatter plot; before
    the first hover, the earliest year in the data (ds_min) is used.

    Args:
        hov_data: hover payload of 'my-scatter', or None if nothing was hovered yet.
        clk_data: click payload of 'my-scatter'; not read, but it must be
            accepted because it is declared as a callback input.
        slct_data: selection payload of 'my-scatter'; not read, same reason.
        continent_chosen: continent names selected in 'my-dropdown'.

    Returns:
        A plotly pie figure of COUNT per continent for that year, titled with the year.
    """
    year = ds_min if hov_data is None else hov_data['points'][0]['x']

    # Apply BOTH filters in both cases, so the data always matches the year in the title.
    shown = df4dash['CONTINENT'].isin(continent_chosen) & (df4dash['YEAR'] == year)
    return px.pie(
        data_frame=df4dash[shown],
        values='COUNT',
        names='CONTINENT',
        title=f'Year: {year}',
    )


In [None]:
# Start the Dash server only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    app.run_server()

# **Prediction model**

In [39]:
# One-hot encode FALL: the text column is replaced by one indicator column per value
# (FALL_Fell, FALL_Found), which the model can take as input.
df4pred = pd.get_dummies(data=df, columns=['FALL'])
df4pred.head()

Unnamed: 0,NAME,ID,NAME_TYPE,CLASS,MASS [kg],YEAR,LAT,LONG,COORDINATE,FALL_Fell,FALL_Found
1,Aarhus,2,Valid,H6,0.72,1951,56.1833,10.2333,"(56.18333, 10.23333)",True,False
2,Abee,6,Valid,EH4,107.0,1952,54.2167,-113.0,"(54.21667, -113.0)",True,False
3,Acapulco,10,Valid,Acapulcoite,1.91,1976,16.8833,-99.9,"(16.88333, -99.9)",True,False
4,Achiras,370,Valid,L6,0.78,1902,-33.1667,-64.95,"(-33.16667, -64.95)",True,False
5,Adhi Kot,379,Valid,EH4,4.24,1919,32.1,71.8,"(32.1, 71.8)",True,False


##### Let's pick the features and the target

In [41]:
# Model inputs: year, mass and the two FALL indicator columns.
features = ['YEAR', 'MASS [kg]', 'FALL_Fell', 'FALL_Found']
# Model outputs: the landing coordinates.
target = ['LAT', 'LONG']

# Rows missing any input or output value are dropped before modelling.
meteorites_df = df4pred.dropna(subset=[*features, *target])
X = meteorites_df.loc[:, features]
y = meteorites_df.loc[:, target]

##### Split our data into train and test sets and train the model

In [42]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest of 100 trees, fitted on the training rows.
# `y` holds two columns, so the model predicts LAT and LONG together.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

##### Start predicting 

In [48]:
# Predict the landing location for new data
new_data = pd.DataFrame([[2013, 5, 0, 1]], columns=features)  # Example input: year, reclat, reclong
predicted_location = model.predict(new_data)

print("Predicted Landing Location (Latitude, Longitude):")
print(predicted_location)

Predicted Landing Location (Latitude, Longitude):
[[ 19.020422 -62.582298]]
