In [1]:
import pandas as pd 
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
#Function to fetch table to data
def fetch_data_from_url(url):
    #gets the page content from the url 
    html = requests.get(url).content
    #read the table content of html 
    df_list = pd.read_html(html)
    #targets the first table from the list of table
    df = df_list[0]
    df.index += 1 
    return df

In [3]:
#a better way to do this 
urls = ["http://seismonepal.gov.np/earthquakes/1994",
       "http://seismonepal.gov.np/earthquakes/1995",
       "http://seismonepal.gov.np/earthquakes/1996",
       "http://seismonepal.gov.np/earthquakes/1997",
       "http://seismonepal.gov.np/earthquakes/1998",
       "http://seismonepal.gov.np/earthquakes/1999",
       "http://seismonepal.gov.np/earthquakes/2000",
       "http://seismonepal.gov.np/earthquakes/2001",
       "http://seismonepal.gov.np/earthquakes/2002",
       "http://seismonepal.gov.np/earthquakes/2003",
       "http://seismonepal.gov.np/earthquakes/2004",
       "http://seismonepal.gov.np/earthquakes/2005",
       "http://seismonepal.gov.np/earthquakes/2006",
       "http://seismonepal.gov.np/earthquakes/2007",
       "http://seismonepal.gov.np/earthquakes/2008",
       "http://seismonepal.gov.np/earthquakes/2009",
       "http://seismonepal.gov.np/earthquakes/2010",
       "http://seismonepal.gov.np/earthquakes/2011",
       "http://seismonepal.gov.np/earthquakes/2012",
       "http://seismonepal.gov.np/earthquakes/2013",
       "http://seismonepal.gov.np/earthquakes/2014",
       "http://seismonepal.gov.np/earthquakes/2015",
       "http://seismonepal.gov.np/earthquakes/2016",
       "http://seismonepal.gov.np/earthquakes/2017",
       "http://seismonepal.gov.np/earthquakes/2018",
       "http://seismonepal.gov.np/earthquakes/2019",]

#getting all the data from 1994-2019 in one place     
result = []
for url in urls:
    result.append(fetch_data_from_url(url))

In [415]:
#concatenate all the table from result array to one dataframe
df = pd.concat(result)

In [416]:
df = df.reset_index()

In [417]:
df.columns

Index(['index', 'Date', 'Time', 'Latitude', 'Longitude', 'Magnitude(ML)',
       'Remarks', 'Epicenter', 'Form'],
      dtype='object')

In [418]:
#dropping the un-necessary columns
df = df.drop(['Form','Remarks','index'],axis = 1)

In [419]:
# check data type of each columns
df.columns = ['Date','Time','Lat',"Long","Magnitude","Epicenter"]

In [420]:
df.head()

Unnamed: 0,Date,Time,Lat,Long,Magnitude,Epicenter
0,B.S:2051-8-28 A.D:1994-12-13,Local:11:00UTC:N/A,28.7,82.88,4.6,Rukum
1,B.S:2051-8-27 A.D:1994-12-12,Local:11:00UTC:N/A,29.84,80.69,4.6,Darchula
2,B.S:2051-8-12 A.D:1994-11-27,Local:11:00UTC:N/A,29.72,81.56,4.5,Bajura
3,B.S:2051-8-6 A.D:1994-11-21,Local:11:00UTC:N/A,29.54,81.15,4.2,Bajhang
4,B.S:2051-7-8 A.D:1994-10-24,Local:11:00UTC:N/A,28.92,82.0,4.7,Jajarkot


In [421]:
#change the Date column to a proper format(removing the nepali date)
dates = []
for line in df.Date:
#   Split the two dates given 
    date = line.split()
    date = line.split(":")
    del date[:2]
    dates.append(date)

In [422]:
# Change the Time column to standard format
Times = []
for line in df.Time:
    Time = line.split("Local:")  
    del Time[0]
    for time_line in Time:
        Time = time_line.split('UTC:')
        del Time[1]
#         print(Time)
    Times.append(Time)

In [423]:
#Covert the date and time column to pandas dataframe
Times = pd.DataFrame(Times)
Times.columns =['Time']
Dates = pd.DataFrame(dates)
Dates.columns=['Date']

In [424]:
# Concatenate the date and time dataframe together separated by space
Dates['DateTime'] = Dates["Date"]+ " " + Times["Time"].map(str)

In [425]:
len(Dates)

965

In [426]:
# checking to see if the two dates in the two dataframe match
test = pd.DataFrame({'x':Dates.DateTime, 'y':df.Date})

In [427]:
#Convert the DateTime column to datetime type
Dates['DateTime'] = pd.to_datetime(Dates['DateTime'],  errors='coerce')

In [428]:
# Merge the datetime column to the main dataframe
df['DateTime'] = Dates['DateTime']

In [429]:
#Drop the columns not required
df = df.drop(['Date','Time'],axis = 1)

In [434]:
df.head()

Unnamed: 0,index,Lat,Long,Magnitude,Epicenter,DateTime
0,9,27.75,86.16,5.1,Dolakha,1994-06-25 05:45:00
1,8,29.37,81.52,5.5,Bajura,1994-07-17 11:00:00
2,7,26.09,79.51,6.0,India,1994-08-31 11:00:00
3,6,28.34,87.35,4.8,Sankhuwasabha,1994-09-25 11:00:00
4,5,29.0,82.26,4.6,Jajarkot,1994-10-22 11:00:00


In [431]:
df = df.sort_values(by='DateTime')

In [433]:
df = df.reset_index()

In [447]:
#Check where data is null
print(np.where(pd.isnull(df)))
print(np.where(df.Epicenter == "Kailali"))
print(np.where(df.Epicenter == "Solukhumbu"))
print(np.where(df.Epicenter == "Niseldhar, Baglung"))
df[124:127]

(array([], dtype=int64), array([], dtype=int64))
(array([91, 92, 97]),)
(array([ 24,  37,  42,  50,  51,  89, 151, 243, 360]),)
(array([124, 125, 126]),)


Unnamed: 0,index,Lat,Long,Magnitude,Epicenter,DateTime
139,141,28.58,82.96,4.3,"Niseldhar, Baglung",2004-05-29 11:00:00
140,142,28.55,82.96,4.9,"Niseldhar, Baglung",2004-05-29 11:00:00
141,143,28.59,82.98,4.3,"Niseldhar, Baglung",2004-05-29 11:00:00


In [440]:
# Converted the empty to India because the coordianted were in India, one of the Kailali coordiantes was wrong
df.Epicenter[25]="India"
np.where(df.Epicenter == "Kailali")
df.Long[108] = 80.75
df.Long[167] = 86.842
df.Long[139] = 82.96

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [441]:
#Drop any row with epicenters outside Nepal
check = ['India',"Bangladesh","Bhuj/India","Bhutan","Chamauli/India","Tibet","Bangaladesh","Pithoragarh","Sikkim"
        ,"India (close to Darchula)", "Jhapa-India Border Region"]

for place in check:
    for epicenter in df.Epicenter:
        df = df[df.Epicenter != place]

In [442]:
df.head()


Unnamed: 0,index,Lat,Long,Magnitude,Epicenter,DateTime
0,9,27.75,86.16,5.1,Dolakha,1994-06-25 05:45:00
1,8,29.37,81.52,5.5,Bajura,1994-07-17 11:00:00
3,6,28.34,87.35,4.8,Sankhuwasabha,1994-09-25 11:00:00
4,5,29.0,82.26,4.6,Jajarkot,1994-10-22 11:00:00
5,4,28.92,82.0,4.7,Jajarkot,1994-10-24 11:00:00


In [448]:
df.to_csv('earthquake.csv')

In [445]:
df[139:142]

Unnamed: 0,index,Lat,Long,Magnitude,Epicenter,DateTime
154,171,27.76,86.07,5.0,Dolakha,2005-02-08 11:00:00
155,170,28.27,84.39,5.0,Lamjung,2005-03-05 11:00:00
156,169,28.25,84.39,5.4,Lamjung,2005-03-19 11:00:00
