# In [None]:
#coding=utf8

import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

#Q1 Significant earthquakes since 2150 B.C.
def CountEq_LargestEq(path):
    """Summarise the earthquake table stored at *path*.

    The file is read as tab-separated text and must provide the columns
    used below (``Country``, ``Deaths``, ``Ms``, ``Year`` and the date /
    location columns printed in Q1-3(2)).

    * Q1-1: prints the 20 countries with the largest summed ``Deaths``.
    * Q1-2: plots, per ``Year``, the number of rows whose ``Ms`` exceeds 3.0.
    * Q1-3(1): prints, per country, the number of rows with a ``Year`` value.
    * Q1-3(2): prints, per country, the row(s) carrying that country's
      maximum ``Ms``, ordered by ``Ms`` descending.

    Args:
        path: Location of the tab-separated earthquake file.

    Returns:
        None. Results are printed and displayed as a matplotlib figure.
    """
    df = pd.read_csv(path, sep="\t")

    # Q1-1
    deaths_by_country = df.groupby("Country")["Deaths"].sum()
    top20_deaths = deaths_by_country.sort_values(ascending=False).head(20)
    print("Q1-1: The top 20 countries along with the total number of deaths:\n", top20_deaths)

    # Q1-2
    yearly_counts = df[df["Ms"] > 3.0].groupby("Year")["Ms"].count()
    plt.figure(figsize=(9, 6))
    plt.xlabel("Year")
    plt.ylabel("Total number of earthquakes (magnitude>3.0 )")
    plt.plot(yearly_counts.index, yearly_counts.values,
             linestyle='-', marker='o', color='b', markersize=2)
    plt.title("Q1-2: Fig. Time series of total number of earthquakes (magnitude>3.0)")
    plt.show()
    # Release the figure, as every other plotting function in this file does.
    plt.close()
    print("It seems the frequency of earthquake increasing with time, but it may be caused by lacking of record in the past or geologically active periods alternate with geological intermittent periods.")

    # Q1-3 (1)
    quakes_per_country = df.groupby("Country")["Year"].count()
    print("Q1-3(1): Total number of earthquakes since 2150 B.C. in a given country:\n", quakes_per_country)

    # Q1-3 (2)
    # Broadcast each country's maximum Ms back onto its rows and keep the rows
    # that reach it. This retains ties and skips rows with a missing Ms or
    # Country, without running a Python lambda once per group.
    country_max_ms = df.groupby("Country")["Ms"].transform("max")
    largest_per_country = (
        df[df["Ms"] == country_max_ms]
        .set_index("Country")  # keep the country visible in the printed table
        .sort_values("Ms", ascending=False)
    )
    shown_columns = ["Ms", "Year", "Mo", "Dy", "Hr", "Mn", "Sec",
                     "Location Name", "Latitude", "Longitude"]
    print("Q1-3(2): Date and location of the largest earthquake ever happened in every country (Descending)\n",
          largest_per_country[shown_columns])

#Q2 Air temperature in Shenzhen during the past 25 years
def monthlyAverage(path, missing_value=9999):
    """Plot the monthly mean of the ``TMP`` column of the CSV at *path*.

    Only the ``DATE`` and ``TMP`` columns are read. The first seven
    characters of ``DATE`` are used as the month key, and ``TMP`` is parsed
    as an integer after dropping its last two characters.
    NOTE(review): this presumably matches the NOAA ISD layout, i.e. a
    ``YYYY-MM...`` timestamp and a ``<signed value>,<quality flag>``
    temperature scaled by 10 -- confirm against the dataset documentation.

    The input file is only read; it is never written back.

    Args:
        path: Location of the weather CSV file.
        missing_value: Integer sentinel in ``TMP`` marking a missing
            observation. Matching readings are excluded from the monthly
            mean instead of being averaged in as real temperatures.

    Returns:
        None. The result is shown as a matplotlib figure and a short
        conclusion is printed.
    """
    df = pd.read_csv(path, usecols=["DATE", "TMP"], low_memory=False)

    month = df["DATE"].str[:7]
    temperature = df["TMP"].str[:-2].astype(int)
    # Mask the missing-data sentinel so it cannot distort the averages;
    # mean() skips the resulting NaN values.
    temperature = temperature.where(temperature != missing_value)
    monthly_mean = temperature.groupby(month).mean()

    plt.figure(figsize=(16, 9))
    ax = plt.gca()
    # The x values are month strings, so label only every 9th one.
    ax.xaxis.set_major_locator(plt.MultipleLocator(9))
    plt.xlabel("Month")
    plt.xticks(rotation=45, fontsize=8)
    plt.ylabel("Monthly averaged air temperature (Scaling factor: 10)")
    plt.plot(monthly_mean.index.tolist(), monthly_mean.tolist(),
             linestyle='-', marker='o', color='b', markersize=3)
    plt.title("Q2: Fig. Monthly averaged air temperature against the observation time")
    plt.show()
    plt.close()
    print("Excluding outliers, the average temperature shows a cyclical trend.")

#Q3 Global collection of hurricanes
def _load_storm_table(path):
    """Read the first 17 columns of the storm CSV at *path* into a DataFrame."""
    df = pd.read_csv(path,
                     usecols=range(17),
                     skiprows=[1, 2],
                     na_values=['NOT_NAMED', ' ', ''],
                     # pandas' default NA strings include "NA", which would
                     # silently turn a basin coded 'NA' into a missing value.
                     keep_default_na=False,
                     low_memory=False,
                     index_col=False)
    # Parse once, vectorised, and keep only the calendar day: the daily
    # statistics below group on this column.
    df['ISO_TIME'] = pd.to_datetime(df['ISO_TIME'],
                                    format='%Y-%m-%d %H:%M:%S').dt.normalize()
    return df


def _strongest_record_per_storm(df):
    """Return one row per SID holding its maximum WMO_WIND, strongest first."""
    winds = df[["SID", "NAME", "WMO_WIND"]].copy()
    winds["WMO_WIND"] = pd.to_numeric(winds["WMO_WIND"], errors="coerce")
    return (winds.dropna(subset=["WMO_WIND"])
                 .sort_values("WMO_WIND", ascending=False)
                 .drop_duplicates(subset=["SID"], keep="first"))


def _finish_lon_lat_figure(title):
    """Label the current axes as a longitude/latitude map, then show and close it."""
    plt.xlabel("Longitude(E)")
    plt.ylabel("Latitude(N)")
    plt.tick_params(top=True, right=True, which='both')
    plt.title(title)
    plt.show()
    plt.close()


def stormDataAnalyze(path):
    """Run the Q3 storm-track analyses on the CSV file at *path*.

    The file must provide, within its first 17 columns, the columns used
    below: ``SID``, ``SEASON``, ``BASIN``, ``NAME``, ``ISO_TIME``,
    ``NATURE``, ``LAT``, ``LON`` and ``WMO_WIND``.

    * Q3-1: prints the 10 storms with the highest maximum ``WMO_WIND``.
    * Q3-2: bar chart of the (up to) 20 highest per-storm maxima.
    * Q3-3: bar chart of the number of records per ``BASIN``.
    * Q3-4: hexbin of all record positions (longitude on x, latitude on y).
    * Q3-5: scatter of the records named MANGKHUT with ``SEASON`` >= 2018.
    * Q3-6: keeps the records from 1970 onwards in basins WP and EP.
    * Q3-7: line plot of the number of those records per day.
    * Q3-8: prints the day of year of each of those records.
    * Q3-9: per day, counts the records whose ``NATURE`` is neither
      ``NR`` nor ``MX``.
    * Q3-10: sums the Q3-9 counts per calendar year and plots them.

    Args:
        path: Location of the storm CSV file.

    Returns:
        None. Results are printed and displayed as matplotlib figures.
    """
    df = _load_storm_table(path)

    # Q3-1
    strongest = _strongest_record_per_storm(df)
    print("Q3-1: Names (NAME) of the 10 largest hurricanes according to wind speed (WMO_WIND):\n", strongest.head(10))

    # Q3-2
    top_winds = strongest["WMO_WIND"].head(20).tolist()
    # Size the x positions from the data so fewer than 20 storms still plot.
    ranks = np.arange(1, len(top_winds) + 1)
    plt.figure(figsize=(9, 6))
    plt.xlabel("Top 20 of the Strongest Hurricane")
    plt.ylabel("Wind Speed")
    plt.bar(ranks, top_winds)
    plt.title("Q3-2: Fig. Top 20 of the Strongest Hurricanes")
    plt.show()
    plt.close()

    # Q3-3
    records_per_basin = df.groupby('BASIN')['SID'].count()
    plt.figure(figsize=(9, 6))
    plt.xlabel("Basin")
    plt.ylabel("Number of datapoints")
    plt.bar(records_per_basin.index, records_per_basin.values)
    plt.title("Q3-3: Fig. Number of datapoints by Basin")
    plt.show()
    plt.close()

    # Q3-4
    # The rcParams are global: they also affect every figure created later.
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    # Longitude belongs on the x axis to match the axis labels.
    plt.hexbin(df['LON'], df['LAT'], gridsize=50, cmap='Greens')
    _finish_lon_lat_figure('Q3-4: Fig. Location of datapoints in Latitude and Longitude')

    # Q3-5
    mangkhut = df[(df['NAME'] == 'MANGKHUT') & (df['SEASON'] >= 2018)]
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.scatter(mangkhut['LON'], mangkhut['LAT'])
    _finish_lon_lat_figure('Q3-5: Fig. Typhoon Mangkhut (from 2018) track')

    # Q3-6
    since_1970 = df['ISO_TIME'] >= datetime(1970, 1, 1)
    data = df[since_1970 & df['BASIN'].isin(['WP', 'EP'])]

    # Q3-7
    records_per_day = data.groupby('ISO_TIME')['SID'].count()
    plt.figure(figsize=(9, 6))
    plt.xlabel("Date")
    plt.ylabel("Number of datapoints")
    plt.plot(records_per_day.index, records_per_day.values,
             lw=0.5, linestyle='-', color='b')
    plt.title("Q3-7: Fig. Number of datapoints per day")
    plt.show()
    plt.close()

    # Q3-8
    dayofyear(data)

    # Q3-9
    # A record counts as 1 unless its NATURE is 'NR' or 'MX'; a missing
    # NATURE therefore also counts as 1.
    counted = (~data['NATURE'].isin(['NR', 'MX'])).astype(int)
    daily_counts = counted.groupby(data['ISO_TIME']).sum()

    # Q3-10
    yearly_counts = daily_counts.groupby(daily_counts.index.year).sum()
    if not yearly_counts.empty:
        # Keep years without any record as explicit zeros in the series.
        first_year = int(yearly_counts.index.min())
        last_year = int(yearly_counts.index.max())
        yearly_counts = yearly_counts.reindex(range(first_year, last_year + 1),
                                              fill_value=0)
    plt.figure(figsize=(9, 6))
    ax = plt.gca()
    ax.xaxis.set_major_locator(plt.MultipleLocator(2))
    plt.xlim(1969, 2023)
    plt.xticks(rotation=45, fontsize=7)
    plt.xlabel("Year")
    plt.ylabel("Number")
    plt.plot(yearly_counts.index.tolist(), yearly_counts.tolist(),
             lw=1, linestyle='-', color='b')
    plt.title("Q3-10: Fig. Number of hurricane(year)")
    plt.show()
    plt.close()
    print("As the figure shows, year of 1971, 1978, 1992, 1994, 2015 stand out as having anomalous hurricane activity")

def dayofyear(data):
    """Print and return the day-of-year number of each ``ISO_TIME`` value.

    January 1st is day 1, matching the printed heading (a plain difference
    to January 1st would start at 0).

    Args:
        data: DataFrame with an ``ISO_TIME`` column holding datetimes, or
            values that ``pandas.to_datetime`` can convert.

    Returns:
        A Series, indexed like *data*, with the 1-based day of the year.
    """
    day_numbers = pd.to_datetime(data['ISO_TIME']).dt.dayofyear
    print("Q3-8: Sequential day number starting with day 1 on January 1st\n", day_numbers)
    return day_numbers

#Q4 Explore a data set, I choose ICESat2 dataset for analysis.
def exploreData(path):
    """Explore the water-surface-elevation CSV at *path* (Q4).

    The file must provide a ``DATE`` column (parsed as datetimes) and the
    numeric columns ``WSE`` and ``SRTM``. A ``WSE`` value of 0 is read as
    missing.

    * Q4-2: plots the mean ``WSE`` per calendar year.
    * Q4-3: prints mean, max, median, variance and standard deviation of
      ``WSE`` and ``SRTM``, followed by a short conclusion.

    Args:
        path: Location of the CSV file.

    Returns:
        None. Results are printed and displayed as a matplotlib figure.
    """
    # Q4-1
    df = pd.read_csv(path,
                     parse_dates=['DATE'],
                     na_values={'WSE': 0},
                     low_memory=False,
                     index_col=False)

    # Q4-2
    yearly_mean = df.groupby(df['DATE'].dt.year)['WSE'].mean()
    if not yearly_mean.empty:
        # dt.year is float when some dates are missing; normalise to int and
        # keep years without data as explicit gaps (NaN) in the series.
        yearly_mean.index = yearly_mean.index.astype(int)
        first_year = int(yearly_mean.index.min())
        last_year = int(yearly_mean.index.max())
        yearly_mean = yearly_mean.reindex(range(first_year, last_year + 1))
    plt.figure(figsize=(9, 6))
    ax = plt.gca()
    ax.xaxis.set_major_locator(plt.MultipleLocator(1))
    plt.xlim(2017.5, 2021.5)
    plt.xlabel("Year")
    plt.ylabel("Mean of WSE")
    plt.plot(yearly_mean.index.tolist(), yearly_mean.tolist(),
             lw=1, linestyle='-', color='b')
    plt.title("Q4-2: Fig. Mean of WSE(year)")
    plt.show()
    plt.close()

    # Q4-3
    # String names select the pandas implementations (sample variance and
    # standard deviation, NaN-skipping); passing the numpy callables instead
    # is deprecated in pandas.
    statistics = ['mean', 'max', 'median', 'var', 'std']
    indicators = df.agg({'WSE': statistics, 'SRTM': statistics})
    print(indicators)
    print("After add the geoid to WSE values, precision of ICESat2 is better than SRTM and mean of water level is increasing by year.")

if __name__ == "__main__":
    # Each question pairs one data file, looked up in the current working
    # directory, with the function that analyses it. Entries run in order:
    #   Q1 Significant earthquakes since 2150 B.C.
    #   Q2 Air temperature in Shenzhen during the past 25 years
    #   Q3 Global collection of hurricanes
    #   Q4 Explore a data set
    questions = (
        ("earthquakes-2022-10-24_11-09-34_+0800.tsv", CountEq_LargestEq),
        ("Baoan_Weather_1998_2022.csv", monthlyAverage),
        ("ibtracs.ALL.list.v04r00.csv", stormDataAnalyze),
        ("WSE.csv", exploreData),
    )
    for filename, analyse in questions:
        path = os.path.join(os.getcwd(), filename)
        analyse(path)
