In [290]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime
import glob
import os

sns.set_context("paper", font_scale=1.6)   

<img src="assets/Province.png" width="550"/>

In [200]:
NOAAIndex = {
        1:24,
        2:25,
        3:5,
        4:6,
        5:27,
        6:23,
        7:26,
        8:7,
        9:11,
        10:13,
        11:14,
        12:15,
        13:16,
        14:17,
        15:18,
        16:19,
        17:21,
        18:22,
        19:8,
        20:9,
        21:10,
        22:1,
        23:3,
        24:2,
        25:4, 
        26:12, #Kiev
        27:20 #Sevastopol
}

In [196]:
def preprocess_raw_data(line):
    if line.__contains__('/'):
        return ""    
    line = line.replace(' ', ',',2)    
    return (line + '\n')

In [476]:
def download_data(directory, index, minYear=1991, maxYear=2020):
    with requests.Session() as sess:
        url = "https://www.star.nesdis.noaa.gov/smcd/emb/vci/VH/get_provinceData.php?country=UKR&provinceID={}&year1={}&year2={}&type=Mean".format(NOAAIndex[index], minYear, maxYear)
        response = sess.get(url)
                
        path = os.path.join(directory, "province-{}.{}.csv".format(index,datetime.datetime.now().strftime("%d-%m-%Y_%H-%M")))
        for file in glob.glob(os.path.join(directory, "province-{}*.csv".format(index))):
            os.remove(file) #delete all previous data
            
        with open(path, 'w') as file:
            file.write("Year,Week,SMN,SMT,VCI,TCI,VHI\n")
            for line in response.iter_lines(chunk_size=512, decode_unicode=True):                                
                file.write(preprocess_raw_data(line)) #write line by line

In [477]:
for i in range(1,28):
    download_data("data", i)
    print("Province {} is downloaded".format(i))

Province 1 is downloaded
Province 2 is downloaded
Province 3 is downloaded
Province 4 is downloaded
Province 5 is downloaded
Province 6 is downloaded
Province 7 is downloaded
Province 8 is downloaded
Province 9 is downloaded
Province 10 is downloaded
Province 11 is downloaded
Province 12 is downloaded
Province 13 is downloaded
Province 14 is downloaded
Province 15 is downloaded
Province 16 is downloaded
Province 17 is downloaded
Province 18 is downloaded
Province 19 is downloaded
Province 20 is downloaded
Province 21 is downloaded
Province 22 is downloaded
Province 23 is downloaded
Province 24 is downloaded
Province 25 is downloaded
Province 26 is downloaded
Province 27 is downloaded


In [456]:
def load_all_data_to_pd(path):
    df = pd.DataFrame(columns=['Year','Week', 'SMN', 'SMT', 'VCI', 'TCI', 'VHI'])
    for file in glob.glob(os.path.join(path,"province-*.csv")):        
        temp = pd.read_csv(file, header=0)
        temp['Province'] = file.split('-')[1].split('.')[0] #specify province
        df = df.append(temp, ignore_index=True)
    
    #change types for memory saving
    df['Province'] = df['Province'].astype("int32")
    df['Year'] = df['Year'].astype("int32")
    df['Week'] = df['Week'].astype("int32")
    
    #change columns order
    columns = []
    columns.extend(df.columns.tolist()[:2])
    columns.append(df.columns.tolist()[-1])
    columns.extend(df.columns.tolist()[2:-1])
    df = df[columns]
    
    return df

In [457]:
df = load_all_data_to_pd("data")
df.sort_values(['Province','Year','Week'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39879 entries, 28063 to 5907
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      39879 non-null  int32  
 1   Week      39879 non-null  int32  
 2   Province  39879 non-null  int32  
 3   SMN       39879 non-null  float64
 4   SMT       39879 non-null  float64
 5   VCI       39879 non-null  float64
 6   TCI       39879 non-null  float64
 7   VHI       39879 non-null  float64
dtypes: float64(5), int32(3)
memory usage: 2.3 MB


In [459]:
df.head(10)

Unnamed: 0,Year,Week,Province,SMN,SMT,VCI,TCI,VHI
28063,1991,1,1,0.05,264.38,40.9,25.43,33.17
28064,1991,2,1,0.056,265.3,47.38,24.65,36.01
28065,1991,3,1,0.059,265.57,50.61,26.04,38.33
28066,1991,4,1,0.056,264.01,46.87,32.92,39.89
28067,1991,5,1,0.051,263.6,39.6,37.46,38.53
28068,1991,6,1,0.046,264.2,32.67,40.78,36.72
28069,1991,7,1,0.043,264.71,27.07,44.13,35.6
28070,1991,8,1,0.042,265.96,24.56,46.61,35.59
28071,1991,9,1,0.045,267.81,24.42,49.11,36.77
28072,1991,10,1,0.048,270.58,21.82,49.05,35.44


In [460]:
df.describe()

Unnamed: 0,Year,Week,Province,SMN,SMT,VCI,TCI,VHI
count,39879.0,39879.0,39879.0,39879.0,39879.0,39879.0,39879.0,39879.0
mean,2005.263372,26.398781,14.0,0.237638,283.064331,54.615915,41.404532,48.009796
std,8.428728,14.996015,7.788979,0.144877,14.390377,20.415539,21.503946,11.419106
min,1991.0,1.0,1.0,-0.005,231.71,0.0,0.01,5.52
25%,1998.0,13.0,7.0,0.1,270.91,39.515,24.58,40.54
50%,2005.0,26.0,14.0,0.221,286.69,55.82,40.09,47.41
75%,2013.0,39.0,21.0,0.368,295.22,70.89,56.385,55.19
max,2020.0,52.0,27.0,0.568,309.95,99.52,100.0,96.69


In [461]:
def vhi_by_year_and_province(df, year, province):    
    return df[(df['Year'] == year) & (df['Province'] == province)]['VHI']

def vhi_by_province(df, province):
    return df[df['Province'] == province]['VHI']

In [462]:
vhi_by_province(df, 1)

28063    33.17
28064    36.01
28065    38.33
28066    39.89
28067    38.53
         ...  
29535    45.19
29536    44.49
29537    43.65
29538    44.27
29539    47.83
Name: VHI, Length: 1477, dtype: float64

In [472]:
vhi_by_year_and_province(df, 2001, 1)

28561    36.50
28562    39.49
28563    43.40
28564    42.84
28565    41.60
28566    41.90
28567    41.91
28568    41.87
28569    43.17
28570    45.12
28571    46.20
28572    48.84
28573    48.75
28574    49.02
28575    51.67
28576    53.22
28577    55.06
28578    60.34
28579    65.01
28580    67.69
28581    69.84
28582    71.31
28583    72.03
28584    71.94
28585    70.20
28586    68.09
28587    66.56
28588    63.59
28589    60.43
28590    55.68
28591    50.90
28592    47.38
28593    43.52
28594    41.63
28595    42.63
28596    44.37
28597    45.74
28598    45.04
28599    46.07
28600    48.23
28601    51.53
28602    56.89
28603    62.87
28604    64.44
28605    63.71
28606    62.88
28607    62.88
28608    62.86
28609    62.81
28610    59.61
28611    57.62
28612    56.18
Name: VHI, dtype: float64

In [464]:
def years_by_VHI_Range_By_Province(df, province, minVHI, maxVHI, procentByYear):
    years = df[(df.Province == province)]['Year'].unique()
    results = []
    for year in years:
        series = vhi_by_year_and_province(df, year, province)
        percent = series[(series >= minVHI) & (series <= maxVHI)].count() * 100 / series.count()
        if(percent >= procentByYear):
            results.append(year)
    return results

In [465]:
years_by_VHI_Range_By_Province(df, 1, 0, 14, 7)

[2000]

In [466]:
years_by_VHI_Range_By_Province(df, 1, 15, 35, 7)

[1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1999,
 2000,
 2003,
 2007,
 2009,
 2012,
 2015,
 2016,
 2019]