In [11]:
import datetime
import pandas as pd
import ast
import numpy as np
from scipy.spatial import distance
from tslearn.metrics import dtw,lcss

In [12]:
def advancedfilters(cube,minPopulation,continentCheck,startDate,endDate):
    cube.fillna(0,inplace=True)
    dropped = []
    for i in range(1,len(cube.columns)):
        if(cube.iloc[-1][i] not in continentCheck or int(float(cube.iloc[-2][i])) <= minPopulation):
            dropped.append(cube.columns[i])
    cube.drop(columns=dropped,inplace=True)        
    cube.drop(cube.tail(2).index,inplace=True)
    cube.set_index(cube.columns[0],inplace=True)
    cube.index = pd.to_datetime(cube.index)
    cube = cube.loc[startDate:endDate]
    cube.index.names = ['Date']
    return cube

def slicer(cube,indicator):
    for i in cube.columns:
        for j in range(len(cube)):
            try:
                cube[i].iloc[j] = ast.literal_eval(cube[i].iloc[j])[indicator]
            except:
                continue
    return cube

def time_series(a,period):
    count = period - 1 
    temp_lst = []
    master_lst = []
    column_names = []
    all_lst = []
    for i in a.columns: 
        for k in range(count,len(a)):
            while count >= 0: 
                temp_lst.append(a[i][k-count])
                count = count-1
            master_lst.append(temp_lst)
            column_names.append(a.index[k])
            count = period-1
            temp_lst = []
        column_label = i 
        temp_sr = pd.Series(master_lst,column_names)
        temp_df = pd.DataFrame(data = temp_sr,columns=[column_label])
        all_lst.append(temp_df)
   
        master_lst = []
        column_names = []
        master = pd.concat(all_lst,axis=1)
    return master


def distfunc(target,comp,method):
    target = np.array(target)
    comp = np.array(comp)
    if method == 'euclidean': 
        return distance.euclidean(target,comp)
    if method == 'manhattan':
        return distance.manhattan(target,comp) 
    if method == 'chebyshev':
        return distance.chebyshev(target,comp)
    if method == 'dtw':
        return dtw(target,comp)
    if method == "lcs":
        return 1-lcss(target,comp)
    
def ranker(cube,target_country,target_date,method,top_n):
    target_identifier = target_country + ' ' + datetime.datetime.strftime(target_date,"%Y-%m-%d")
    identifier = []
    comp_values = []
    result =[]
    for i in cube.columns:
        for j in cube.index:
            if i != target_country:
                identifier.append(i + ' ' + datetime.datetime.strftime(j,"%Y-%m-%d"))
                comp_values.append(distfunc(cube[target_country][target_date],cube[i][j],method))

    for i in np.argsort(comp_values)[:top_n].tolist():
        result.append(identifier[i])
    
    return result

def firstRunOutput(cube,targetCountry,firstDate,lastDate,indicator,method,numberOfResults,minPopulation,startDate,endDate,continentCheck):
    cube_filtered = advancedfilters(cube,minPopulation,continentCheck,startDate,endDate)
    sliced = slicer(cube_filtered,indicator)
    sliced = time_series(sliced,(lastDate-firstDate).days)
    result = ranker(sliced,targetCountry,datetime.datetime(lastDate.year,lastDate.month,lastDate.day),method,numberOfResults)
    master_dict = dict()
    for i in result:
        vec = sliced[i.split()[0]][i.split()[1]]
        date = pd.date_range(end=i.split()[1],start=datetime.datetime.strptime(i.split()[1],"%Y-%m-%d") - datetime.timedelta(days=(lastDate-firstDate).days-1))
        dict_sample = dict(zip(date,vec))
        master_dict[i]=dict_sample
    return master_dict

In [13]:
###input sample from the front end###

targetCountry =  "Germany"
firstDate = datetime.date(2021,3,5)
lastDate =  datetime.date(2021,3,26) 
indicator =  "biweekly_cases_per_million"
method  = "dtw"

numberOfResults = 10
minPopulation = 500000 
startDate = datetime.date(2021,1,1) 
endDate = datetime.date(2021,8,1) 
continentCheck = ["Europe"]


In [25]:
cube = pd.read_csv('cube.csv',parse_dates=True)
cube["United Kingdom"].iloc[1]

"{'total_cases': 2662703.0, 'new_cases': 55157.0, 'total_deaths': 75137.0, 'new_deaths': 455.0, 'total_cases_per_million': 39223.151, 'new_cases_per_million': 812.494, 'total_deaths_per_million': 1106.811, 'new_deaths_per_million': 6.702, 'icu_patients': 2420.0, 'icu_patients_per_million': 35.648, 'hosp_patients': 29033.0, 'hosp_patients_per_million': 427.673, 'weekly_icu_admissions': 0.0, 'weekly_icu_admissions_per_million': 0.0, 'weekly_hosp_admissions': 23064.0, 'weekly_hosp_admissions_per_million': 339.746, 'new_tests': 416962.0, 'total_tests': 46247813.0, 'total_tests_per_thousand': 681.257, 'new_tests_per_thousand': 6.142, 'positive_rate': 0.132, 'tests_per_case': 7.6, 'tests_units': 'tests performed', 'total_vaccinations': 1380430.0, 'people_vaccinated': 1380430.0, 'people_fully_vaccinated': 0.0, 'new_vaccinations': 0.0, 'total_vaccinations_per_hundred': 2.03, 'people_vaccinated_per_hundred': 2.03, 'people_fully_vaccinated_per_hundred': 0.0, 'stringency_index': 79.63, 'weekly_ca

In [21]:
out = firstRunOutput(cube,targetCountry,firstDate,lastDate,indicator,method,numberOfResults,minPopulation,startDate,endDate,continentCheck)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [22]:
out

{'Bulgaria 2021-02-28': {Timestamp('2021-02-08 00:00:00', freq='D'): 1332.672,
  Timestamp('2021-02-09 00:00:00', freq='D'): 1388.0810000000001,
  Timestamp('2021-02-10 00:00:00', freq='D'): 1433.415,
  Timestamp('2021-02-11 00:00:00', freq='D'): 1506.8120000000001,
  Timestamp('2021-02-12 00:00:00', freq='D'): 1545.8139999999999,
  Timestamp('2021-02-13 00:00:00', freq='D'): 1568.409,
  Timestamp('2021-02-14 00:00:00', freq='D'): 1573.158,
  Timestamp('2021-02-15 00:00:00', freq='D'): 1607.122,
  Timestamp('2021-02-16 00:00:00', freq='D'): 1662.0980000000002,
  Timestamp('2021-02-17 00:00:00', freq='D'): 1737.943,
  Timestamp('2021-02-18 00:00:00', freq='D'): 1767.1580000000001,
  Timestamp('2021-02-19 00:00:00', freq='D'): 1846.7440000000001,
  Timestamp('2021-02-20 00:00:00', freq='D'): 1854.66,
  Timestamp('2021-02-21 00:00:00', freq='D'): 1861.1360000000002,
  Timestamp('2021-02-22 00:00:00', freq='D'): 1977.708,
  Timestamp('2021-02-23 00:00:00', freq='D'): 2062.331,
  Timestamp(