In [91]:
"""
This is a boilerplate pipeline 'data_processing'
generated using Kedro 0.18.3
"""
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

"""
UTILITY FUNCTIONS
"""

'\nUTILITY FUNCTIONS\n'

In [92]:
"""
UTILITY FUNCTIONS
"""
# This function calculates the population for each grunnkrets
# Returns a df with grunnkretsID in the first column and population_count in the second column


def population(dataset_age):
    age_df = dataset_age[(dataset_age["year"] == 2016)]
    population = age_df.drop(["grunnkrets_id", "year"], axis=1).sum(axis=1)
    age_df["population_count"] = population
    return age_df[["grunnkrets_id", "population_count"]]

# This function calculates the population in a district or municipality, by setting grouping_elemnt either to the district_name or municipality_name


def population_grouped(data_age, data_geography, grouping_element):
    age_df = population(data_age)
    geography_df = data_geography[data_geography["year"] == 2016]
    population_df = age_df.merge(geography_df, how="left", on="grunnkrets_id")
    grouped_df = population_df.groupby([grouping_element], as_index=False)[
        "population_count"].sum()
    return grouped_df

# This function calculates the density (population/area_km2) for the chosen grouping_element


def population_density(age_df, geo_df, grouping_element):
    age_data = population(age_df)
    geo_df = geo_df[geo_df["year"] == 2016]
    combined_df = age_data.merge(geo_df, how="left", on="grunnkrets_id")
    density_df = combined_df.groupby([grouping_element], as_index=False)[
        ["population_count", "area_km2"]].sum()
    density_df["density"] = density_df["population_count"] / \
        density_df["area_km2"]
    return density_df

# This function checks wether or not a store is part of a mall or not


def is_mall(stores_df):
    df = stores_df.copy()
    df["is_mall"] = df["mall_name"].notna()
    return df[["store_id", "mall_name", "is_mall"]]

# This function checks wether or not a store is part of a chain or not


def is_chain(stores_df):
    df = stores_df.copy()
    df["is_chain"] = df["chain_name"].notna()
    return df[["store_id", "chain_name", "is_chain"]]

# This function calculates the population count per number of stores in a geographic region


def population_per_store(age_df, geo_df, stores_df, grouping_element):
    new_geo_df = geo_df[geo_df["year"] == 2016]
    pop_gk = population(age_df)
    pop_df = population_grouped(age_df, geo_df, grouping_element)
    combined_df = pop_gk.merge(stores_df, how="left", on="grunnkrets_id").merge(
        new_geo_df, how="left", on="grunnkrets_id")
    grouped_df = combined_df.groupby([grouping_element], as_index=False)[
        "store_id"].count()
    pop_per_store_df = grouped_df.merge(
        pop_df, how="inner", on=grouping_element)
    pop_per_store_df["population_per_num_stores"] = pop_per_store_df["population_count"] / \
        pop_per_store_df["store_id"]
    pop_per_store_df.rename(columns={"store_id": "num_stores"}, inplace=True)
    new_pop_per_store_df = pop_per_store_df.replace([np.inf, -np.inf], 0)
    return new_pop_per_store_df

#This function do the same as population_per_store but can also filter on store types
def population_per_store_types(stores_df, plaace_hierarchy, grunnkrets_df, age_df, agg_name, geo_group, store_type_group): 
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]
    num_stores_types_by_geo_group = store_types_count_by_geo_group(stores_df, plaace_hierarchy, grunnkrets_df_2016, agg_name, geo_group, store_type_group)
    pop_grouped_by_geo = population_grouped(age_df, grunnkrets_df_2016, geo_group)
    combined_df = num_stores_types_by_geo_group.merge(pop_grouped_by_geo, how = "left", on = geo_group)
    combined_df["population_per_num_store"] = combined_df["population_count"] / combined_df[agg_name]
    return combined_df

# This function groups the age distrubution (0-90) into 7 buckets with and returns a table which represents the presentages each of these
# buckets corresponds to compared with the total amount of people living in the given geographic region s


def age_distrubution(grunnkrets_age_df, geographic_df, grouping_element):
    age_df = grunnkrets_age_df[grunnkrets_age_df["year"] == 2016]
    age_df1 = age_df.drop(["year"], axis=1)
    age_df1["num_kids"] = age_df1.iloc[:, 1:8].sum(axis=1)
    age_df1["num_kids+"] = age_df1.iloc[:, 8:14].sum(axis=1)
    age_df1["num_youths"] = age_df1.iloc[:, 14: 19].sum(axis=1)
    age_df1["num_youthAdult"] = age_df1.iloc[:, 19:27].sum(axis=1)
    age_df1["num_adult"] = age_df1.iloc[:, 27:37].sum(axis=1)
    age_df1["num_adults+"] = age_df1.iloc[:, 37:62].sum(axis=1)
    age_df1["num_pensinors"] = age_df1.iloc[:, 62:92].sum(axis=1)

    age_df2 = age_df1[["grunnkrets_id", "num_kids", "num_kids+", "num_youths",
                       "num_youthAdult", "num_adult", "num_adults+", "num_pensinors"]]

    pop_df = population(grunnkrets_age_df)
    geo_df = geographic_df[geographic_df["year"] == 2016]
    new_geo_df = geo_df.drop(["geometry", "area_km2", "year"], axis=1)
    combined_df = age_df2.merge(pop_df, how="inner", on="grunnkrets_id").merge(
        new_geo_df, how="inner", on="grunnkrets_id")
    list_columns = ["num_kids", "num_kids+", "num_youths",
                    "num_youthAdult", "num_adult", "num_adults+", "num_pensinors"]
    combined_df2 = combined_df.groupby([grouping_element], as_index=False)[
        list_columns].sum()

    pop_gk = population_grouped(
        grunnkrets_age_df, geographic_df, grouping_element)
    new_df = combined_df2.merge(pop_gk, how="inner", on=grouping_element)

    new_df["kids_%"] = new_df["num_kids"] / new_df["population_count"]
    new_df["kids+_%"] = new_df["num_kids+"] / new_df["population_count"]
    new_df["youths_%"] = new_df["num_youths"] / new_df["population_count"]
    new_df["youthAdult_%"] = new_df["num_youthAdult"] / \
        new_df["population_count"]
    new_df["adult_%"] = new_df["num_adult"] / new_df["population_count"]
    new_df["adults+_%"] = new_df["num_adults+"] / new_df["population_count"]
    new_df["pensinors_%"] = new_df["num_pensinors"] / \
        new_df["population_count"]

    age_dist_df = new_df.drop(["population_count"], axis=1)
    # if (grouping_element == "grunnkrets_id"):
    # return new_df[["grunnkrets_id", "kids_%", "kids+_%", "youths_%", "youthAdult_%", "adult_%", "adults+_%", "pensinors_%" ]]
    # else:
    # return new_df[[grouping_element, "kids_%", "kids+_%", "youths_%", "youthAdult_%", "adult_%", "adults+_%", "pensinors_%" ]]

    return age_dist_df

# This function calculates the total amount of household types based on a geographic area


def household_type_distrubution(geographic_df, household_df, grouping_element):
    house_df = household_df[household_df["year"] == 2016]
    geo_df = geographic_df[geographic_df["year"] == 2016]
    combined_df = geo_df.merge(house_df, how="inner", on="grunnkrets_id")

    list_columns = ["couple_children_0_to_5_years", "couple_children_18_or_above", "couple_children_6_to_17_years",
                    "couple_without_children", "single_parent_children_0_to_5_years", "single_parent_children_18_or_above",
                    "single_parent_children_6_to_17_years", "singles"]

    grouped_df = combined_df.groupby([grouping_element], as_index=False)[
        list_columns].sum()
    grouped_df["tot_pop_count"] = grouped_df.iloc[:, 1:].sum(axis=1)

    grouped_df["%_dist_of_couple_children_0_to_5_years"] = grouped_df["couple_children_0_to_5_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_couple_children_18_or_above"] = grouped_df["couple_children_18_or_above"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_couple_children_6_to_17_years"] = grouped_df["couple_children_6_to_17_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_couple_without_children"] = grouped_df["couple_without_children"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_single_parent_children_0_to_5_years"] = grouped_df["single_parent_children_0_to_5_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_single_parent_children_18_or_above"] = grouped_df["single_parent_children_18_or_above"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_single_parent_children_6_to_17_years"] = grouped_df["single_parent_children_6_to_17_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_singles"] = grouped_df["singles"] / \
        grouped_df["tot_pop_count"]

    returned_df = grouped_df.drop(["tot_pop_count"], axis=1)
    return returned_df


# Simens functions
def average_revenue_of_chain(dataset_stores):
    "Average revenue of chains in datasett"
    dataset_stores = dataset_stores[(dataset_stores["year"] == 2016)]
    return dataset_stores.groupby(['chain_name'])['revenue'].mean()


def average_revenue_of_mall(dataset_stores):
    "Average revenue of malls in dataset"
    dataset_stores = dataset_stores[(dataset_stores["year"] == 2016)]
    return dataset_stores.groupby(['mall_name'])['revenue'].mean()


def mean_income_per_capita(dataset_age, dataset_income):
    "mean income per capita per grunnkrets"
    age_df = population(dataset_age)
    income_df = dataset_income[dataset_income["year"] == 2016]
    age_and_income_df = age_df.merge(income_df, how='left', on='grunnkrets_id')
    mean_income = age_and_income_df.drop(['year', 'singles', 'couple_without_children',
                                         'couple_with_children', 'other_households', 'single_parent_with_children'], axis=1)
    mean_income['mean_income'] = mean_income['all_households'] / \
        mean_income['population_count']
    mean_income = mean_income.drop(['all_households'], axis=1)

    return mean_income


def mean_income_per_capita_grouped(dataset_age, dataset_income, dataset_geography, grouping_element):
    # gets data from mean_income_per_capita functino
    data_mean_income = mean_income_per_capita(dataset_age, dataset_income)
    # gets data from geography set and makes sure we only use data for 2016
    geography_df = dataset_geography[dataset_geography["year"] == 2016]
    # gets the data of mean income with the geography data
    mean_income_geo_df = data_mean_income.merge(
        geography_df, how='left', on='grunnkrets_id')
    # sum the number of people based on grouping element
    grouped_population_df = mean_income_geo_df.groupby(
        [grouping_element], as_index=False)["population_count"].sum()
    # merge this with the grunnkrets to see both total population per selected area and grunnkrets
    total_grouped_df = mean_income_geo_df.merge(
        grouped_population_df, how='left', on=grouping_element)
    portion_income_df = total_grouped_df
    # find ration of grunnkrets to total population and multiply this with grunnkrets mean income
    portion_income_df['mean_income'] = total_grouped_df['mean_income'] * \
        total_grouped_df['population_count_x'] / \
        total_grouped_df['population_count_y']
    # add these incomes together, should add up to the total mean income for the selected area
    grouped_income_df = portion_income_df.groupby(
        [grouping_element], as_index=False)["mean_income"].sum()
    return grouped_income_df

# def stores_density_per_location_by_type(stores_df, plaace_df, grunnkrets_df, geo="district_name", lv_desc="lv1_desc"):
#     """
#     Density of stores of the same type in a geographic location.

#     This depends on population
#     """
#     number_of_stores = store_types_count_by_geo_group(
#         stores_df, plaace_df, grunnkrets_df, geo=geo, lv_desc=lv_desc)['count']
#     population = 0
#     return number_of_stores / population

def stores_in_radius(stores_df, plaace_df, radius=0.1, store_type_group=None):
    """
    Number of stores within a given radius. Can also indicate category to filter.
    """
    mat = cdist(stores_df[['lat', 'lon']],
                stores_df[['lat', 'lon']], metric='euclidean')
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=stores_df['store_id'])

    if store_type_group is None:
        count = new_df[(new_df < radius) & (new_df > 0)].count(axis=1)
        return count.to_frame(name="count")

    else:
        combined_df = stores_df.merge(
            plaace_df, how="inner", on="plaace_hierarchy_id")
        test_df = new_df[(new_df < radius) & (new_df > 0)]
        store_count = {}

        for index, row in test_df.iterrows():
            nearby_stores = row.dropna().index.values
            index_type = combined_df[combined_df['store_id']
                                     == index][store_type_group].values[0]
            number_same = combined_df[(combined_df['store_id'].isin(nearby_stores)) & (
                combined_df[store_type_group] == index_type)]['store_id'].count()
            store_count[index] = number_same

        df = pd.DataFrame.from_dict(store_count, orient='index', columns=['count'])
        df.index.rename('store_id', inplace=True)
        return df

def store_types_count_by_geo_group(stores_df, plaace_df, grunnkrets_df, agg_name, geo_group="district_name", store_type_group="lv1_desc"):
    """
    Number of stores of the same type in a geographic location.
    """
    combined_df = stores_df.merge(plaace_df, how="inner", on="plaace_hierarchy_id").merge(
        grunnkrets_df, how="inner", on="grunnkrets_id")
    return combined_df.groupby(by=[geo_group, store_type_group])['store_id'].count().reset_index(name=agg_name)


def store_types_revenue_by_geo_group(stores_df, plaace_df, grunnkrets_df, agg_name, geo_group="district_name", store_type_group="lv1_desc"):
    """
    Total revenue of stores of the same type in a geographic location.
    """
    combined_df = stores_df.merge(plaace_df, how="inner", on="plaace_hierarchy_id").merge(
        grunnkrets_df, how="inner", on="grunnkrets_id")
    return combined_df.groupby(by=[geo_group, store_type_group])['revenue'].sum().reset_index(name=agg_name)

def store_types_all_count_by_geo_groups(stores_df, plaace_df, grunnkrets_df, store_types, geo_groups):
    merged_df = stores_df.merge(grunnkrets_df, how="left", on="grunnkrets_id").merge(plaace_df, how="left", on="plaace_hierarchy_id")
    
    df_list = []
    for geo_group in geo_groups:
        for store_type in store_types:
            df = store_types_count_by_geo_group(stores_df, plaace_df, grunnkrets_df, geo_group=geo_group, agg_name=f"{geo_group}_{store_type}", store_type_group=store_type)
            df_list.append(merged_df.merge(df, how="left", on=[geo_group, store_type])[['store_id', f"{geo_group}_{store_type}"]])
    
    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

def store_types_all_revenue_by_geo_groups(stores_df, plaace_df, grunnkrets_df, store_types, geo_groups):
    merged_df = stores_df.merge(grunnkrets_df, how="left", on="grunnkrets_id").merge(plaace_df, how="left", on="plaace_hierarchy_id")
    
    df_list = []
    for geo_group in geo_groups:
        for store_type in store_types:
            df = store_types_revenue_by_geo_group(stores_df, plaace_df, grunnkrets_df, geo_group=geo_group, agg_name=f"{geo_group}_{store_type}", store_type_group=store_type)
            df_list.append(merged_df.merge(df, how="left", on=[geo_group, store_type])[['store_id', f"{geo_group}_{store_type}"]])
    
    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

def stores_in_radius_by_type(stores_df, plaace_df, store_types, radius=0.1):
    df_list = []
    df_list.append(stores_in_radius(stores_df, plaace_df, radius=radius).rename(columns={'count':'number_of_all_stores'})) # All stores in radius
    
    for store_type in store_types:
        df = stores_in_radius(stores_df, plaace_df, store_type_group=store_type, radius=radius)
        df.rename(columns={'count': f'number_of_{store_type}'}, inplace=True)
        df_list.append(df)
    
    return pd.concat(df_list, axis=1)

def bus_stops_lat_lon(bus_stops_df):
    """
    Extract latitude and longitude as separate columns.
    """
    bus_stops_df['lng_lat'] = bus_stops_df['geometry'].str.extract(
        r'\((.*?)\)')
    bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
        " ", 1, expand=True)
    bus_stops_df[['lon', 'lat']] = bus_stops_df[[
        'lon', 'lat']].apply(pd.to_numeric)
    return bus_stops_df[['busstop_id', 'stopplace_type', 'importance_level', 'side_placement', 'geometry', 'lat', 'lon']]

def bus_stops_closest(stores_df, bus_stops_df, importance_level="Regionalt knutepunkt"):
    """
    Id and distance of the closest bus stop to all stores.
    """
    bus_stops_df = bus_stops_df[bus_stops_df['importance_level'] == importance_level]
    mat = cdist(stores_df[['lat', 'lon']],
                bus_stops_df[['lat', 'lon']], metric='euclidean')

    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=bus_stops_df['busstop_id'])

    stores = stores_df.store_id
    closest = new_df.idxmin(axis=1)
    distance = new_df.min(axis=1)

    return pd.DataFrame({'store_id': stores.values, 'closest_bus_stop': closest.values, 'distance': distance.values})

def bus_stops_in_radius(stores_df, bus_stops_df, radius=0.1, importance_level=None):
    """
    Number of bus stops within a given radius. The importance level of bus stops can be specified.
    """
    if importance_level is not None:
        bus_stops_df = bus_stops_df[bus_stops_df['importance_level'] == importance_level]

    mat = cdist(stores_df[['lat', 'lon']],
                bus_stops_df[['lat', 'lon']], metric='euclidean')
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=bus_stops_df['busstop_id'])
    count = pd.DataFrame(new_df[new_df < radius].count(axis=1)).reset_index()
    count.rename(columns={0: 'count'}, inplace=True)
    return count

# Relevant feature engineering functions.
def bus_stops_distance_by_importance(stores_df, bus_stops_df, stop_importance_levels):
    """
    Distance for each store to the closest bus stop of each importance_level
    """
    df_list = []
    for importance_level in stop_importance_levels:
        importance_level_cleaned = importance_level.lower().replace(" ", "_")
        df = bus_stops_closest(stores_df, bus_stops_df, importance_level=importance_level)
        df.rename(columns={'distance': f'distance_to_{importance_level_cleaned}'}, inplace=True)
        df_list.append(df[['store_id', f'distance_to_{importance_level_cleaned}']])

    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

def bus_stops_in_radius_by_importance(stores_df, bus_stops_df, stop_importance_levels, radius=0.01):
    """
    Number of bus stops in radius of store for each importance level.
    """
    df_list = []
    df_list.append(bus_stops_in_radius(stores_df, bus_stops_df, radius=radius).rename(columns={'count':'number_of_all_stop_types'})) # All bus stops in radius
    
    for importance_level in stop_importance_levels:
        importance_level_cleaned = importance_level.lower().replace(" ", "_")
        df = bus_stops_in_radius(stores_df, bus_stops_df, importance_level=importance_level, radius=radius)
        df.rename(columns={'count': f'number_of_{importance_level_cleaned}'}, inplace=True)
        df_list.append(df[['store_id', f'number_of_{importance_level_cleaned}']])

    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

In [93]:
#Reading the datasets 
busstops = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/busstops_norway.csv')
grunnkrets_age = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_age_distribution.csv')
grunnkrets_household = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_income_households.csv')
grunnkrets_norway = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/plaace_hierarchy.csv')
sample_submission = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/sample_submission.csv')
stores_extra = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/stores_extra.csv')
stores_test = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/stores_test.csv')
stores_train = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/stores_train.csv')
simens_df = pd.read_csv("C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/notebooks/simen/simens_dataframe-1.csv")

### New functions based on the previus feature_functions that returns store_id as index with all different geo_groups (possibly store_types when appropriate) 

In [94]:
def age_dist_by_geo_group(stores_df, age_df, grunnkrets_df): 
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]  
    combined_df = stores_df.merge(grunnkrets_df_2016, how = "left", on = "grunnkrets_id")

    age_columns = ['num_kids', 'num_kids+', 'num_youths', 'num_youthAdult', 'num_adult',
       'num_adults+', 'num_pensinors', 'kids_%', 'kids+_%', 'youths_%',
       'youthAdult_%', 'adult_%', 'adults+_%', 'pensinors_%']

    df_list = []
    geo_groups = [ "grunnkrets_name", "district_name", "municipality_name"]
    for geo_group in geo_groups: 
      age_dist_df = age_distrubution(age_df, grunnkrets_df, geo_group)
      merged_df = combined_df.merge(age_dist_df, how = "left", on = geo_group)[["store_id"] + age_columns]
      merged_df.set_index("store_id", inplace = True)
      merged_df2 = merged_df.add_prefix(f'{geo_group}_')
      df_list.append(merged_df2)
    
    return pd.concat(df_list, axis = 1)
    
df = age_dist_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [95]:
def household_dist_by_geo_group(stores_df, household_df, grunnkrets_df):
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]  
    combined_df = stores_df.merge(grunnkrets_df_2016, how = "left", on = "grunnkrets_id")
    
    household_colmns = ['couple_children_0_to_5_years', 'couple_children_18_or_above', 'couple_children_6_to_17_years', 'couple_without_children',
       'single_parent_children_0_to_5_years','single_parent_children_18_or_above','single_parent_children_6_to_17_years', 'singles',
       '%_dist_of_couple_children_0_to_5_years','%_dist_of_couple_children_18_or_above','%_dist_of_couple_children_6_to_17_years',
       '%_dist_of_couple_without_children','%_dist_of_single_parent_children_0_to_5_years','%_dist_of_single_parent_children_18_or_above',
       '%_dist_of_single_parent_children_6_to_17_years', '%_dist_of_singles']
       
    df_list = []
    geo_groups = ["grunnkrets_name", "district_name", "municipality_name"]

    for geo_group in geo_groups: 
        household_type_df = household_type_distrubution(grunnkrets_df_2016, household_df, geo_group)
        merged_df = combined_df.merge(household_type_df, how = "left", on = geo_group)[["store_id"] + household_colmns]
        merged_df.set_index("store_id", inplace = True)
        merged_df2 = merged_df.add_prefix(f'{geo_group}_')
        df_list.append(merged_df2)
    return pd.concat(df_list, axis = 1)

#household_dist_by_geo_group(stores_train, grunnkrets_household, grunnkrets_norway)


In [96]:
def population_count_grouped_by_geo_group(stores_df, age_df, grunnkrets_df): 
     grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]  
     combined_df = stores_df.merge(grunnkrets_df_2016, how = "left", on = "grunnkrets_id")

     population_columns = ["population_count"]
     df_list = []
     geo_groups = [ "grunnkrets_id", "district_name", "municipality_name"]

     for geo_group in geo_groups: 
          pop_df = population_grouped(age_df, grunnkrets_df, geo_group)
          merged_df = combined_df.merge(pop_df, how = "left", on = geo_group)[["store_id"] + population_columns]
          merged_df.set_index("store_id", inplace = True)
          merged_df2 = merged_df.add_prefix(f'{geo_group}_')
          df_list.append(merged_df2)

     return pd.concat(df_list, axis = 1)

df = population_count_grouped_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)

     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


In [97]:
def population_density_grouped_by_geo_group(stores_df, age_df, grunnkrets_df):
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]  
    combined_df = stores_df.merge(grunnkrets_df_2016, how = "left", on = "grunnkrets_id")

    pop_density_columns = ["density"]
    df_list = []
    geo_groups = ["grunnkrets_name", "district_name", "municipality_name"]

    for geo_group in geo_groups: 
        pop_df = population_density(age_df, grunnkrets_df, geo_group)
        merged_df = combined_df.merge(pop_df, how = "left", on = geo_group)[["store_id"] + pop_density_columns]
        merged_df.set_index("store_id", inplace = True)
        merged_df2 = merged_df.add_prefix(f'{geo_group}_')
        df_list.append(merged_df2)

    return pd.concat(df_list, axis = 1)

#population_density_grouped_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)


In [98]:
def population_per_store_grouped_by_geo_group(stores_df, age_df, grunnkrets_df): 
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]  
    combined_df = stores_df.merge(grunnkrets_df_2016, how = "left", on = "grunnkrets_id")

    columns = ["population_per_num_stores"]
    df_list = []
    geo_groups = ["grunnkrets_id", "district_name", "municipality_name"]

    for geo_group in geo_groups: 
        pop_df = population_per_store(age_df, grunnkrets_df, stores_df, geo_group)
        merged_df = combined_df.merge(pop_df, how = "left", on = geo_group)[["store_id"] + columns]
        merged_df.set_index("store_id", inplace = True)
        merged_df2 = merged_df.add_prefix(f'{geo_group}_')
        df_list.append(merged_df2)

    return pd.concat(df_list, axis = 1)

#population_per_store_grouped_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)


In [99]:
def population_per_store_type_grouped_by_geo_groups(stores_df, plaace_df, grunnkrets_df, age_df, geo_groups, store_types, agg_string):
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]
    num_stores_type_by_geo_group = store_types_all_count_by_geo_groups(stores_df, plaace_df, grunnkrets_df_2016, store_types=store_types, geo_groups=geo_groups)
    pop_count_by_geo_group = population_count_grouped_by_geo_group(stores_df, age_df, grunnkrets_df_2016)
    combined_df = num_stores_type_by_geo_group.merge(pop_count_by_geo_group, how ="left", on = "store_id")

    for geo_group in geo_groups: 
        for store_type in store_types: 
            combined_df[f'{geo_group}_{store_type}_' + agg_string] = combined_df[f'{geo_group}_population_count'] /combined_df[f'{geo_group}_{store_type}']

    return combined_df.loc[:, (f'{geo_groups[0]}_{store_types[0]}_' + agg_string) : ]

store =["lv1_desc", "lv2_desc", "lv3_desc", "lv4_desc"]
geo= ["grunnkrets_id","district_name", "municipality_name"]

#population_per_store_type_grouped_by_geo_groups(stores_train, plaace_hierarchy, grunnkrets_norway, grunnkrets_age, geo_groups=geo, store_types=store, agg_string="pop_per_num_store")   

In [100]:
def is_mall_only(stores_df): 
    df = is_mall(stores_df).drop(["mall_name"], axis = 1)
    df.set_index("store_id", inplace=True)
    return df

#is_mall_only(stores_train)

In [101]:
def is_chain_only(stores_df): 
    df = is_chain(stores_df).drop(["chain_name"], axis = 1)
    df.set_index("store_id", inplace=True)
    return df

#is_chain_only(stores_train)

In [102]:
df = simens_df.set_index("store_id")
income_df= df.drop(["Unnamed: 0"], axis = 1)
income_df 

id_and_revenue_df= stores_train[["store_id", "revenue"]]
id_and_revenue_df

Unnamed: 0,store_id,revenue
0,983540538-974187930-44774,17.998
1,987074191-973117734-44755,23.828
2,984890265-981157303-64491,16.099
3,914057442-992924179-126912,9.296
4,913018583-913063538-668469,4.528
...,...,...
12854,915789943-915806929-781991,0.088
12855,917921733-917982368-868081,1.816
12856,911721961-911764474-496764,38.225
12857,914337046-914343372-721294,3.642


# Merging of features into one dataset

In [76]:
store =["lv1_desc", "lv2_desc", "lv3_desc", "lv4_desc"]
geo= ["grunnkrets_id", "district_name", "municipality_name"]
importance_levels = ["Mangler viktighetsnivå", "Standard holdeplass", "Lokalt knutepunkt","Regionalt knutepunkt", "Annen viktig holdeplass", "Nasjonalt knutepunkt"]
grunnkrets_df_2016 = grunnkrets_norway[grunnkrets_norway["year"] == 2016]
bus_df = bus_stops_lat_lon(busstops)
stores_all = [stores_train, stores_extra]
stores_all_df = pd.concat(stores_all)


pop_count_df = population_count_grouped_by_geo_group(stores_all_df, grunnkrets_age, grunnkrets_norway)
age_dist_df = age_dist_by_geo_group(stores_all_df, grunnkrets_age, grunnkrets_norway)
house_hold_dist = household_dist_by_geo_group(stores_all_df, grunnkrets_household, grunnkrets_norway)
pop_per_store_type = population_per_store_type_grouped_by_geo_groups(stores_all_df, plaace_hierarchy, grunnkrets_norway, grunnkrets_age, geo_groups=geo, store_types=store, agg_string="pop_per_num_stores")
pop_density = population_density_grouped_by_geo_group(stores_all_df, grunnkrets_age, grunnkrets_norway)
is_mall_df = is_mall_only(stores_all_df)
is_chain_df = is_chain_only(stores_all_df)
store_types_count = store_types_all_count_by_geo_groups(stores_all_df, plaace_hierarchy, grunnkrets_df_2016, store_types=store, geo_groups=geo)
#store_types_revenue = store_types_all_revenue_by_geo_groups(stores_all_df, plaace_hierarchy, grunnkrets_df_2016, store_types=store, geo_groups=geo)
store_radius = stores_in_radius_by_type(stores_all_df, plaace_hierarchy, store_types=store, radius = 0.1)
busstop_distance = bus_stops_distance_by_importance(stores_all_df, bus_df, stop_importance_levels = importance_levels)
busstop_radius = bus_stops_in_radius_by_importance(stores_all_df, bus_df, stop_importance_levels=importance_levels, radius = 0.1)

df = (pop_count_df.merge(age_dist_df, how = "left", on = "store_id")
    .merge(house_hold_dist, how = "left", on ="store_id")
    .merge(pop_per_store_type, how = "left", on = "store_id")
    .merge(pop_density, how = "left", on  = "store_id")
    .merge(is_mall_df, how = "left", on = "store_id")
    .merge(is_chain_df, how = "left", on = "store_id")
    #.merge(income_df, how = "left", on = "store_id")
    .merge(store_types_count, how ="left", on = "store_id")
    #.merge(store_types_revenue, how = "left", on = "store_id")
    .merge(id_and_revenue_df, how = "left", on = "store_id")
    .merge(store_radius, how = "left", on = "store_id")
    .merge(busstop_distance,how = "left", on = "store_id")
    .merge(busstop_radius, how ="left", on ="store_id" )
)

#df.to_csv("dataset_train_2.csv")




    

MemoryError: Unable to allocate 6.55 GiB for an array with shape (12859, 68395) and data type float64

### Merging of stores_test

In [None]:
store =["lv1_desc", "lv2_desc", "lv3_desc", "lv4_desc"]
geo= ["grunnkrets_id", "district_name", "municipality_name"]
importance_levels = ["Mangler viktighetsnivå", "Standard holdeplass", "Lokalt knutepunkt","Regionalt knutepunkt", "Annen viktig holdeplass", "Nasjonalt knutepunkt"]
grunnkrets_df_2016 = grunnkrets_norway[grunnkrets_norway["year"] == 2016]
bus_df = bus_stops_lat_lon(busstops)

pop_count_df = population_count_grouped_by_geo_group(stores_test, grunnkrets_age, grunnkrets_norway)
age_dist_df = age_dist_by_geo_group(stores_test, grunnkrets_age, grunnkrets_norway)
house_hold_dist = household_dist_by_geo_group(stores_test, grunnkrets_household, grunnkrets_norway)
pop_per_store_type = population_per_store_type_grouped_by_geo_groups(stores_test, plaace_hierarchy, grunnkrets_norway, grunnkrets_age, geo_groups=geo, store_types=store, agg_string="pop_per_num_stores")
pop_density = population_density_grouped_by_geo_group(stores_test, grunnkrets_age, grunnkrets_norway)
is_mall_df = is_mall_only(stores_test)
is_chain_df = is_chain_only(stores_test)
store_types_count = store_types_all_count_by_geo_groups(stores_test, plaace_hierarchy, grunnkrets_df_2016, store_types=store, geo_groups=geo)
#store_types_revenue = store_types_all_revenue_by_geo_groups(stores_test, plaace_hierarchy, grunnkrets_df_2016, store_types=store, geo_groups=geo)
store_radius = stores_in_radius_by_type(stores_test, plaace_hierarchy, store_types=store, radius = 0.1)
busstop_distance = bus_stops_distance_by_importance(stores_test, bus_df, stop_importance_levels = importance_levels)
busstop_radius = bus_stops_in_radius_by_importance(stores_test, bus_df, stop_importance_levels=importance_levels, radius = 0.1)

df = (pop_count_df.merge(age_dist_df, how = "left", on = "store_id")
    .merge(house_hold_dist, how = "left", on ="store_id")
    .merge(pop_per_store_type, how = "left", on = "store_id")
    .merge(pop_density, how = "left", on  = "store_id")
    .merge(is_mall_df, how = "left", on = "store_id")
    .merge(is_chain_df, how = "left", on = "store_id")
    #.merge(income_df, how = "left", on = "store_id")
    .merge(store_types_count, how ="left", on = "store_id")
    #.merge(store_types_revenue, how = "left", on = "store_id")
    #.merge(id_and_revenue_df, how = "left", on = "store_id")
    .merge(store_radius, how = "left", on = "store_id")
    .merge(busstop_distance,how = "left", on = "store_id")
    .merge(busstop_radius, how ="left", on ="store_id" )
)

#df.to_csv("dataset_test_1.csv")



In [126]:

geo_df = grunnkrets_norway[grunnkrets_norway["year"] == 2016]
household_df = grunnkrets_household[grunnkrets_household["year"] == 2016]


merged_df1 = stores_train.merge(geo_df, how = "left", on = "grunnkrets_id")
merged_df2 =  stores_train.merge(household_df, how = "left", on = "grunnkrets_id")

list_df = ["revenue", "year_x", "year_y", "store_name", "plaace_hierarchy_id", "sales_channel_name", "address", "chain_name", "mall_name"]
combined_df = merged_df1.drop(list_df, axis = 1)


test_df = combined_df[combined_df["district_name"].isna()]
test_df2 = merged_df2[merged_df2["singles"].isna()]



split_df = combined_df[combined_df["grunnkrets_name"].notna()]

mat = cdist(test_df[['lat', 'lon']],
                split_df[['lat', 'lon']], metric='euclidean')


new_df = pd.DataFrame(mat, index=test_df['grunnkrets_id'], columns=split_df['grunnkrets_id'])
new_df

grunnkrets_id = test_df.grunnkrets_id
closest = new_df.idxmin(axis=1)
distance = new_df.min(axis=1)

closest_df = pd.DataFrame({"grunnkrets_id": grunnkrets_id, "closest_valid_id" : closest.values, "distance": distance.values})
closest_df2 = pd.DataFrame({"grunnkrets_id": grunnkrets_id, "closest_valid_id" : closest.values})

merged_df1


Unnamed: 0,store_id,year_x,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,mall_name,revenue,year_y,grunnkrets_name,district_name,municipality_name,geometry,area_km2
0,983540538-974187930-44774,2016,MCDONALD'S BRAGERNES TORG MAGASINET,1.1.1.0,Hamburger restaurants,6020303,BRAGERNES TORG 13,59.743104,10.204928,MCDONALDS,Magasinet Drammen,17.998,2016.0,Bragernes sentrum 3,Bragernes sentrum,Drammen,"POLYGON((10.2046156903846 59.7447808519649, 10...",0.155779
1,987074191-973117734-44755,2016,MCDONALD'S KLINGENBERGGATA,1.1.1.0,Hamburger restaurants,3010306,,59.913759,10.734031,MCDONALDS,,23.828,2016.0,Sentrum 3 /rode 6,Sentrum 3,Oslo,"POLYGON((10.7303654475615 59.9107195782207, 10...",0.264278
2,984890265-981157303-64491,2016,BURGER KING HØNEFOSS,1.1.1.0,Hamburger restaurants,6050102,KONG RINGS GATE 1,60.164751,10.254656,BURGER KING,Kuben Hønefoss,16.099,2016.0,Sydsiden 2,Hønefoss,Ringerike,"POLYGON((10.2654039198422 60.1639238060368, 10...",0.160152
3,914057442-992924179-126912,2016,BURGER KING GLASSHUSPASSASJEN,1.1.1.0,Hamburger restaurants,18040102,STORGATA 12,67.283669,14.379796,BURGER KING,Glasshuspassasjen,9.296,2016.0,Sentrum 1,Sentrum,Bodø,"POLYGON((14.3800126797167 67.2852351710009, 14...",0.095029
4,913018583-913063538-668469,2016,BURGER KING TILLERTORGET,1.1.1.0,Hamburger restaurants,16017414,,63.358068,10.374832,BURGER KING,Tillertorget,4.528,2016.0,Tiller-Hårstad 14,Tiller-Hårstad,Trondheim,"POLYGON((10.3709720705149 63.3579302939404, 10...",0.251070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854,915789943-915806929-781991,2016,MEIERIGÅRDEN BRYGGERIUTSALG,2.8.11.2,Beer and soda shop,7010705,THUEGATA 2,59.416276,10.480970,,,0.088,2016.0,Sentrum,Sentrum,Horten,"POLYGON((10.4844343691104 59.4185964815712, 10...",0.291337
12855,917921733-917982368-868081,2016,GULATING ØLUTSALG CC GJØVIK,2.8.11.2,Beer and soda shop,5020406,,60.799991,10.693635,GULATING GRUPPEN,CC Gjøvik,1.816,2016.0,Sentrum 6,Sentrum,Gjøvik,"POLYGON((10.6971768573538 60.7988198867219, 10...",0.137188
12856,911721961-911764474-496764,2016,GULATING ØLUTSALG STRØMMEN,2.8.11.2,Beer and soda shop,2310803,STØPERIVEIEN 6,59.946562,11.007659,GULATING GRUPPEN,Strømmen Storsenter,38.225,2016.0,Stalsberg 3,Stalsberg,Skedsmo,"POLYGON((11.0040997448376 59.9483583629928, 11...",0.123431
12857,914337046-914343372-721294,2016,DET GODE BRYGG,2.8.11.2,Beer and soda shop,11020113,VÅGSGATA 16,58.850261,5.735674,,Bystasjonen,3.642,2016.0,Vågsgate,Sentrum,Sandnes,"POLYGON((5.7342552469665 58.8492193600012, 5.7...",0.034857


In [None]:

grouped_df = split_df[split_df["grunnkrets_id"].isin(closest.values)]

grouped_df2 = grouped_df.drop_duplicates(subset = ["grunnkrets_id"])

gk_df = test_df[["store_id", "grunnkrets_id"]]

gk_df_new = gk_df.merge(closest_df2, how = "left", on = "grunnkrets_id")
gk_df_new2 = gk_df_new.drop_duplicates(subset = ["store_id"], ignore_index = True)

gk_with_valid_id = gk_df_new2.merge(grouped_df2, how = "inner", left_on="closest_valid_id", right_on = "grunnkrets_id")

completed_df = gk_with_valid_id.drop(["store_id_y", "grunnkrets_id_y"], axis = 1)

finally_completed_df = completed_df.rename(columns = {"store_id_x": "store_id", "grunnkrets_id_x": "grunnkrets_id"})

finally_completed_df 



Unnamed: 0,store_id,grunnkrets_id,closest_valid_id,lat,lon,grunnkrets_name,district_name,municipality_name,geometry,area_km2
0,984369204-917025363-832132,10180213,10180214,58.075417,7.773373,Oftenes/Skarpeid/Hallandsvik,Søgne vestre,Søgne,MULTIPOLYGON(((7.77246614444789 58.07731661974...,0.33978
1,914811937-914894670-745543,9140401,9140405,58.645385,9.126323,Gjeving,Dypvåg,Tvedestrand,"POLYGON((9.12413616304391 58.6482702478331, 9....",0.191693
2,914367417-914437180-724647,8330116,8330102,59.44608,7.85574,Mo,Tokke,Tokke,MULTIPOLYGON(((7.84687329622546 59.45721161937...,1.314911
3,915056679-915080723-753186,18740102,18740103,67.939262,13.088658,Reine,Moskenes,Moskenes,"POLYGON((13.0899502784788 67.926800004445, 13....",0.213508
4,814264262-914427150-724449,14010207,14380209,61.767451,4.877038,Kalvåg,Ytre Bremanger,Bremanger,MULTIPOLYGON(((4.88970633600251 61.77425770724...,0.7509
5,940192315-971704047-56153,17420109,17420107,64.456134,12.329602,Tømmerås,Grong,Grong,"POLYGON((12.3305316467661 64.4601477872678, 12...",1.143809
6,993880787-988080551-128842,8150105,8150504,58.866786,9.397979,Løkka/Ths.heia,Kragerø sentrum,Kragerø,"POLYGON((9.39731370389271 58.8672505606497, 9....",0.222105
7,995053799-995066181-343704,5450115,5450109,61.130241,8.542016,Nordre Vang,Vang,Vang,MULTIPOLYGON(((8.46746730848971 61.11244896700...,3.769779
8,989222392-918234543-878557,5450115,5450109,61.130241,8.542016,Nordre Vang,Vang,Vang,MULTIPOLYGON(((8.46746730848971 61.11244896700...,3.769779
9,915479154-915538045-771279,18360101,18360113,66.585932,12.650201,Nordnesøy,Rødøy,Rødøy,MULTIPOLYGON(((12.6539608127019 66.58994633186...,0.39739
