In [2]:
"""
This is a boilerplate pipeline 'data_processing'
generated using Kedro 0.18.3
"""
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

"""
UTILITY FUNCTIONS
"""

'\nUTILITY FUNCTIONS\n'

In [46]:
"""
UTILITY FUNCTIONS
"""
# This function calculates the population for each grunnkrets
# Returns a df with grunnkretsID in the first column and population_count in the second column


def population(dataset_age):
    """Return the 2016 population count for every grunnkrets row.

    The rows of ``dataset_age`` with ``year == 2016`` are kept and every
    column except ``grunnkrets_id`` and ``year`` is summed row-wise.

    Args:
        dataset_age: DataFrame with ``grunnkrets_id``, ``year`` and the
            columns to be summed (presumably one column per age - confirm
            against the raw data).

    Returns:
        DataFrame with the columns ``grunnkrets_id`` and
        ``population_count``, indexed like the filtered input rows.
    """
    # Work on an explicit copy: assigning a new column to a filtered slice
    # raises pandas' SettingWithCopyWarning and may silently not stick.
    age_df = dataset_age[dataset_age["year"] == 2016].copy()
    age_df["population_count"] = age_df.drop(
        ["grunnkrets_id", "year"], axis=1).sum(axis=1)
    return age_df[["grunnkrets_id", "population_count"]]

# This function calculates the population in a district or municipality, by setting grouping_element to either district_name or municipality_name


def population_grouped(data_age, data_geography, grouping_element):
    """Sum the 2016 population per value of ``grouping_element``.

    Args:
        data_age: age table accepted by ``population``.
        data_geography: geography table with ``grunnkrets_id``, ``year`` and
            the column named by ``grouping_element``.
        grouping_element: name of the column to group by.

    Returns:
        DataFrame with ``grouping_element`` and the summed
        ``population_count``.
    """
    geography_2016 = data_geography[data_geography["year"] == 2016]
    population_by_grunnkrets = population(data_age)
    # Left merge keeps every grunnkrets that has population data.
    with_geography = population_by_grunnkrets.merge(
        geography_2016, how="left", on="grunnkrets_id")
    grouped = with_geography.groupby([grouping_element], as_index=False)
    return grouped["population_count"].sum()

# This function calculates the density (population/area_km2) for the chosen grouping_element


def population_density(age_df, geo_df, grouping_element):
    """Compute population / area_km2 per value of ``grouping_element``.

    Args:
        age_df: age table accepted by ``population``.
        geo_df: geography table with ``grunnkrets_id``, ``year``,
            ``area_km2`` and the column named by ``grouping_element``.
        grouping_element: name of the column to group by.

    Returns:
        DataFrame with ``grouping_element``, the summed ``population_count``
        and ``area_km2``, and their ratio in ``density``.
    """
    geography_2016 = geo_df[geo_df["year"] == 2016]
    with_geography = population(age_df).merge(
        geography_2016, how="left", on="grunnkrets_id")
    totals = with_geography.groupby([grouping_element], as_index=False)[
        ["population_count", "area_km2"]].sum()
    totals["density"] = totals["population_count"] / totals["area_km2"]
    return totals

# This function checks whether or not a store is part of a mall


def is_mall(stores_df):
    """Flag stores that have a mall name.

    Args:
        stores_df: DataFrame with ``store_id`` and ``mall_name``.

    Returns:
        New DataFrame with ``store_id``, ``mall_name`` and the boolean
        ``is_mall`` (True where ``mall_name`` is not missing). The input is
        left untouched.
    """
    has_mall = stores_df["mall_name"].notna()
    return stores_df[["store_id", "mall_name"]].assign(is_mall=has_mall)

# This function checks whether or not a store is part of a chain


def is_chain(stores_df):
    """Flag stores that have a chain name.

    Args:
        stores_df: DataFrame with ``store_id`` and ``chain_name``.

    Returns:
        New DataFrame with ``store_id``, ``chain_name`` and the boolean
        ``is_chain`` (True where ``chain_name`` is not missing). The input is
        left untouched.
    """
    has_chain = stores_df["chain_name"].notna()
    return stores_df[["store_id", "chain_name"]].assign(is_chain=has_chain)

# This function calculates the population count per number of stores in a geographic region


def population_per_store(age_df, geo_df, stores_df, grouping_element):
    """Compute the population per store for each geographic group.

    Args:
        age_df: age table accepted by ``population``.
        geo_df: geography table with ``grunnkrets_id``, ``year`` and the
            column named by ``grouping_element``.
        stores_df: stores table with ``store_id`` and ``grunnkrets_id``.
        grouping_element: name of the geography column to group by.

    Returns:
        DataFrame with ``grouping_element``, ``num_stores``,
        ``population_count`` and ``population_per_num_stores``. Infinite
        ratios (groups without stores) are replaced by 0.
    """
    geography_2016 = geo_df[geo_df["year"] == 2016]
    population_by_grunnkrets = population(age_df)
    population_by_group = population_grouped(age_df, geo_df, grouping_element)

    # Start from the population table so groups without stores are kept
    # (their store count becomes 0).
    stores_with_geography = population_by_grunnkrets.merge(
        stores_df, how="left", on="grunnkrets_id"
    ).merge(geography_2016, how="left", on="grunnkrets_id")
    stores_per_group = stores_with_geography.groupby(
        [grouping_element], as_index=False)["store_id"].count()

    result = stores_per_group.merge(
        population_by_group, how="inner", on=grouping_element)
    result["population_per_num_stores"] = (
        result["population_count"] / result["store_id"])
    result = result.rename(columns={"store_id": "num_stores"})
    return result.replace([np.inf, -np.inf], 0)

# This function does the same as population_per_store but can also filter on store types
def population_per_store_types(stores_df, plaace_hierarchy, grunnkrets_df, age_df, agg_name, geo_group, store_type_group):
    """Compute the population per store for each (geo group, store type).

    Args:
        stores_df: stores table.
        plaace_hierarchy: store-type hierarchy table.
        grunnkrets_df: geography table with a ``year`` column.
        age_df: age table accepted by ``population``.
        agg_name: name of the store-count column in the result.
        geo_group: geography column to group by.
        store_type_group: hierarchy column to group by.

    Returns:
        The store counts from ``store_types_count_by_geo_group`` joined with
        ``population_count`` per ``geo_group`` and their ratio in
        ``population_per_num_store``.
    """
    geography_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]
    store_counts = store_types_count_by_geo_group(
        stores_df, plaace_hierarchy, geography_2016,
        agg_name, geo_group, store_type_group)
    population_by_group = population_grouped(age_df, geography_2016, geo_group)

    result = store_counts.merge(population_by_group, how="left", on=geo_group)
    result["population_per_num_store"] = (
        result["population_count"] / result[agg_name])
    return result

# This function groups the age distribution (0-90) into 7 buckets and returns a table with the percentage each of these
# buckets represents of the total number of people living in the given geographic region


def age_distrubution(grunnkrets_age_df, geographic_df, grouping_element):
    """Bucket the 2016 age columns into 7 groups and compute their shares.

    Args:
        grunnkrets_age_df: age table with ``grunnkrets_id``, ``year`` and the
            age columns. Buckets are taken by column POSITION after ``year``
            is dropped, so ``grunnkrets_id`` is assumed to be column 0 and
            the ages to follow in order - TODO confirm against the raw data.
        geographic_df: geography table with ``grunnkrets_id``, ``year``,
            ``geometry``, ``area_km2`` and the column named by
            ``grouping_element``.
        grouping_element: name of the geography column to group by.

    Returns:
        DataFrame with ``grouping_element``, the seven ``num_*`` bucket
        totals and the matching ``*_%`` columns (bucket total divided by the
        group's population count).
    """
    # (bucket column, first column position, end column position - exclusive)
    bucket_slices = [
        ("num_kids", 1, 8),
        ("num_kids+", 8, 14),
        ("num_youths", 14, 19),
        ("num_youthAdult", 19, 27),
        ("num_adult", 27, 37),
        ("num_adults+", 37, 62),
        ("num_pensinors", 62, 92),
    ]
    bucket_names = [name for name, _, _ in bucket_slices]

    ages_2016 = grunnkrets_age_df[grunnkrets_age_df["year"] == 2016].drop(
        ["year"], axis=1)
    # Buckets are appended one at a time, in this order, to the same frame
    # the positional slices are taken from.
    for name, start, stop in bucket_slices:
        ages_2016[name] = ages_2016.iloc[:, start:stop].sum(axis=1)
    buckets_by_grunnkrets = ages_2016[["grunnkrets_id"] + bucket_names]

    population_by_grunnkrets = population(grunnkrets_age_df)
    geography_2016 = geographic_df[geographic_df["year"] == 2016].drop(
        ["geometry", "area_km2", "year"], axis=1)
    merged = buckets_by_grunnkrets.merge(
        population_by_grunnkrets, how="inner", on="grunnkrets_id"
    ).merge(geography_2016, how="inner", on="grunnkrets_id")
    bucket_totals = merged.groupby(
        [grouping_element], as_index=False)[bucket_names].sum()

    population_by_group = population_grouped(
        grunnkrets_age_df, geographic_df, grouping_element)
    result = bucket_totals.merge(
        population_by_group, how="inner", on=grouping_element)

    # "num_kids" -> "kids_%", "num_adults+" -> "adults+_%", ...
    for name in bucket_names:
        share_name = f"{name[len('num_'):]}_%"
        result[share_name] = result[name] / result["population_count"]

    return result.drop(["population_count"], axis=1)

# This function calculates the total amount of household types based on a geographic area


def household_type_distrubution(geographic_df, household_df, grouping_element):
    """Total each household type per geographic group, with its share.

    Args:
        geographic_df: geography table with ``grunnkrets_id``, ``year`` and
            the column named by ``grouping_element``.
        household_df: household table with ``grunnkrets_id``, ``year`` and
            the eight household-type columns.
        grouping_element: name of the geography column to group by.

    Returns:
        DataFrame with ``grouping_element``, the eight summed household-type
        columns and eight ``%_dist_of_<type>`` columns holding each type's
        fraction of the group's total.
    """
    household_types = [
        "couple_children_0_to_5_years",
        "couple_children_18_or_above",
        "couple_children_6_to_17_years",
        "couple_without_children",
        "single_parent_children_0_to_5_years",
        "single_parent_children_18_or_above",
        "single_parent_children_6_to_17_years",
        "singles",
    ]

    households_2016 = household_df[household_df["year"] == 2016]
    geography_2016 = geographic_df[geographic_df["year"] == 2016]
    merged = geography_2016.merge(
        households_2016, how="inner", on="grunnkrets_id")

    totals = merged.groupby(
        [grouping_element], as_index=False)[household_types].sum()
    # Column 0 is the grouping column; everything after it is a type total.
    totals["tot_pop_count"] = totals.iloc[:, 1:].sum(axis=1)

    for household_type in household_types:
        totals[f"%_dist_of_{household_type}"] = (
            totals[household_type] / totals["tot_pop_count"])

    return totals.drop(["tot_pop_count"], axis=1)


# Simens functions
def average_revenue_of_chain(dataset_stores):
    """Return the mean 2016 revenue per chain.

    Args:
        dataset_stores: DataFrame with ``year``, ``chain_name`` and
            ``revenue``.

    Returns:
        Series of mean ``revenue`` indexed by ``chain_name``.
    """
    stores_2016 = dataset_stores.loc[dataset_stores["year"] == 2016]
    revenue_by_chain = stores_2016.groupby(['chain_name'])['revenue']
    return revenue_by_chain.mean()


def average_revenue_of_mall(dataset_stores):
    """Return the mean 2016 revenue per mall.

    Args:
        dataset_stores: DataFrame with ``year``, ``mall_name`` and
            ``revenue``.

    Returns:
        Series of mean ``revenue`` indexed by ``mall_name``.
    """
    stores_2016 = dataset_stores.loc[dataset_stores["year"] == 2016]
    revenue_by_mall = stores_2016.groupby(['mall_name'])['revenue']
    return revenue_by_mall.mean()


def mean_income_per_capita(dataset_age, dataset_income):
    """Compute ``all_households`` income divided by population per grunnkrets.

    Args:
        dataset_age: age table accepted by ``population``.
        dataset_income: income table with ``grunnkrets_id``, ``year``,
            ``all_households`` and the per-household-type income columns.

    Returns:
        DataFrame with ``grunnkrets_id``, ``population_count`` and
        ``mean_income`` (plus any income columns not dropped below).
    """
    unused_columns = [
        'year', 'singles', 'couple_without_children',
        'couple_with_children', 'other_households',
        'single_parent_with_children',
    ]
    income_2016 = dataset_income[dataset_income["year"] == 2016]
    merged = population(dataset_age).merge(
        income_2016, how='left', on='grunnkrets_id')

    result = merged.drop(unused_columns, axis=1)
    result['mean_income'] = (
        result['all_households'] / result['population_count'])
    return result.drop(['all_households'], axis=1)


def mean_income_per_capita_grouped(dataset_age, dataset_income, dataset_geography, grouping_element):
    """Population-weighted mean income per value of ``grouping_element``.

    Args:
        dataset_age: age table accepted by ``population``.
        dataset_income: income table accepted by ``mean_income_per_capita``.
        dataset_geography: geography table with ``grunnkrets_id``, ``year``
            and the column named by ``grouping_element``.
        grouping_element: name of the geography column to group by.

    Returns:
        DataFrame with ``grouping_element`` and ``mean_income``.
    """
    income_by_grunnkrets = mean_income_per_capita(dataset_age, dataset_income)
    geography_2016 = dataset_geography[dataset_geography["year"] == 2016]
    with_geography = income_by_grunnkrets.merge(
        geography_2016, how='left', on='grunnkrets_id')

    # Total population of each group, attached back onto every grunnkrets row.
    # After the merge, population_count_x is the grunnkrets population and
    # population_count_y is the group total.
    population_by_group = with_geography.groupby(
        [grouping_element], as_index=False)["population_count"].sum()
    weighted = with_geography.merge(
        population_by_group, how='left', on=grouping_element)

    # Scale each grunnkrets' mean income by its share of the group population;
    # the scaled values of one group then sum to the group's mean income.
    weighted['mean_income'] = (
        weighted['mean_income']
        * weighted['population_count_x']
        / weighted['population_count_y']
    )
    return weighted.groupby(
        [grouping_element], as_index=False)["mean_income"].sum()

# def stores_density_per_location_by_type(stores_df, plaace_df, grunnkrets_df, geo="district_name", lv_desc="lv1_desc"):
#     """
#     Density of stores of the same type in a geographic location.

#     This depends on population
#     """
#     number_of_stores = store_types_count_by_geo_group(
#         stores_df, plaace_df, grunnkrets_df, geo=geo, lv_desc=lv_desc)['count']
#     population = 0
#     return number_of_stores / population

def stores_in_radius(stores_df, plaace_df, radius=0.1, store_type_group=None):
    """Count the other stores within ``radius`` of every store.

    Distances are Euclidean distances on the raw ``lat``/``lon`` values.
    Pairs at distance exactly 0 are not counted, which excludes the store
    itself.

    Args:
        stores_df: DataFrame with ``store_id``, ``lat``, ``lon`` (and
            ``plaace_hierarchy_id`` when ``store_type_group`` is given).
        plaace_df: hierarchy table with ``plaace_hierarchy_id`` and the
            column named by ``store_type_group``.
        radius: distance threshold (strictly less than).
        store_type_group: optional hierarchy column; when given, only nearby
            stores with the same value as the store itself are counted.

    Returns:
        DataFrame indexed by ``store_id`` with a single ``count`` column.
    """
    coordinates = stores_df[['lat', 'lon']]
    store_ids = stores_df['store_id']
    distances = pd.DataFrame(
        cdist(coordinates, coordinates, metric='euclidean'),
        index=store_ids, columns=store_ids)
    # NaN everywhere except for neighbours inside the radius.
    neighbours = distances[(distances < radius) & (distances > 0)]

    if store_type_group is None:
        return neighbours.count(axis=1).to_frame(name="count")

    typed_stores = stores_df.merge(
        plaace_df, how="inner", on="plaace_hierarchy_id")
    same_type_counts = {}
    for store_id, row in neighbours.iterrows():
        nearby_ids = row.dropna().index.values
        own_type = typed_stores.loc[
            typed_stores['store_id'] == store_id, store_type_group].values[0]
        is_nearby = typed_stores['store_id'].isin(nearby_ids)
        is_same_type = typed_stores[store_type_group] == own_type
        same_type_counts[store_id] = typed_stores.loc[
            is_nearby & is_same_type, 'store_id'].count()

    result = pd.DataFrame.from_dict(
        same_type_counts, orient='index', columns=['count'])
    result.index.rename('store_id', inplace=True)
    return result

def store_types_count_by_geo_group(stores_df, plaace_df, grunnkrets_df, agg_name, geo_group="district_name", store_type_group="lv1_desc"):
    """Count stores per (geographic group, store type) combination.

    Args:
        stores_df: stores with ``store_id``, ``plaace_hierarchy_id`` and
            ``grunnkrets_id``.
        plaace_df: hierarchy table with ``plaace_hierarchy_id`` and the
            column named by ``store_type_group``.
        grunnkrets_df: geography table with ``grunnkrets_id`` and the column
            named by ``geo_group``.
        agg_name: name of the count column in the result.
        geo_group: geography column to group by.
        store_type_group: hierarchy column to group by.

    Returns:
        DataFrame with ``geo_group``, ``store_type_group`` and ``agg_name``.
    """
    with_types = stores_df.merge(
        plaace_df, how="inner", on="plaace_hierarchy_id")
    with_geography = with_types.merge(
        grunnkrets_df, how="inner", on="grunnkrets_id")
    counts = with_geography.groupby(
        by=[geo_group, store_type_group])['store_id'].count()
    return counts.rename(agg_name).reset_index()


def store_types_revenue_by_geo_group(stores_df, plaace_df, grunnkrets_df, agg_name, geo_group="district_name", store_type_group="lv1_desc"):
    """Sum store revenue per (geographic group, store type) combination.

    Args:
        stores_df: stores with ``revenue``, ``plaace_hierarchy_id`` and
            ``grunnkrets_id``.
        plaace_df: hierarchy table with ``plaace_hierarchy_id`` and the
            column named by ``store_type_group``.
        grunnkrets_df: geography table with ``grunnkrets_id`` and the column
            named by ``geo_group``.
        agg_name: name of the revenue column in the result.
        geo_group: geography column to group by.
        store_type_group: hierarchy column to group by.

    Returns:
        DataFrame with ``geo_group``, ``store_type_group`` and ``agg_name``.
    """
    with_types = stores_df.merge(
        plaace_df, how="inner", on="plaace_hierarchy_id")
    with_geography = with_types.merge(
        grunnkrets_df, how="inner", on="grunnkrets_id")
    revenue = with_geography.groupby(
        by=[geo_group, store_type_group])['revenue'].sum()
    return revenue.rename(agg_name).reset_index()

def store_types_all_count_by_geo_groups(stores_df, plaace_df, grunnkrets_df, store_types, geo_groups):
    """Build per-store count features for every (geo group, store type) pair.

    For each pair, the count from ``store_types_count_by_geo_group`` is
    joined back onto the stores and kept as the column
    ``"<geo_group>_<store_type>"``.

    Args:
        stores_df: stores table.
        plaace_df: store-type hierarchy table.
        grunnkrets_df: geography table.
        store_types: iterable of hierarchy column names.
        geo_groups: iterable of geography column names.

    Returns:
        DataFrame indexed by ``store_id`` with one column per pair.
    """
    stores_enriched = stores_df.merge(
        grunnkrets_df, how="left", on="grunnkrets_id"
    ).merge(plaace_df, how="left", on="plaace_hierarchy_id")

    feature_frames = []
    for geo_group in geo_groups:
        for store_type in store_types:
            feature_name = f"{geo_group}_{store_type}"
            counts = store_types_count_by_geo_group(
                stores_df, plaace_df, grunnkrets_df,
                geo_group=geo_group, agg_name=feature_name,
                store_type_group=store_type)
            per_store = stores_enriched.merge(
                counts, how="left", on=[geo_group, store_type])
            feature_frames.append(
                per_store[['store_id', feature_name]].set_index('store_id'))

    return pd.concat(feature_frames, axis=1)

def store_types_all_revenue_by_geo_groups(stores_df, plaace_df, grunnkrets_df, store_types, geo_groups):
    """Build per-store revenue features for every (geo group, store type) pair.

    For each pair, the total from ``store_types_revenue_by_geo_group`` is
    joined back onto the stores and kept as the column
    ``"<geo_group>_<store_type>"``.

    Args:
        stores_df: stores table.
        plaace_df: store-type hierarchy table.
        grunnkrets_df: geography table.
        store_types: iterable of hierarchy column names.
        geo_groups: iterable of geography column names.

    Returns:
        DataFrame indexed by ``store_id`` with one column per pair.
    """
    stores_enriched = stores_df.merge(
        grunnkrets_df, how="left", on="grunnkrets_id"
    ).merge(plaace_df, how="left", on="plaace_hierarchy_id")

    feature_frames = []
    for geo_group in geo_groups:
        for store_type in store_types:
            feature_name = f"{geo_group}_{store_type}"
            revenue = store_types_revenue_by_geo_group(
                stores_df, plaace_df, grunnkrets_df,
                geo_group=geo_group, agg_name=feature_name,
                store_type_group=store_type)
            per_store = stores_enriched.merge(
                revenue, how="left", on=[geo_group, store_type])
            feature_frames.append(
                per_store[['store_id', feature_name]].set_index('store_id'))

    return pd.concat(feature_frames, axis=1)

def stores_in_radius_by_type(stores_df, plaace_df, store_types, radius=0.1):
    """Count nearby stores overall and per store-type level.

    Args:
        stores_df: stores table accepted by ``stores_in_radius``.
        plaace_df: store-type hierarchy table.
        store_types: iterable of hierarchy column names.
        radius: distance threshold passed on to ``stores_in_radius``.

    Returns:
        DataFrame with ``number_of_all_stores`` followed by one
        ``number_of_<store_type>`` column per entry of ``store_types``.
    """
    # Unfiltered count first, then one same-type count per hierarchy level.
    all_stores = stores_in_radius(
        stores_df, plaace_df, radius=radius
    ).rename(columns={'count': 'number_of_all_stores'})

    per_type = [
        stores_in_radius(
            stores_df, plaace_df, store_type_group=store_type, radius=radius
        ).rename(columns={'count': f'number_of_{store_type}'})
        for store_type in store_types
    ]
    return pd.concat([all_stores] + per_type, axis=1)

def bus_stops_lat_lon(bus_stops_df):
    """Extract longitude and latitude from ``geometry`` as numeric columns.

    The text between the first pair of parentheses in ``geometry`` is split
    on the first space; the first part becomes ``lon`` and the second
    ``lat``.

    Args:
        bus_stops_df: DataFrame with ``busstop_id``, ``stopplace_type``,
            ``importance_level``, ``side_placement`` and ``geometry``.
            It is not modified.

    Returns:
        New DataFrame with the five input columns plus numeric ``lat`` and
        ``lon``.
    """
    # Work on a copy so the caller's frame does not gain helper columns.
    stops = bus_stops_df.copy()
    coordinates = stops['geometry'].str.extract(r'\((.*?)\)', expand=False)
    # ``n`` must be passed by keyword: positional use was deprecated in
    # pandas 1.x and raises TypeError from pandas 2.0.
    parts = coordinates.str.split(" ", n=1, expand=True)
    stops['lon'] = pd.to_numeric(parts[0])
    stops['lat'] = pd.to_numeric(parts[1])
    return stops[['busstop_id', 'stopplace_type', 'importance_level', 'side_placement', 'geometry', 'lat', 'lon']]

def bus_stops_closest(stores_df, bus_stops_df, importance_level="Regionalt knutepunkt"):
    """Find the closest bus stop of one importance level for every store.

    Distances are Euclidean distances on the raw ``lat``/``lon`` values.

    Args:
        stores_df: DataFrame with ``store_id``, ``lat`` and ``lon``.
        bus_stops_df: DataFrame with ``busstop_id``, ``importance_level``,
            ``lat`` and ``lon``.
        importance_level: only stops with this ``importance_level`` are
            considered.

    Returns:
        DataFrame with ``store_id``, ``closest_bus_stop`` (a ``busstop_id``)
        and ``distance``.
    """
    candidate_stops = bus_stops_df.loc[
        bus_stops_df['importance_level'] == importance_level]
    distances = pd.DataFrame(
        cdist(stores_df[['lat', 'lon']],
              candidate_stops[['lat', 'lon']], metric='euclidean'),
        index=stores_df['store_id'],
        columns=candidate_stops['busstop_id'])

    return pd.DataFrame({
        'store_id': stores_df['store_id'].values,
        'closest_bus_stop': distances.idxmin(axis=1).values,
        'distance': distances.min(axis=1).values,
    })

def bus_stops_in_radius(stores_df, bus_stops_df, radius=0.1, importance_level=None):
    """Count the bus stops within ``radius`` of every store.

    Distances are Euclidean distances on the raw ``lat``/``lon`` values.

    Args:
        stores_df: DataFrame with ``store_id``, ``lat`` and ``lon``.
        bus_stops_df: DataFrame with ``busstop_id``, ``importance_level``,
            ``lat`` and ``lon``.
        radius: distance threshold (strictly less than).
        importance_level: when given, only stops with this
            ``importance_level`` are counted.

    Returns:
        DataFrame with the columns ``store_id`` and ``count``.
    """
    stops = bus_stops_df
    if importance_level is not None:
        stops = stops[stops['importance_level'] == importance_level]

    distances = pd.DataFrame(
        cdist(stores_df[['lat', 'lon']],
              stops[['lat', 'lon']], metric='euclidean'),
        index=stores_df['store_id'],
        columns=stops['busstop_id'])
    # Masking leaves NaN outside the radius, so count() tallies the hits.
    within_radius = distances[distances < radius]
    return within_radius.count(axis=1).to_frame(name='count').reset_index()

# Relevant feature engineering functions.
def bus_stops_distance_by_importance(stores_df, bus_stops_df, stop_importance_levels):
    """Distance from every store to its closest stop of each importance level.

    Args:
        stores_df: stores table accepted by ``bus_stops_closest``.
        bus_stops_df: bus stop table accepted by ``bus_stops_closest``.
        stop_importance_levels: iterable of ``importance_level`` values.

    Returns:
        DataFrame indexed by ``store_id`` with one ``distance_to_<level>``
        column per level, where ``<level>`` is the level lower-cased with
        spaces replaced by underscores.
    """
    distance_frames = []
    for importance_level in stop_importance_levels:
        column_name = (
            f'distance_to_{importance_level.lower().replace(" ", "_")}')
        closest = bus_stops_closest(
            stores_df, bus_stops_df, importance_level=importance_level)
        closest = closest.rename(columns={'distance': column_name})
        distance_frames.append(
            closest[['store_id', column_name]].set_index('store_id'))

    return pd.concat(distance_frames, axis=1)

def bus_stops_in_radius_by_importance(stores_df, bus_stops_df, stop_importance_levels, radius=0.01):
    """Count bus stops near every store, overall and per importance level.

    Args:
        stores_df: stores table accepted by ``bus_stops_in_radius``.
        bus_stops_df: bus stop table accepted by ``bus_stops_in_radius``.
        stop_importance_levels: iterable of ``importance_level`` values.
        radius: distance threshold passed on to ``bus_stops_in_radius``.

    Returns:
        DataFrame indexed by ``store_id`` with ``number_of_all_stop_types``
        followed by one ``number_of_<level>`` column per level, where
        ``<level>`` is the level lower-cased with spaces replaced by
        underscores.
    """
    # Unfiltered count of every stop type goes first.
    all_stops = bus_stops_in_radius(
        stores_df, bus_stops_df, radius=radius
    ).rename(columns={'count': 'number_of_all_stop_types'})
    count_frames = [all_stops.set_index('store_id')]

    for importance_level in stop_importance_levels:
        column_name = (
            f'number_of_{importance_level.lower().replace(" ", "_")}')
        counts = bus_stops_in_radius(
            stores_df, bus_stops_df,
            importance_level=importance_level, radius=radius)
        counts = counts.rename(columns={'count': column_name})
        count_frames.append(
            counts[['store_id', column_name]].set_index('store_id'))

    return pd.concat(count_frames, axis=1)

In [4]:
# Reading the datasets
# The machine-specific base folders are defined once, so pointing the notebook
# at another checkout only requires changing PROJECT_DIR.
PROJECT_DIR = 'C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning'
RAW_DATA_DIR = f'{PROJECT_DIR}/data/raw'

busstops = pd.read_csv(f'{RAW_DATA_DIR}/busstops_norway.csv')
grunnkrets_age = pd.read_csv(f'{RAW_DATA_DIR}/grunnkrets_age_distribution.csv')
grunnkrets_household = pd.read_csv(f'{RAW_DATA_DIR}/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv(f'{RAW_DATA_DIR}/grunnkrets_income_households.csv')
grunnkrets_norway = pd.read_csv(f'{RAW_DATA_DIR}/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv(f'{RAW_DATA_DIR}/plaace_hierarchy.csv')
sample_submission = pd.read_csv(f'{RAW_DATA_DIR}/sample_submission.csv')
stores_extra = pd.read_csv(f'{RAW_DATA_DIR}/stores_extra.csv')
stores_test = pd.read_csv(f'{RAW_DATA_DIR}/stores_test.csv')
stores_train = pd.read_csv(f'{RAW_DATA_DIR}/stores_train.csv')
simens_df = pd.read_csv(f"{PROJECT_DIR}/notebooks/simen/simens_dataframe-1.csv")

In [79]:
# Inspect a single row of the grunnkrets geography table (positional index 6413).
grunnkrets_norway.iloc[6413]

grunnkrets_id                                                  9140403
year                                                              2016
grunnkrets_name                                                  Eidbu
district_name                                                   Dypvåg
municipality_name                                          Tvedestrand
geometry             POLYGON((8.97738891833203 58.6039662693649, 8....
area_km2                                                      0.210958
Name: 6413, dtype: object

### New functions based on the previous feature_functions that return store_id as the index with all different geo_groups (possibly store_types when appropriate)

In [10]:
def age_dist_by_geo_group(stores_df, age_df, grunnkrets_df):
    """Attach the age distribution of each store's geographic groups.

    For ``grunnkrets_name``, ``district_name`` and ``municipality_name`` the
    result of ``age_distrubution`` is joined onto the stores by that column,
    and every feature is prefixed with ``"<geo_group>_"``.

    Args:
        stores_df: stores with ``store_id`` and ``grunnkrets_id``.
        age_df: age table accepted by ``age_distrubution``.
        grunnkrets_df: geography table with ``grunnkrets_id``, ``year`` and
            the three geo-group columns.

    Returns:
        DataFrame indexed by ``store_id`` with 14 columns per geo group.
    """
    age_features = [
        'num_kids', 'num_kids+', 'num_youths', 'num_youthAdult', 'num_adult',
        'num_adults+', 'num_pensinors', 'kids_%', 'kids+_%', 'youths_%',
        'youthAdult_%', 'adult_%', 'adults+_%', 'pensinors_%',
    ]
    geo_groups = ["grunnkrets_name", "district_name", "municipality_name"]

    geography_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]
    stores_with_geography = stores_df.merge(
        geography_2016, how="left", on="grunnkrets_id")

    feature_frames = []
    for geo_group in geo_groups:
        distribution = age_distrubution(age_df, grunnkrets_df, geo_group)
        per_store = stores_with_geography.merge(
            distribution, how="left", on=geo_group)
        features = per_store[["store_id"] + age_features].set_index("store_id")
        feature_frames.append(features.add_prefix(f'{geo_group}_'))

    return pd.concat(feature_frames, axis=1)
    
# Build the age-distribution features for the training stores; the result is not stored in a variable.
age_dist_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

Unnamed: 0_level_0,grunnkrets_name_num_kids,grunnkrets_name_num_kids+,grunnkrets_name_num_youths,grunnkrets_name_num_youthAdult,grunnkrets_name_num_adult,grunnkrets_name_num_adults+,grunnkrets_name_num_pensinors,grunnkrets_name_kids_%,grunnkrets_name_kids+_%,grunnkrets_name_youths_%,...,municipality_name_num_adult,municipality_name_num_adults+,municipality_name_num_pensinors,municipality_name_kids_%,municipality_name_kids+_%,municipality_name_youths_%,municipality_name_youthAdult_%,municipality_name_adult_%,municipality_name_adults+_%,municipality_name_pensinors_%
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
983540538-974187930-44774,7.0,11.0,5.0,29.0,32.0,51.0,22.0,0.044586,0.070064,0.031847,...,10104.0,21723.0,14453.0,0.081946,0.070752,0.057645,0.097982,0.151009,0.324660,0.216007
987074191-973117734-44755,,,,,,,,,,,...,145667.0,214521.0,104023.0,0.088735,0.062293,0.043686,0.107818,0.218862,0.322314,0.156293
984890265-981157303-64491,26.0,25.0,13.0,58.0,84.0,85.0,81.0,0.069892,0.067204,0.034946,...,3494.0,9919.0,7312.0,0.070198,0.064986,0.057255,0.101669,0.119005,0.337841,0.249046
914057442-992924179-126912,96.0,70.0,81.0,231.0,361.0,634.0,567.0,0.047059,0.034314,0.039706,...,6933.0,16119.0,9322.0,0.083078,0.073225,0.063207,0.115903,0.142323,0.330897,0.191366
913018583-913063538-668469,,,,,,,,,,,...,32303.0,58362.0,33507.0,0.083598,0.067487,0.052496,0.131132,0.173073,0.312692,0.179524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915789943-915806929-781991,887.0,694.0,604.0,1445.0,1999.0,4060.0,4192.0,0.063900,0.049996,0.043513,...,3202.0,9066.0,6391.0,0.075857,0.069889,0.063000,0.103821,0.117968,0.334009,0.235457
917921733-917982368-868081,21.0,11.0,6.0,91.0,116.0,140.0,118.0,0.041750,0.021869,0.011928,...,3575.0,9758.0,7105.0,0.076141,0.067958,0.059269,0.108368,0.120391,0.328608,0.239266
911721961-911764474-496764,80.0,42.0,15.0,165.0,217.0,296.0,302.0,0.071620,0.037601,0.013429,...,6718.0,17764.0,10078.0,0.085019,0.077404,0.066244,0.101721,0.130164,0.344184,0.195265
914337046-914343372-721294,14.0,8.0,0.0,28.0,82.0,68.0,81.0,0.049822,0.028470,0.000000,...,11822.0,24311.0,11473.0,0.104094,0.082311,0.066210,0.106477,0.159157,0.327293,0.154458


In [11]:
def household_dist_by_geo_group(stores_df, household_df, grunnkrets_df):
    """Attach the household-type distribution of each store's geo groups.

    For ``grunnkrets_name``, ``district_name`` and ``municipality_name`` the
    result of ``household_type_distrubution`` is joined onto the stores by
    that column, and every feature is prefixed with ``"<geo_group>_"``.

    Args:
        stores_df: stores with ``store_id`` and ``grunnkrets_id``.
        household_df: household table accepted by
            ``household_type_distrubution``.
        grunnkrets_df: geography table with ``grunnkrets_id``, ``year`` and
            the three geo-group columns.

    Returns:
        DataFrame indexed by ``store_id`` with 16 columns per geo group.
    """
    household_features = [
        'couple_children_0_to_5_years', 'couple_children_18_or_above',
        'couple_children_6_to_17_years', 'couple_without_children',
        'single_parent_children_0_to_5_years',
        'single_parent_children_18_or_above',
        'single_parent_children_6_to_17_years', 'singles',
        '%_dist_of_couple_children_0_to_5_years',
        '%_dist_of_couple_children_18_or_above',
        '%_dist_of_couple_children_6_to_17_years',
        '%_dist_of_couple_without_children',
        '%_dist_of_single_parent_children_0_to_5_years',
        '%_dist_of_single_parent_children_18_or_above',
        '%_dist_of_single_parent_children_6_to_17_years',
        '%_dist_of_singles',
    ]
    geo_groups = ["grunnkrets_name", "district_name", "municipality_name"]

    geography_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]
    stores_with_geography = stores_df.merge(
        geography_2016, how="left", on="grunnkrets_id")

    feature_frames = []
    for geo_group in geo_groups:
        distribution = household_type_distrubution(
            geography_2016, household_df, geo_group)
        per_store = stores_with_geography.merge(
            distribution, how="left", on=geo_group)
        features = per_store[
            ["store_id"] + household_features].set_index("store_id")
        feature_frames.append(features.add_prefix(f'{geo_group}_'))

    return pd.concat(feature_frames, axis=1)

# Build the household-distribution features for the training stores; the result is not stored in a variable.
household_dist_by_geo_group(stores_train, grunnkrets_household, grunnkrets_norway)


Unnamed: 0_level_0,grunnkrets_name_couple_children_0_to_5_years,grunnkrets_name_couple_children_18_or_above,grunnkrets_name_couple_children_6_to_17_years,grunnkrets_name_couple_without_children,grunnkrets_name_single_parent_children_0_to_5_years,grunnkrets_name_single_parent_children_18_or_above,grunnkrets_name_single_parent_children_6_to_17_years,grunnkrets_name_singles,grunnkrets_name_%_dist_of_couple_children_0_to_5_years,grunnkrets_name_%_dist_of_couple_children_18_or_above,...,municipality_name_single_parent_children_6_to_17_years,municipality_name_singles,municipality_name_%_dist_of_couple_children_0_to_5_years,municipality_name_%_dist_of_couple_children_18_or_above,municipality_name_%_dist_of_couple_children_6_to_17_years,municipality_name_%_dist_of_couple_without_children,municipality_name_%_dist_of_single_parent_children_0_to_5_years,municipality_name_%_dist_of_single_parent_children_18_or_above,municipality_name_%_dist_of_single_parent_children_6_to_17_years,municipality_name_%_dist_of_singles
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
983540538-974187930-44774,20.0,0.0,8.0,28.0,4.0,6.0,11.0,75.0,0.131579,0.000000,...,3311.0,15127.0,0.186963,0.066607,0.195177,0.224918,0.019046,0.028364,0.050088,0.228836
987074191-973117734-44755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.000000,0.000000,...,26785.0,191856.0,0.201397,0.050282,0.155769,0.213432,0.020295,0.023294,0.041105,0.294426
984890265-981157303-64491,45.0,9.0,38.0,84.0,13.0,6.0,12.0,150.0,0.126050,0.025210,...,1383.0,6345.0,0.159872,0.081264,0.190087,0.256692,0.014935,0.030595,0.047703,0.218853
914057442-992924179-126912,206.0,68.0,211.0,524.0,27.0,34.0,79.0,860.0,0.102539,0.033848,...,2408.0,9763.0,0.187809,0.078256,0.216275,0.230354,0.013176,0.024341,0.049420,0.200369
913018583-913063538-668469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.000000,0.000000,...,7101.0,46358.0,0.190190,0.064963,0.189908,0.243646,0.009357,0.022812,0.037076,0.242048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915789943-915806929-781991,1720.0,631.0,1665.0,3432.0,376.0,354.0,623.0,4635.0,0.128014,0.046963,...,1354.0,5512.0,0.170678,0.075886,0.202808,0.252582,0.015192,0.027821,0.050293,0.204740
917921733-917982368-868081,47.0,9.0,26.0,144.0,5.0,7.0,19.0,280.0,0.087523,0.016760,...,1335.0,6365.0,0.174320,0.068225,0.199960,0.263199,0.010714,0.024157,0.044978,0.214447
911721961-911764474-496764,147.0,30.0,76.0,332.0,50.0,30.0,33.0,407.0,0.133032,0.027149,...,2507.0,9825.0,0.187748,0.078660,0.237456,0.212699,0.016971,0.027010,0.048680,0.190777
914337046-914343372-721294,22.0,0.0,7.0,86.0,6.0,0.0,10.0,114.0,0.089796,0.000000,...,2968.0,12149.0,0.242122,0.079840,0.234385,0.197176,0.017077,0.022377,0.040646,0.166377


In [74]:
def population_count_grouped_by_geo_group(stores_df, age_df, grunnkrets_df):
    """Attach the 2016 population of each store's geographic groups.

    For each of ``grunnkrets_id``, ``district_name`` and ``municipality_name``
    the population total returned by ``population_grouped`` is left-joined
    onto the stores.

    Args:
        stores_df: Stores frame with ``store_id`` and ``grunnkrets_id``.
        age_df: Age-distribution frame forwarded to ``population_grouped``.
        grunnkrets_df: Geography frame with a ``year`` column and the three
            grouping columns.

    Returns:
        DataFrame indexed by ``store_id`` with one
        ``<geo_group>_population_count`` column per geographic group.
    """
    # NOTE(review): a recorded run shows district totals larger than the
    # municipality total for the same store, so grouping by district_name
    # alone presumably pools same-named districts -- confirm.
    is_2016 = grunnkrets_df["year"] == 2016
    stores_with_geo = stores_df.merge(
        grunnkrets_df[is_2016], how="left", on="grunnkrets_id"
    )

    def _feature_for(level):
        # population_grouped applies its own 2016 filter, so it receives the
        # unfiltered geography frame.
        totals = population_grouped(age_df, grunnkrets_df, level)
        joined = stores_with_geo.merge(totals, how="left", on=level)
        picked = joined[["store_id", "population_count"]].set_index("store_id")
        return picked.add_prefix(f"{level}_")

    levels = ("grunnkrets_id", "district_name", "municipality_name")
    return pd.concat([_feature_for(level) for level in levels], axis=1)

# Build the population-count features for the training stores and display them.
df = population_count_grouped_by_geo_group(
    stores_df=stores_train,
    age_df=grunnkrets_age,
    grunnkrets_df=grunnkrets_norway,
)
df
     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0_level_0,grunnkrets_id_population_count,district_name_population_count,municipality_name_population_count
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
983540538-974187930-44774,157.0,4555.0,66910.0
987074191-973117734-44755,,5115.0,665566.0
984890265-981157303-64491,372.0,3427.0,29360.0
914057442-992924179-126912,474.0,38006.0,48713.0
913018583-913063538-668469,,8298.0,186644.0
...,...,...,...
915789943-915806929-781991,1516.0,38006.0,27143.0
917921733-917982368-868081,503.0,38006.0,29695.0
911721961-911764474-496764,1117.0,7091.0,51612.0
914337046-914343372-721294,281.0,38006.0,74279.0


In [13]:
def population_density_grouped_by_geo_group(stores_df, age_df, grunnkrets_df):
    """Attach the 2016 population density of each store's geographic groups.

    For each of ``grunnkrets_name``, ``district_name`` and
    ``municipality_name`` the ``density`` column returned by
    ``population_density`` is left-joined onto the stores.

    Args:
        stores_df: Stores frame with ``store_id`` and ``grunnkrets_id``.
        age_df: Age-distribution frame forwarded to ``population_density``.
        grunnkrets_df: Geography frame with a ``year`` column and the three
            grouping columns.

    Returns:
        DataFrame indexed by ``store_id`` with one ``<geo_group>_density``
        column per geographic group.
    """
    # NOTE(review): the finest level here is grunnkrets_name, whereas the
    # sibling population features key on grunnkrets_id -- confirm this is
    # intentional.
    is_2016 = grunnkrets_df["year"] == 2016
    stores_with_geo = stores_df.merge(
        grunnkrets_df[is_2016], how="left", on="grunnkrets_id"
    )

    features = []
    for level in ("grunnkrets_name", "district_name", "municipality_name"):
        densities = population_density(age_df, grunnkrets_df, level)
        joined = stores_with_geo.merge(densities, how="left", on=level)
        per_store = joined[["store_id", "density"]].set_index("store_id")
        features.append(per_store.add_prefix(f"{level}_"))

    return pd.concat(features, axis=1)

# Display the population-density features for the training stores.
population_density_grouped_by_geo_group(
    stores_df=stores_train,
    age_df=grunnkrets_age,
    grunnkrets_df=grunnkrets_norway,
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0_level_0,grunnkrets_name_density,district_name_density,municipality_name_density
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
983540538-974187930-44774,1007.837931,3287.897111,2070.115106
987074191-973117734-44755,,12162.603725,5228.491443
984890265-981157303-64491,2322.789576,2144.987242,249.501400
914057442-992924179-126912,2333.328115,2934.911088,917.428675
913018583-913063538-668469,,1090.123430,1498.719809
...,...,...,...
915789943-915806929-781991,1430.493740,2934.911088,764.202423
917921733-917982368-868081,3666.513032,2934.911088,259.063165
911721961-911764474-496764,9049.611742,3887.203734,1319.760667
914337046-914343372-721294,8061.539773,2934.911088,595.937985


In [38]:
def population_per_store_grouped_by_geo_group(stores_df, age_df, grunnkrets_df):
    """Attach the population-per-store ratio of each store's geographic groups.

    For each of ``grunnkrets_id``, ``district_name`` and ``municipality_name``
    the ``population_per_num_stores`` column produced by
    ``population_per_store`` is left-joined onto the stores.

    Args:
        stores_df: Stores frame with ``store_id`` and ``grunnkrets_id``; also
            forwarded to ``population_per_store``.
        age_df: Age-distribution frame forwarded to ``population_per_store``.
        grunnkrets_df: Geography frame with a ``year`` column and the three
            grouping columns.

    Returns:
        DataFrame indexed by ``store_id`` with one
        ``<geo_group>_population_per_num_stores`` column per geographic group.
    """
    is_2016 = grunnkrets_df["year"] == 2016
    stores_with_geo = stores_df.merge(
        grunnkrets_df[is_2016], how="left", on="grunnkrets_id"
    )
    wanted = ["store_id", "population_per_num_stores"]

    def _feature_for(level):
        ratios = population_per_store(age_df, grunnkrets_df, stores_df, level)
        joined = stores_with_geo.merge(ratios, how="left", on=level)
        return joined[wanted].set_index("store_id").add_prefix(f"{level}_")

    levels = ("grunnkrets_id", "district_name", "municipality_name")
    return pd.concat([_feature_for(level) for level in levels], axis=1)

# Display the population-per-store features for the training stores.
population_per_store_grouped_by_geo_group(
    stores_df=stores_train,
    age_df=grunnkrets_age,
    grunnkrets_df=grunnkrets_norway,
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

Unnamed: 0_level_0,grunnkrets_id_population_per_num_stores,district_name_population_per_num_stores,municipality_name_population_per_num_stores
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
983540538-974187930-44774,3.140000,54.226190,317.109005
987074191-973117734-44755,,70.068493,419.122166
984890265-981157303-64491,26.571429,69.938776,362.469136
914057442-992924179-126912,10.085106,64.856655,386.611111
913018583-913063538-668469,,2074.500000,484.789610
...,...,...,...
915789943-915806929-781991,42.111111,64.856655,382.295775
917921733-917982368-868081,26.473684,64.856655,345.290698
911721961-911764474-496764,20.685185,105.835821,344.080000
914337046-914343372-721294,21.615385,64.856655,523.091549


In [75]:
def population_per_store_type_grouped_by_geo_groups(stores_df, plaace_df, grunnkrets_df, age_df, geo_groups, store_types, agg_string):
    """Compute population per number of stores, per store type and geo group.

    For every ``(geo_group, store_type)`` pair the group's population count is
    divided by the matching store count from
    ``store_types_all_count_by_geo_groups``.

    Args:
        stores_df: Stores frame forwarded to both helper functions.
        plaace_df: Store-type hierarchy frame forwarded to
            ``store_types_all_count_by_geo_groups``.
        grunnkrets_df: Geography frame with a ``year`` column; only the 2016
            rows are used.
        age_df: Age-distribution frame forwarded to
            ``population_count_grouped_by_geo_group``.
        geo_groups: Geographic grouping column names. Each must have a
            ``<geo_group>_population_count`` column in the output of
            ``population_count_grouped_by_geo_group``.
        store_types: Store-type level column names, e.g. ``"lv1_desc"``.
        agg_string: Suffix appended to every produced column name.

    Returns:
        DataFrame holding only the ``<geo_group>_<store_type>_<agg_string>``
        columns, ordered by ``geo_groups`` then ``store_types``. Empty
        ``geo_groups`` or ``store_types`` give a frame with no columns.
    """
    grunnkrets_df_2016 = grunnkrets_df[grunnkrets_df["year"] == 2016]
    num_stores_type_by_geo_group = store_types_all_count_by_geo_groups(
        stores_df,
        plaace_df,
        grunnkrets_df_2016,
        store_types=store_types,
        geo_groups=geo_groups,
    )
    pop_count_by_geo_group = population_count_grouped_by_geo_group(
        stores_df, age_df, grunnkrets_df_2016
    )
    combined_df = num_stores_type_by_geo_group.merge(
        pop_count_by_geo_group, how="left", on="store_id"
    )

    # Track the created columns explicitly instead of slicing from the first
    # new label onwards: the slice relied on column insertion order and raised
    # IndexError when either input list was empty.
    ratio_columns = []
    for geo_group in geo_groups:
        if not store_types:
            break
        population = combined_df[f"{geo_group}_population_count"]
        for store_type in store_types:
            ratio_column = f"{geo_group}_{store_type}_" + agg_string
            combined_df[ratio_column] = population / combined_df[f"{geo_group}_{store_type}"]
            ratio_columns.append(ratio_column)

    return combined_df[ratio_columns]

# Store-type hierarchy levels (lv1_desc .. lv4_desc) and geographic levels.
store = [f"lv{level}_desc" for level in range(1, 5)]
geo = [
    "grunnkrets_id",
    "district_name",
    "municipality_name",
]

# Display the population-per-store-type features for the training stores.
population_per_store_type_grouped_by_geo_groups(
    stores_df=stores_train,
    plaace_df=plaace_hierarchy,
    grunnkrets_df=grunnkrets_norway,
    age_df=grunnkrets_age,
    geo_groups=geo,
    store_types=store,
    agg_string="pop_per_num_store",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0_level_0,grunnkrets_id_lv1_desc_pop_per_num_store,grunnkrets_id_lv2_desc_pop_per_num_store,grunnkrets_id_lv3_desc_pop_per_num_store,grunnkrets_id_lv4_desc_pop_per_num_store,district_name_lv1_desc_pop_per_num_store,district_name_lv2_desc_pop_per_num_store,district_name_lv3_desc_pop_per_num_store,district_name_lv4_desc_pop_per_num_store,municipality_name_lv1_desc_pop_per_num_store,municipality_name_lv2_desc_pop_per_num_store,municipality_name_lv3_desc_pop_per_num_store,municipality_name_lv4_desc_pop_per_num_store
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
983540538-974187930-44774,7.850000,11.214286,157.0,157.0,151.833333,227.750000,4555.000000,4555.000000,1062.063492,1454.565217,66910.0,66910.0
987074191-973117734-44755,,,,,68.200000,91.339286,5115.000000,5115.000000,1129.993209,1440.619048,166391.5,166391.5
984890265-981157303-64491,93.000000,93.000000,372.0,372.0,228.466667,311.545455,3427.000000,3427.000000,1276.521739,1727.058824,29360.0,29360.0
914057442-992924179-126912,33.857143,59.250000,474.0,474.0,219.687861,324.837607,38006.000000,38006.000000,1159.833333,1571.387097,24356.5,24356.5
913018583-913063538-668469,,,,,1037.250000,1659.600000,8298.000000,8298.000000,1392.865672,1904.530612,93322.0,93322.0
...,...,...,...,...,...,...,...,...,...,...,...,...
915789943-915806929-781991,84.222222,505.333333,1516.0,1516.0,147.310078,1027.189189,12668.666667,12668.666667,798.323529,2714.300000,13571.5,13571.5
917921733-917982368-868081,45.727273,503.000000,503.0,503.0,147.310078,1027.189189,12668.666667,12668.666667,707.023810,5939.000000,29695.0,29695.0
911721961-911764474-496764,34.906250,139.625000,1117.0,1117.0,191.648649,709.100000,7091.000000,7091.000000,653.316456,3440.800000,51612.0,51612.0
914337046-914343372-721294,70.250000,140.500000,281.0,281.0,147.310078,1027.189189,12668.666667,12668.666667,781.884211,4951.933333,74279.0,74279.0


In [16]:
def is_mall_only(stores_df):
    """Return the ``is_mall`` flag per store, indexed by ``store_id``.

    Takes the output of ``is_mall`` and removes its ``mall_name`` column.
    """
    flags = is_mall(stores_df)
    return flags.drop(columns=["mall_name"]).set_index("store_id")

# Display the mall flag for the training stores.
is_mall_only(stores_df=stores_train)

Unnamed: 0_level_0,is_mall
store_id,Unnamed: 1_level_1
983540538-974187930-44774,True
987074191-973117734-44755,False
984890265-981157303-64491,True
914057442-992924179-126912,True
913018583-913063538-668469,True
...,...
915789943-915806929-781991,False
917921733-917982368-868081,True
911721961-911764474-496764,True
914337046-914343372-721294,True


In [17]:
def is_chain_only(stores_df):
    """Return the ``is_chain`` flag per store, indexed by ``store_id``.

    Takes the output of ``is_chain`` and removes its ``chain_name`` column.
    """
    flags = is_chain(stores_df)
    return flags.drop(columns=["chain_name"]).set_index("store_id")

# Display the chain flag for the training stores.
is_chain_only(stores_df=stores_train)

Unnamed: 0_level_0,is_chain
store_id,Unnamed: 1_level_1
983540538-974187930-44774,True
987074191-973117734-44755,True
984890265-981157303-64491,True
914057442-992924179-126912,True
913018583-913063538-668469,True
...,...
915789943-915806929-781991,False
917921733-917982368-868081,True
911721961-911764474-496764,True
914337046-914343372-721294,False


In [21]:
# Income features: keyed by store_id, without the "Unnamed: 0" column.
df = simens_df.set_index("store_id")
income_df = df.drop(columns=["Unnamed: 0"])
income_df

# Target frame: store identifier together with its revenue.
id_and_revenue_df = stores_train.loc[:, ["store_id", "revenue"]]
id_and_revenue_df

Unnamed: 0,store_id,revenue
0,983540538-974187930-44774,17.998
1,987074191-973117734-44755,23.828
2,984890265-981157303-64491,16.099
3,914057442-992924179-126912,9.296
4,913018583-913063538-668469,4.528
...,...,...
12854,915789943-915806929-781991,0.088
12855,917921733-917982368-868081,1.816
12856,911721961-911764474-496764,38.225
12857,914337046-914343372-721294,3.642


# Merging of features into one dataset

In [76]:
# Store-type hierarchy levels and geographic levels shared by the features.
store = ["lv1_desc", "lv2_desc", "lv3_desc", "lv4_desc"]
geo = ["grunnkrets_id", "district_name", "municipality_name"]
importance_levels = ["Mangler viktighetsnivå", "Standard holdeplass", "Lokalt knutepunkt", "Regionalt knutepunkt", "Annen viktig holdeplass", "Nasjonalt knutepunkt"]
grunnkrets_df_2016 = grunnkrets_norway[grunnkrets_norway["year"] == 2016]
bus_df = bus_stops_lat_lon(busstops)

# busstop_radius is computed exactly once, further down, from bus_df. An
# earlier call on the raw busstops frame was overwritten before use and only
# repeated the work.
# NOTE(review): the recorded run of this cell ended in a MemoryError for a
# (12859, 68395) float64 array -- presumably a store x bus-stop matrix built
# inside one of the radius/distance helpers; confirm and chunk it there.

# Population / demographic features.
pop_count_df = population_count_grouped_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)
age_dist_df = age_dist_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)
house_hold_dist = household_dist_by_geo_group(stores_train, grunnkrets_household, grunnkrets_norway)
pop_per_store_type = population_per_store_type_grouped_by_geo_groups(stores_train, plaace_hierarchy, grunnkrets_norway, grunnkrets_age, geo_groups=geo, store_types=store, agg_string="pop_per_num_stores")
pop_density = population_density_grouped_by_geo_group(stores_train, grunnkrets_age, grunnkrets_norway)

# Store-level flags.
is_mall_df = is_mall_only(stores_train)
is_chain_df = is_chain_only(stores_train)

# Store-type counts / revenue per geographic group and nearby stores.
store_types_count = store_types_all_count_by_geo_groups(stores_train, plaace_hierarchy, grunnkrets_df_2016, store_types=store, geo_groups=geo)
store_types_revenue = store_types_all_revenue_by_geo_groups(stores_train, plaace_hierarchy, grunnkrets_df_2016, store_types=store, geo_groups=geo)
store_radius = stores_in_radius_by_type(stores_train, plaace_hierarchy, store_types=store, radius=0.1)

# Bus-stop features, both built from the pre-processed bus_df.
busstop_distance = bus_stops_distance_by_importance(stores_train, bus_df, stop_importance_levels=importance_levels)
busstop_radius = bus_stops_in_radius_by_importance(stores_train, bus_df, stop_importance_levels=importance_levels, radius=0.1)

# Left-join every feature frame onto the population counts by store_id.
df = (pop_count_df.merge(age_dist_df, how="left", on="store_id")
    .merge(house_hold_dist, how="left", on="store_id")
    .merge(pop_per_store_type, how="left", on="store_id")
    .merge(pop_density, how="left", on="store_id")
    .merge(is_mall_df, how="left", on="store_id")
    .merge(is_chain_df, how="left", on="store_id")
    .merge(income_df, how="left", on="store_id")
    .merge(store_types_count, how="left", on="store_id")
    .merge(store_types_revenue, how="left", on="store_id")
    .merge(id_and_revenue_df, how="left", on="store_id")
    .merge(store_radius, how="left", on="store_id")
    .merge(busstop_distance, how="left", on="store_id")
    .merge(busstop_radius, how="left", on="store_id")
)

df.to_csv("dataset_train_2.csv")




    

MemoryError: Unable to allocate 6.55 GiB for an array with shape (12859, 68395) and data type float64