In [55]:
"""
This is a boilerplate pipeline 'data_processing'
generated using Kedro 0.18.3
"""
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

"""
UTILITY FUNCTIONS
"""

'\nUTILITY FUNCTIONS\n'

In [67]:

def lat_lon_bus_stop(bus_stops_df):
    bus_stops_df['lng_lat'] = bus_stops_df['geometry'].str.extract(
        r'\((.*?)\)')
    bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
        " ", 1, expand=True)
    bus_stops_df[['lon', 'lat']] = bus_stops_df[[
        'lon', 'lat']].apply(pd.to_numeric)
    return bus_stops_df[['busstop_id', 'stopplace_type', 'importance_level', 'side_placement', 'geometry', 'lat', 'lon']]


def store_type_lookup(stores_df, plaace_df, store_ids, match):
    combined_df = stores_df.merge(
        plaace_df, how="inner", on="plaace_hierarchy_id")
    pass


"""
TRANSFORMS
"""


def store_type_in_dataaset(stores_df, plaace_df, lv_desc="lv1_desc"):
    combined_df = stores_df.merge(
        plaace_df, how="inner", on="plaace_hierarchy_id")
    return combined_df[lv_desc].value_counts().rename_axis(lv_desc).reset_index(name='count')


def stores_per_location_by_type(stores_df, plaace_df, grunnkrets_df, geo="district_name", lv_desc="lv1_desc"):
    """
    Number of stores of the same type in a geographic location.
    """
    combined_df = stores_df.merge(plaace_df, how="inner", on="plaace_hierarchy_id").merge(
        grunnkrets_df, how="inner", on="grunnkrets_id")
    return combined_df.groupby(by=[geo, lv_desc])['store_id'].count().reset_index(name='count')


def stores_revenue_per_location_by_type(stores_df, plaace_df, grunnkrets_df, geo="district_name", lv_desc="lv1_desc"):
    """
    Total revenue of stores of the same type in a geographic location.
    """
    combined_df = stores_df.merge(plaace_df, how="inner", on="plaace_hierarchy_id").merge(
        grunnkrets_df, how="inner", on="grunnkrets_id")
    return combined_df.groupby(by=[geo, lv_desc])['revenue'].sum().reset_index(name='total_revenue')


def stores_density_per_location_by_type(stores_df, plaace_df, grunnkrets_df, geo="district_name", lv_desc="lv1_desc"):
    """
    Density of stores of the same type in a geographic location.

    This depends on population
    """
    number_of_stores = stores_per_location_by_type(
        stores_df, plaace_df, grunnkrets_df, geo=geo, lv_desc=lv_desc)['count']
    population = 0
    return number_of_stores / population


def stores_in_radius(stores_df, plaace_df, radius=0.1, by_type=False, category=None):
    """
    Number of stores within a given radius. Can also indicate category to filter.
    """
    mat = cdist(stores_df[['lat', 'lon']],
                stores_df[['lat', 'lon']], metric='euclidean')
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=stores_df['store_id'])

    if by_type == False:
        count = pd.DataFrame(new_df[(new_df < radius) & (
            new_df > 0)].count(axis=1)).reset_index()
        count.rename(columns={0: 'count'}, inplace=True)
        return count

    else:
        combined_df = stores_df.merge(
            plaace_df, how="inner", on="plaace_hierarchy_id")
        test_df = new_df[(new_df < 0.2) & (new_df > 0)]
        store_count = {}

        for index, row in test_df.iterrows():
            nearby_stores = row.dropna().index.values
            index_type = combined_df[combined_df['store_id']
                                     == index][category].values[0]
            number_same = combined_df[(combined_df['store_id'].isin(nearby_stores)) & (
                combined_df[category] == index_type)]['store_id'].count()
            store_count[index] = number_same

        return pd.DataFrame.from_dict(store_count, orient='index', columns=['count']).reset_index()


def closest_bus_stop_cat(stores_df, bus_stops_df, cat="Regionalt knutepunkt"):
    """
    Id and distance of the closest bus stop to all stores.
    """
    bus_stops_df = bus_stops_df[bus_stops_df['importance_level'] == cat]
    mat = cdist(stores_df[['lat', 'lon']],
                bus_stops_df[['lat', 'lon']], metric='euclidean')

    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=bus_stops_df['busstop_id'])

    stores = stores_df.store_id
    closest = new_df.idxmin(axis=1)
    distance = new_df.min(axis=1)

    return pd.DataFrame({'store_id': stores.values, 'closest_bus_stop': closest.values, 'distance': distance.values})


def bus_stops_in_radius(stores_df, bus_stops_df, radius=0.1, cat=None):
    """
    Number of bus stops within a given radius. The importance level of bus stops can be specified.
    """
    if cat is not None:
        bus_stops_df = bus_stops_df[bus_stops_df['importance_level'] == cat]

    mat = cdist(stores_df[['lat', 'lon']],
                bus_stops_df[['lat', 'lon']], metric='euclidean')
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=bus_stops_df['busstop_id'])
    count = pd.DataFrame(new_df[new_df < radius].count(axis=1)).reset_index()
    count.rename(columns={0: 'count'}, inplace=True)
    return count


# This function calculates the population for each grunnkrets
# Returns a df with grunnkretsID in the first column and population_count in the second column

def population(dataset_age):
    age_df = dataset_age[(dataset_age["year"] == 2016)]
    population = age_df.drop(["grunnkrets_id", "year"], axis=1).sum(axis=1)
    age_df["population_count"] = population
    return age_df[["grunnkrets_id", "population_count"]]

# This function calculates the population in a district or municipality, by setting grouping_elemnt either to the district_name or municipality_name


def population_grouped(data_age, data_geography, grouping_element):
    age_df = population(data_age)
    geography_df = data_geography[data_geography["year"] == 2016]
    population_df = age_df.merge(geography_df, how="left", on="grunnkrets_id")
    grouped_df = population_df.groupby([grouping_element], as_index=False)[
        "population_count"].sum()
    return grouped_df

# This function calculates the density (population/area_km2) for the chosen grouping_element


def population_density(age_df, geo_df, grouping_element):
    age_data = population(age_df)
    geo_df = geo_df[geo_df["year"] == 2016]
    combined_df = age_data.merge(geo_df, how="left", on="grunnkrets_id")
    density_df = combined_df.groupby([grouping_element], as_index=False)[
        ["population_count", "area_km2"]].sum()
    density_df["pop_density"] = density_df["population_count"] / \
        density_df["area_km2"]
    return density_df

# This function checks wether or not a store is part of a mall or not


def is_mall(stores_df):
    df = stores_df.copy()
    df["is_mall"] = df["mall_name"].notna()
    return df[["store_id", "mall_name", "is_mall"]]

# This function checks wether or not a store is part of a chain or not


def is_chain(stores_df):
    df = stores_df.copy()
    df["is_chain"] = df["chain_name"].notna()
    return df[["store_id", "chain_name", "is_chain"]]

# This function calculates the population count per number of stores in a geographic region


def population_per_store(age_df, geo_df, stores_df, grouping_element):
    new_geo_df = geo_df[geo_df["year"] == 2016]
    pop_gk = population(age_df)
    pop_df = population_grouped(age_df, geo_df, grouping_element)
    combined_df = pop_gk.merge(stores_df, how="left", on="grunnkrets_id").merge(
        new_geo_df, how="left", on="grunnkrets_id")
    grouped_df = combined_df.groupby([grouping_element], as_index=False)[
        "store_id"].count()
    pop_per_store_df = grouped_df.merge(
        pop_df, how="inner", on=grouping_element)
    pop_per_store_df["population_per_num_stores"] = pop_per_store_df["population_count"] / \
        pop_per_store_df["store_id"]
    return pop_per_store_df

# This function groups the age distrubution (0-90) into 7 buckets with and returns a table which represents the presentages each of these
# buckets corresponds to compared with the total amount of people living in the given geographic region


def age_distrubution(grunnkrets_age_df, geographic_df, grouping_element):
    age_df = grunnkrets_age_df[grunnkrets_age_df["year"] == 2016]
    age_df1 = age_df.drop(["year"], axis=1)
    age_df1["kids"] = age_df1.iloc[:, 1:8].sum(axis=1)
    age_df1["kids+"] = age_df1.iloc[:, 8:14].sum(axis=1)
    age_df1["youths"] = age_df1.iloc[:, 14: 19].sum(axis=1)
    age_df1["youthAdult"] = age_df1.iloc[:, 19:27].sum(axis=1)
    age_df1["adult"] = age_df1.iloc[:, 27:37].sum(axis=1)
    age_df1["adults+"] = age_df1.iloc[:, 37:62].sum(axis=1)
    age_df1["pensinors"] = age_df1.iloc[:, 62:92].sum(axis=1)

    age_df2 = age_df1[["grunnkrets_id", "kids", "kids+",
                       "youths", "youthAdult", "adult", "adults+", "pensinors"]]

    pop_df = population(grunnkrets_age_df)
    geo_df = geographic_df[geographic_df["year"] == 2016]
    new_geo_df = geo_df.drop(["geometry", "area_km2", "year"], axis=1)
    combined_df = age_df2.merge(pop_df, how="inner", on="grunnkrets_id").merge(
        new_geo_df, how="inner", on="grunnkrets_id")
    list_columns = ["kids", "kids+", "youths",
                    "youthAdult", "adult", "adults+", "pensinors"]
    combined_df2 = combined_df.groupby([grouping_element], as_index=False)[
        list_columns].sum()

    pop_gk = population_grouped(
        grunnkrets_age_df, geographic_df, grouping_element)
    new_df = combined_df2.merge(pop_gk, how="inner", on=grouping_element)

    new_df["kids_%"] = new_df["kids"] / new_df["population_count"]
    new_df["kids+_%"] = new_df["kids+"] / new_df["population_count"]
    new_df["youths_%"] = new_df["youths"] / new_df["population_count"]
    new_df["youthAdult_%"] = new_df["youthAdult"] / new_df["population_count"]
    new_df["adult_%"] = new_df["adult"] / new_df["population_count"]
    new_df["adults+_%"] = new_df["adults+"] / new_df["population_count"]
    new_df["pensinors_%"] = new_df["pensinors"] / new_df["population_count"]

    if (grouping_element == "grunnkrets_id"):
        return new_df[["grunnkrets_id", "kids_%", "kids+_%", "youths_%", "youthAdult_%", "adult_%", "adults+_%", "pensinors_%"]]
    else:
        return new_df[[grouping_element, "kids_%", "kids+_%", "youths_%", "youthAdult_%", "adult_%", "adults+_%", "pensinors_%"]]

# This function calculates the total amount of household types based on a geographic area


def household_type_distrubution(geographic_df, household_df, grouping_element):
    house_df = household_df[household_df["year"] == 2016]
    geo_df = geographic_df[geographic_df["year"] == 2016]
    combined_df = geo_df.merge(house_df, how="inner", on="grunnkrets_id")

    list_columns = ["couple_children_0_to_5_years", "couple_children_18_or_above", "couple_children_6_to_17_years",
                    "couple_without_children", "single_parent_children_0_to_5_years", "single_parent_children_18_or_above",
                    "single_parent_children_6_to_17_years", "singles"]

    grouped_df = combined_df.groupby([grouping_element], as_index=False)[
        list_columns].sum()

    return grouped_df


# Simens functions
def average_revenue_of_chain(dataset_stores):
    "Average revenue of chains in datasett"
    dataset_stores = dataset_stores[(dataset_stores["year"] == 2016)]
    return dataset_stores.groupby(['chain_name'])['revenue'].mean()


def average_revenue_of_mall(dataset_stores):
    "Average revenue of malls in dataset"
    dataset_stores = dataset_stores[(dataset_stores["year"] == 2016)]
    return dataset_stores.groupby(['mall_name'])['revenue'].mean()


def mean_income_per_capita(dataset_age, dataset_income):
    "mean income per capita per grunnkrets"
    age_df = population(dataset_age)
    income_df = dataset_income[dataset_income["year"] == 2016]
    age_and_income_df = age_df.merge(income_df, how='left', on='grunnkrets_id')
    mean_income = age_and_income_df.drop(['year', 'singles', 'couple_without_children',
                                         'couple_with_children', 'other_households', 'single_parent_with_children'], axis=1)
    mean_income['mean_income'] = mean_income['all_households'] / \
        mean_income['population_count']
    mean_income = mean_income.drop(['all_households'], axis=1)

    return mean_income


def mean_income_per_capita_grouped(dataset_age, dataset_income, dataset_geography, grouping_element):
    # gets data from mean_income_per_capita functino
    data_mean_income = mean_income_per_capita(dataset_age, dataset_income)
    # gets data from geography set and makes sure we only use data for 2016
    geography_df = dataset_geography[dataset_geography["year"] == 2016]
    # gets the data of mean income with the geography data
    mean_income_geo_df = data_mean_income.merge(
        geography_df, how='left', on='grunnkrets_id')
    # sum the number of people based on grouping element
    grouped_population_df = mean_income_geo_df.groupby(
        [grouping_element], as_index=False)["population_count"].sum()
    # merge this with the grunnkrets to see both total population per selected area and grunnkrets
    total_grouped_df = mean_income_geo_df.merge(
        grouped_population_df, how='left', on=grouping_element)
    portion_income_df = total_grouped_df
    # find ration of grunnkrets to total population and multiply this with grunnkrets mean income
    portion_income_df['mean_income'] = total_grouped_df['mean_income'] * \
        total_grouped_df['population_count_x'] / \
        total_grouped_df['population_count_y']
    # add these incomes together, should add up to the total mean income for the selected area
    grouped_income_df = portion_income_df.groupby(
        [grouping_element], as_index=False)["mean_income"].sum()
    return grouped_income_df


    

In [57]:
#Reading the datasets 
busstops = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/busstops_norway.csv')
grunnkrets_age = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_age_distribution.csv')
grunnkrets_household = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_income_households.csv')
grunnkrets_norway = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/plaace_hierarchy.csv')
sample_submission = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/sample_submission.csv')
stores_extra = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/stores_extra.csv')
stores_test = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/stores_test.csv')
stores_train = pd.read_csv('C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/raw/stores_train.csv')

# Combined dataset based on grunnkrets_id

In [81]:
pop_count = population_grouped(grunnkrets_age, grunnkrets_norway, grouping_element="grunnkrets_id")
pop_count_district = population_grouped(grunnkrets_age, grunnkrets_norway, grouping_element="district_name")
pop_count_municipality = population_grouped(grunnkrets_age, grunnkrets_norway, grouping_element="municipality_name")
pop_count_municipality

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0,municipality_name,population_count
0,Agdenes,1683
1,Alstahaug,7189
2,Alta,19443
3,Alvdal,2202
4,Andebu,5837
...,...,...
417,Østre Toten,14692
418,Øvre Eiker,18002
419,Øyer,4989
420,Øygarden,4800


In [82]:
pop_density = population_density(grunnkrets_age, grunnkrets_norway, grouping_element="grunnkrets_id")
pop_density_district = population_density(grunnkrets_age, grunnkrets_norway, grouping_element="district_name")
pop_density_municipality = population_density(grunnkrets_age, grunnkrets_norway, grouping_element="municipality_name")
pop_density_municipality

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0,municipality_name,population_count,area_km2,pop_density
0,Agdenes,1683,24.198368,69.550145
1,Alstahaug,7189,34.378517,209.113153
2,Alta,19443,44.021857,441.666961
3,Alvdal,2202,30.079660,73.205615
4,Andebu,5837,37.999336,153.607948
...,...,...,...,...
417,Østre Toten,14692,151.675968,96.864389
418,Øvre Eiker,18002,79.545717,226.310112
419,Øyer,4989,38.138484,130.812750
420,Øygarden,4800,10.215312,469.882842


In [60]:
is_mall_df = is_mall(stores_train)
is_mall_df

Unnamed: 0,store_id,mall_name,is_mall
0,983540538-974187930-44774,Magasinet Drammen,True
1,987074191-973117734-44755,,False
2,984890265-981157303-64491,Kuben Hønefoss,True
3,914057442-992924179-126912,Glasshuspassasjen,True
4,913018583-913063538-668469,Tillertorget,True
...,...,...,...
12854,915789943-915806929-781991,,False
12855,917921733-917982368-868081,CC Gjøvik,True
12856,911721961-911764474-496764,Strømmen Storsenter,True
12857,914337046-914343372-721294,Bystasjonen,True


In [61]:
is_chain_df = is_chain(stores_train)
is_chain_df

Unnamed: 0,store_id,chain_name,is_chain
0,983540538-974187930-44774,MCDONALDS,True
1,987074191-973117734-44755,MCDONALDS,True
2,984890265-981157303-64491,BURGER KING,True
3,914057442-992924179-126912,BURGER KING,True
4,913018583-913063538-668469,BURGER KING,True
...,...,...,...
12854,915789943-915806929-781991,,False
12855,917921733-917982368-868081,GULATING GRUPPEN,True
12856,911721961-911764474-496764,GULATING GRUPPEN,True
12857,914337046-914343372-721294,,False


In [83]:
pop_per_store_df = population_per_store(grunnkrets_age, grunnkrets_norway, stores_train, grouping_element="grunnkrets_id")
pop_per_store_district_df = population_per_store(grunnkrets_age, grunnkrets_norway, stores_train, grouping_element="district_name")
pop_per_store_municipality_df = population_per_store(grunnkrets_age, grunnkrets_norway, stores_train, grouping_element="municipality_name")
pop_per_store_municipality_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

Unnamed: 0,municipality_name,store_id,population_count,population_per_num_stores
0,Agdenes,1,1683,1683.000000
1,Alstahaug,22,7189,326.772727
2,Alta,56,19443,347.196429
3,Alvdal,3,2202,734.000000
4,Andebu,6,5837,972.833333
...,...,...,...,...
417,Østre Toten,30,14692,489.733333
418,Øvre Eiker,39,18002,461.589744
419,Øyer,10,4989,498.900000
420,Øygarden,8,4800,600.000000


In [84]:
age_dist_df = age_distrubution(grunnkrets_age, grunnkrets_norway, grouping_element="grunnkrets_id")
age_dist_district_df = age_distrubution(grunnkrets_age, grunnkrets_norway, grouping_element="district_name")
age_dist_municipality_df = age_distrubution(grunnkrets_age, grunnkrets_norway, grouping_element="municipality_name")
age_dist_municipality_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

Unnamed: 0,municipality_name,kids_%,kids+_%,youths_%,youthAdult_%,adult_%,adults+_%,pensinors_%
0,Agdenes,0.062389,0.061200,0.074866,0.095068,0.090909,0.324421,0.291147
1,Alstahaug,0.073585,0.068716,0.065656,0.108082,0.123105,0.324663,0.236194
2,Alta,0.092475,0.082086,0.076840,0.123798,0.142982,0.310086,0.171733
3,Alvdal,0.089010,0.075386,0.070391,0.098547,0.101726,0.332879,0.232062
4,Andebu,0.085489,0.076238,0.065787,0.102621,0.140312,0.336132,0.193421
...,...,...,...,...,...,...,...,...
417,Østre Toten,0.070038,0.064457,0.061938,0.095971,0.107746,0.337122,0.262728
418,Øvre Eiker,0.081380,0.074992,0.063660,0.095267,0.132485,0.336129,0.216087
419,Øyer,0.063941,0.063941,0.068551,0.110643,0.111445,0.338344,0.243135
420,Øygarden,0.098333,0.085000,0.069167,0.102292,0.117500,0.326875,0.200833


In [85]:
house_type_dist = household_type_distrubution(grunnkrets_norway,grunnkrets_household, grouping_element="grunnkrets_id")
house_type_dist_district = household_type_distrubution(grunnkrets_norway,grunnkrets_household, grouping_element="district_name")
house_type_dist_municipality = household_type_distrubution(grunnkrets_norway,grunnkrets_household, grouping_element="municipality_name")
house_type_dist_municipality

Unnamed: 0,municipality_name,couple_children_0_to_5_years,couple_children_18_or_above,couple_children_6_to_17_years,couple_without_children,single_parent_children_0_to_5_years,single_parent_children_18_or_above,single_parent_children_6_to_17_years,singles
0,Agdenes,233,181,315,424,34,56,60,322
1,Alstahaug,1180,506,1477,1788,165,175,322,1435
2,Alta,3995,1525,4283,3580,501,673,1209,3511
3,Alvdal,435,205,523,460,53,56,77,452
4,Andebu,1140,410,1273,1410,99,135,252,1000
...,...,...,...,...,...,...,...,...,...
417,Østre Toten,2296,1283,2894,3776,162,445,723,2823
418,Øvre Eiker,3255,1490,3834,4072,270,573,875,3246
419,Øyer,674,472,1050,1270,63,163,220,942
420,Øygarden,1052,347,1132,1052,108,144,162,706


In [101]:
combined1 = pop_count.merge(pop_density, how = "left", on = "grunnkrets_id")
combined2 = combined1.merge(pop_per_store_df, how ="left", on = "grunnkrets_id")
combined3 = combined2.merge(age_dist_df, how ="left", on = "grunnkrets_id")
combined4 = house_type_dist.merge(combined3,  how = "left", on = "grunnkrets_id")
combined4.columns

grunnkrets_id_df = combined4.drop(["population_count_y", "population_count_x", "area_km2"], axis = 1)
grunnkrets_id_df

grunnkrets_id_df.csv("grunnkrets_id_df")

AttributeError: 'DataFrame' object has no attribute 'csv'

# Combined dataset based on district_name 

In [80]:
district_df_1 = (pop_count_district
.merge(pop_density_district, how = "left", on = "district_name")
.merge(pop_per_store_district_df, how = "left", on = "district_name")
.merge(age_dist_district_df, how = "left", on = "district_name")
.merge(house_type_dist_district, how = "left", on ="district_name"))

district_df = district_df_1.drop(["population_count_x", "population_count_y", "area_km2"], axis = 1)
district_df


Unnamed: 0,district_name,pop_density,store_id,population_count,population_per_num_stores,kids_%,kids+_%,youths_%,youthAdult_%,adult_%,adults+_%,pensinors_%,couple_children_0_to_5_years,couple_children_18_or_above,couple_children_6_to_17_years,couple_without_children,single_parent_children_0_to_5_years,single_parent_children_18_or_above,single_parent_children_6_to_17_years,singles
0,Agdenes,69.550145,1,1683,1683.000000,0.062389,0.061200,0.074866,0.095068,0.090909,0.324421,0.291147,233,181,315,424,34,56,60,322
1,Alfaset,5952.965088,16,18998,1187.375000,0.096168,0.072113,0.054006,0.088746,0.152648,0.338983,0.197337,4259,1399,3398,3470,355,500,835,4801
2,Algarheim,289.563424,0,5780,inf,0.108824,0.097578,0.076471,0.106574,0.159516,0.338927,0.112111,1368,314,1622,990,127,170,391,825
3,Alstad,3108.781543,5,6400,1280.000000,0.097812,0.091719,0.083906,0.124219,0.150156,0.335938,0.116250,1374,443,1853,1160,112,172,411,873
4,Alstahaug/Tjøtta,33.401471,1,817,817.000000,0.042840,0.066095,0.080783,0.080783,0.094247,0.341493,0.293758,97,79,234,274,0,18,9,208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1477,Øymark,115.672993,5,2587,517.400000,0.063780,0.065713,0.068806,0.093931,0.090452,0.322768,0.294550,397,170,519,720,14,34,93,537
1478,Øyrekken,1305.617458,0,138,inf,0.050725,0.043478,0.057971,0.086957,0.050725,0.333333,0.376812,13,16,26,86,7,6,12,86
1479,Øystese,213.159287,10,2719,271.900000,0.087900,0.082751,0.061052,0.103347,0.107760,0.322913,0.234277,561,248,626,564,27,64,71,437
1480,Øystre Slidre nord,80.460331,17,1569,92.294118,0.060548,0.061185,0.052900,0.088591,0.152326,0.334608,0.249841,218,130,315,334,13,41,52,362


# Combined dataset based on municipality_name 


In [88]:
municipality_df1 =(pop_count_municipality
.merge(pop_density_municipality, how = "left", on= "municipality_name")
.merge(pop_per_store_municipality_df, how ="left", on = "municipality_name")
.merge(age_dist_municipality_df, how = "left", on ="municipality_name")
.merge(house_type_dist_municipality, how = "left", on = "municipality_name"))

municipality_df = municipality_df1.drop(["population_count_x", "population_count_y", "area_km2"], axis = 1)
municipality_df



Unnamed: 0,municipality_name,pop_density,store_id,population_count,population_per_num_stores,kids_%,kids+_%,youths_%,youthAdult_%,adult_%,adults+_%,pensinors_%,couple_children_0_to_5_years,couple_children_18_or_above,couple_children_6_to_17_years,couple_without_children,single_parent_children_0_to_5_years,single_parent_children_18_or_above,single_parent_children_6_to_17_years,singles
0,Agdenes,69.550145,1,1683,1683.000000,0.062389,0.061200,0.074866,0.095068,0.090909,0.324421,0.291147,233,181,315,424,34,56,60,322
1,Alstahaug,209.113153,22,7189,326.772727,0.073585,0.068716,0.065656,0.108082,0.123105,0.324663,0.236194,1180,506,1477,1788,165,175,322,1435
2,Alta,441.666961,56,19443,347.196429,0.092475,0.082086,0.076840,0.123798,0.142982,0.310086,0.171733,3995,1525,4283,3580,501,673,1209,3511
3,Alvdal,73.205615,3,2202,734.000000,0.089010,0.075386,0.070391,0.098547,0.101726,0.332879,0.232062,435,205,523,460,53,56,77,452
4,Andebu,153.607948,6,5837,972.833333,0.085489,0.076238,0.065787,0.102621,0.140312,0.336132,0.193421,1140,410,1273,1410,99,135,252,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,Østre Toten,96.864389,30,14692,489.733333,0.070038,0.064457,0.061938,0.095971,0.107746,0.337122,0.262728,2296,1283,2894,3776,162,445,723,2823
418,Øvre Eiker,226.310112,39,18002,461.589744,0.081380,0.074992,0.063660,0.095267,0.132485,0.336129,0.216087,3255,1490,3834,4072,270,573,875,3246
419,Øyer,130.812750,10,4989,498.900000,0.063941,0.063941,0.068551,0.110643,0.111445,0.338344,0.243135,674,472,1050,1270,63,163,220,942
420,Øygarden,469.882842,8,4800,600.000000,0.098333,0.085000,0.069167,0.102292,0.117500,0.326875,0.200833,1052,347,1132,1052,108,144,162,706


# Stores_dataset combined with is_mall and is_chain 

In [100]:
stores_df = stores_train
is_mall_df = is_mall(stores_df).drop(["mall_name"], axis = 1)
is_chain_df = is_chain(stores_df).drop(["chain_name"], axis = 1)

new_stores_df = stores_df.merge(is_mall_df, how = "left", on = "store_id").merge(is_chain_df, how = "left", on = "store_id")
new_stores_df



Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,mall_name,revenue,is_mall,is_chain
0,983540538-974187930-44774,2016,MCDONALD'S BRAGERNES TORG MAGASINET,1.1.1.0,Hamburger restaurants,6020303,BRAGERNES TORG 13,59.743104,10.204928,MCDONALDS,Magasinet Drammen,17.998,True,True
1,987074191-973117734-44755,2016,MCDONALD'S KLINGENBERGGATA,1.1.1.0,Hamburger restaurants,3010306,,59.913759,10.734031,MCDONALDS,,23.828,False,True
2,984890265-981157303-64491,2016,BURGER KING HØNEFOSS,1.1.1.0,Hamburger restaurants,6050102,KONG RINGS GATE 1,60.164751,10.254656,BURGER KING,Kuben Hønefoss,16.099,True,True
3,914057442-992924179-126912,2016,BURGER KING GLASSHUSPASSASJEN,1.1.1.0,Hamburger restaurants,18040102,STORGATA 12,67.283669,14.379796,BURGER KING,Glasshuspassasjen,9.296,True,True
4,913018583-913063538-668469,2016,BURGER KING TILLERTORGET,1.1.1.0,Hamburger restaurants,16017414,,63.358068,10.374832,BURGER KING,Tillertorget,4.528,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854,915789943-915806929-781991,2016,MEIERIGÅRDEN BRYGGERIUTSALG,2.8.11.2,Beer and soda shop,7010705,THUEGATA 2,59.416276,10.480970,,,0.088,False,False
12855,917921733-917982368-868081,2016,GULATING ØLUTSALG CC GJØVIK,2.8.11.2,Beer and soda shop,5020406,,60.799991,10.693635,GULATING GRUPPEN,CC Gjøvik,1.816,True,True
12856,911721961-911764474-496764,2016,GULATING ØLUTSALG STRØMMEN,2.8.11.2,Beer and soda shop,2310803,STØPERIVEIEN 6,59.946562,11.007659,GULATING GRUPPEN,Strømmen Storsenter,38.225,True,True
12857,914337046-914343372-721294,2016,DET GODE BRYGG,2.8.11.2,Beer and soda shop,11020113,VÅGSGATA 16,58.850261,5.735674,,Bystasjonen,3.642,True,False
