In [46]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

In [47]:
stores_train = pd.read_csv('../../data/raw/stores_train.csv')
data_age = pd.read_csv('../../data/raw/grunnkrets_age_distribution.csv')
data_income = pd.read_csv('../../data/raw/grunnkrets_income_households.csv')
data_geography = pd.read_csv('../../data/raw/grunnkrets_norway_stripped.csv') 

In [48]:
def average_revenue_of_chain(dataset_stores):
    "Average revenue of chains in datasett"
    dataset_stores = dataset_stores[(dataset_stores["year"] == 2016)]
    
    return dataset_stores.groupby(['chain_name'])['revenue'].mean()
average_revenue_of_chain(stores_train)

chain_name
3T                    14.179875
AB KLIPP               5.973500
ACE COLLECTION         2.969667
ADAM OG EVA FRISØR     4.208200
ALEX SUSHI            15.575000
                        ...    
ZIZZI                  4.051143
ZOO 1 GRUPPEN          3.505231
ZOOKJEDEN BUDDY        5.296800
ÅPENT BAKERI           5.963000
ØDEGAARD DAME          1.000000
Name: revenue, Length: 306, dtype: float64

In [49]:
def average_revenue_of_mall(dataset_stores):
    "Average revenue of malls in dataset"
    dataset_stores = dataset_stores[(dataset_stores["year"] == 2016)]
    return dataset_stores.groupby(['mall_name'])['revenue'].mean()
average_revenue_of_mall(stores_train)

mall_name
ABC Stormarked        1.468000
Aker Brygge          15.949500
Aksdal Senter         2.721286
Alna Senter          61.234000
Amanda Storsenter     5.030714
                       ...    
Østbanehallen        55.917000
Østerås Senter        4.215500
Østfoldhallene        6.465364
Øyrane Torg           6.160636
Øysenteret            0.128000
Name: revenue, Length: 487, dtype: float64

In [50]:
def population(dataset_age): 
    age_df = dataset_age[(dataset_age["year"] == 2016)]
    population = age_df.drop(["grunnkrets_id", "year"], axis=1).sum(axis=1)
    age_df["population_count"] = population 
    return age_df[["grunnkrets_id", "population_count"]]
population(data_age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0,grunnkrets_id,population_count
1,16013117,944
3,16011203,880
4,3011601,645
8,3010807,528
11,3012307,876
...,...,...
22610,9380111,81
22612,9400101,78
22615,9400104,84
22618,9400108,151


In [51]:
def population_grouped(data_age, data_geography, grouping_element): 
    age_df = population(data_age)
    geography_df = data_geography[data_geography["year"] == 2016]
    population_df = age_df.merge(geography_df, how="left", on ="grunnkrets_id")
    grouped_df = population_df.groupby([grouping_element], as_index = False)["population_count"].sum()
    return grouped_df

In [52]:
def mean_income_per_capita(dataset_age,dataset_income):
    "mean income per capita per grunnkrets"
    age_df = population(data_age)
    income_df = dataset_income[dataset_income["year"] == 2016]
    age_and_income_df = age_df.merge(income_df, how='left', on='grunnkrets_id')
    mean_income = age_and_income_df.drop(['year','singles','couple_without_children','couple_with_children','other_households','single_parent_with_children'],axis=1)
    mean_income['mean_income']=mean_income['all_households']/mean_income['population_count']
    mean_income=mean_income.drop(['all_households'], axis=1)

    return mean_income.he

    
mean_income_per_capita(data_age,data_income)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0,grunnkrets_id,population_count,mean_income
0,16013117,944,441.101695
1,16011203,880,377.045455
2,3011601,645,672.868217
3,3010807,528,713.257576
4,3012307,876,427.853881
...,...,...,...
11299,9380111,81,5674.074074
11300,9400101,78,6275.641026
11301,9400104,84,5827.380952
11302,9400108,151,3241.721854


In [62]:
def mean_income_per_capita_grouped(dataset_income,dataset_geography,grouping_element):
    #gets data from mean_income_per_capita functino
    data_mean_income = mean_income_per_capita(data_age,dataset_income)
    #gets data from geography set and makes sure we only use data for 2016
    geography_df = dataset_geography[dataset_geography["year"] == 2016]
    #gets the data of mean income with the geography data
    mean_income_geo_df = data_mean_income.merge(geography_df, how='left', on='grunnkrets_id')
    #sum the number of people based on grouping element
    grouped_population_df=mean_income_geo_df.groupby([grouping_element], as_index = False)["population_count"].sum()
    #merge this with the grunnkrets to see both total population per selected area and grunnkrets
    total_grouped_df=mean_income_geo_df.merge(grouped_population_df,how='left',on=grouping_element)
    portion_income_df=total_grouped_df
    #find ration of grunnkrets to total population and multiply this with grunnkrets mean income
    portion_income_df['mean_income']=total_grouped_df['mean_income']*total_grouped_df['population_count_x']/total_grouped_df['population_count_y']
    #add these incomes together, should add up to the total mean income for the selected area
    grouped_income_df=portion_income_df.groupby([grouping_element], as_index = False)["mean_income"].sum() 
    return grouped_income_df.head(10)

mean_income_per_capita_grouped(data_income,data_geography,'grunnkrets_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_df["population_count"] = population


Unnamed: 0,grunnkrets_id,mean_income
0,1010102,619.108280
1,1010103,314.308812
2,1010104,550.708215
3,1010105,165.587734
4,1010107,540.751043
...,...,...
11299,20300501,5639.583333
11300,20300502,4627.350427
11301,20300503,2246.473029
11302,20300504,3321.472393


In [None]:
def make_point()
