In [1]:
from jupyter_dash import JupyterDash
import dash
from dash import dcc
from dash import html
import pandas as pd
import plotly.express as px
from dash.dependencies import Input, Output
import psycopg2
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
conn = psycopg2.connect(database="", user="", password="")
cur = conn.cursor()
def sql(sql: str):
    return pd.read_sql(sql, conn)
scaler = MinMaxScaler()
df_postal_lat_lon = sql("SELECT * FROM socio;")[['postal', 'latitude', 'longitude']]
edf = sql("SELECT * FROM socio;")

def db_query():
    
    soc = sql("""
    SELECT b.business_id, b.name, b.city, b.state, b.stars, c.categories, s.* FROM socio s
    JOIN business b ON CAST(s.postal AS VARCHAR) = b.postal
    JOIN has_categorie h ON b.business_id = h.business_id
    JOIN categorie c ON h.categorie_id = c.categorie_id;
    """)
    soc["unemployment_rate"] = soc["unemployment_rate"].astype(float)

    return soc

# Restaurants pro postal code
def count_restaurants_per_postal(soc):
  
    soc_res = soc.groupby(by = "postal").agg({"business_id" : "count"})
    soc_res = soc_res.reset_index()
    soc_res.columns = ["postal", "restaurant_count"]
    soc_res = soc_res.loc[soc_res["restaurant_count"] > 5]
    
    return soc_res

# Korrelationen zwischen Anteil der Restaurants mit bestimmten Kategorien und Bevölkerungsmerkmal
def corr_socio_category(soc, soc_res, cat_list):
    soc_cat = soc[soc["categories"].isin(cat_list)]
    ''' socios = ["unemployment_rate", "mean_income", "native_ratio", "asian_ratio", "pacific_ratio", "other_ratio", "hispanic_ratio", "white_ratio",
               "below_poverty_ratio", "above_poverty_ratio", "no_highschool_ratio", "highschool_ratio", 
               "college_ratio", "bachelor_ratio", "male_ratio", "female_ratio"] '''
    socios = ["mean_income", "native_ratio", "asian_ratio", "pacific_ratio", "other_ratio", "hispanic_ratio", "white_ratio", "college_ratio"]
    #socios = ["hispanic_ratio"]
    dict_corr_coeff = {}
    
    for socio in socios:
        
        soc_cat_agg = soc_cat.groupby(by = "postal").agg({"categories" : ['count'] , socio : ['max']})
        soc_cat_agg = soc_cat_agg.reset_index()
        soc_cat_agg.columns = ["postal", "categories", socio]
    
        soc_cat_rate = pd.merge(soc_res, soc_cat_agg, on = "postal")
        soc_cat_rate["categories_ratio"] = soc_cat_rate.apply(lambda row: row.categories/row.restaurant_count, axis = 1)
        corr_coeff = soc_cat_rate["categories_ratio"].corr(soc_cat_rate[socio])
        
        dict_corr_coeff[socio] = corr_coeff
        
    df_corr_coeff = pd.DataFrame.from_dict(dict_corr_coeff, orient='index').reset_index()
    df_corr_coeff.columns = ['feature', 'coeff']
    
    return df_corr_coeff

def calculate_score(corr_coeff):
    weighted_matrix = corr_coeff["coeff"].to_numpy()
    
    '''    soc_postal = sql("""SELECT postal, mean_income, native_ratio, asian_ratio, 
                        pacific_ratio, other_ratio, hispanic_ratio, white_ratio,
                        highschool_ratio, 
                        college_ratio, bachelor_ratio, male_ratio, female_ratio
                        FROM socio""")'''
    soc_postal = sql("""SELECT postal, mean_income, native_ratio, asian_ratio, 
                        pacific_ratio, other_ratio, hispanic_ratio, white_ratio,
                        college_ratio
                        FROM socio""")
    #soc_postal = sql("SELECT postal, hispanic_ratio FROM socio")
    
    soc_postal.set_index("postal", inplace = True)
    soc_postal_normalize = (soc_postal - soc_postal.mean())/soc_postal.std()
    
    weighted_soc = soc_postal_normalize * weighted_matrix
    weighted_soc["total_score"]  = weighted_soc.sum(axis = 1)
    weighted_soc_sort = weighted_soc.sort_values(by = "total_score", ascending = False)
    weighted_soc_sort = pd.merge(weighted_soc_sort, df_postal_lat_lon, on = 'postal')
    
    return weighted_soc_sort

app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Create server variable with Flask server object for use with gunicorn
server = app.server

app.layout = html.Div([
    html.H1(children='Optimal location finder',style={
            'textAlign': 'center', "color": "white"}),
    html.Div(children='Auswahl',style={"color": "red"}),
    dcc.RadioItems( id = 'radio',
    options=[
        {'label': 'Socio Data', 'value': 'SD'},
        {'label': 'Optimal location finder', 'value': 'LF'}
    ],
    value='SD', style={'color': 'green'}),
    dcc.Dropdown(id='socio_cat',options=[{'label': i, 'value': i} for i in list(edf)[:-2][1:]], value ="unemployment_rate"),
    dcc.Graph(id='scatterplot'),
    dcc.Input(id="input_own_category", type="text", value="Fast Food", style={'marginRight':'10px'}, debounce = True),
    dcc.Dropdown(id='filter_percent',options=[{'label': i, 'value': i/100} for i in list(range(0,101))], value =0)

])


@app.callback(
    Output('scatterplot', 'figure'),
    Input('input_own_category', 'value'),
    Input('socio_cat', 'value'),
    Input('radio', 'value'),
    Input('filter_percent', 'value'))
def update_figure(selected_own_cat, selected_socio_cat, radio_choice, filter_choice):
    
    if radio_choice == 'LF':
        soc = db_query()
        soc_res = count_restaurants_per_postal(soc)
        #categories = [selected_own_cat]
        categories = selected_own_cat.split(",")
        categories = [x.strip() for x in categories]
        try:
            corr_coeff = corr_socio_category(soc, soc_res, categories)
            score = calculate_score(corr_coeff)
            score = score.reset_index()
            score[['total_score']] = scaler.fit_transform(score[['total_score']])
            score = score[score['total_score'] > filter_choice]
            fig = px.scatter_mapbox(score,
                                lat='latitude',
                                lon='longitude',
                                zoom=1,
                                color = "total_score",
                                color_continuous_scale=px.colors.sequential.Viridis)
            fig.update_layout(mapbox_style="open-street-map")
            fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
            print("score for " + selected_own_cat)

            return fig
        except:
            print("There is no category: " + selected_own_cat + " !")
    elif radio_choice == 'SD':
        edf = sql("SELECT * FROM socio;")
        edf["unemployment_rate"] = edf["unemployment_rate"].astype(float)
        edf = edf[edf[selected_socio_cat] > filter_choice]
        fig = px.scatter_mapbox(edf,
                    lat='latitude',
                    lon='longitude',
                    zoom=1,
                    color = selected_socio_cat,
                    color_continuous_scale=px.colors.sequential.Viridis)
        fig.update_layout(mapbox_style="open-street-map")
        fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
        return fig


In [4]:
app.run_server(host = '141.100.70.96', mode='inline')


The 'environ['werkzeug.server.shutdown']' function is deprecated and will be removed in Werkzeug 2.1.



score for Wine Tasting Room
score for Fast Food


In [12]:
list(range(1,11))

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [31]:
soc = db_query()
soc_res = count_restaurants_per_postal(soc)
categories = ["Fast Food"]
corr_coeff = corr_socio_category(soc, soc_res, categories)
score = calculate_score(corr_coeff)
score = score.reset_index()

In [26]:
score

Unnamed: 0,index,postal,mean_income,native_ratio,asian_ratio,pacific_ratio,other_ratio,hispanic_ratio,white_ratio,college_ratio,total_score,latitude,longitude
0,0,32830,0.493658,0.010829,0.053677,0.006136,-0.018410,0.369645,0.448910,1.859345,3.223790,28.3822,-81.5690
1,1,92267,0.493658,0.010829,0.053677,0.006136,-0.018410,-0.034194,0.448910,1.859345,2.819950,34.2872,-114.1430
2,2,18225,0.493658,0.010829,0.053677,0.006136,0.844317,0.369645,0.448910,0.513209,2.740381,40.9806,-75.9713
3,3,61027,0.493658,0.010829,0.053677,0.006136,-0.018410,-0.034194,0.227824,1.859345,2.598865,42.3320,-89.7612
4,4,32530,0.493658,0.010829,0.053677,0.006136,-0.018410,-0.034194,0.076581,1.859345,2.447622,30.5986,-87.0315
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32634,32634,40025,-2.118649,0.010829,-0.385983,0.006136,-0.018410,-0.034194,-0.017524,-0.397449,-2.955243,38.2997,-85.6487
32635,32635,7078,-2.137929,0.010829,-0.581259,0.006136,-0.003143,-0.022255,0.062502,-0.373232,-3.038351,40.7368,-74.3271
32636,32636,11765,-2.477292,0.010829,-0.271644,0.006136,-0.004127,-0.004107,0.004126,-0.329204,-3.065283,40.8857,-73.5526
32637,32637,96729,0.122350,0.005304,-0.082442,-4.134073,-0.016465,-0.016893,0.443097,0.091431,-3.587691,21.1730,-157.0791


In [34]:
score = score[score['total_score'] == 0]


In [35]:
score

Unnamed: 0,index,postal,mean_income,native_ratio,asian_ratio,pacific_ratio,other_ratio,hispanic_ratio,white_ratio,college_ratio,total_score,latitude,longitude
