In [1]:
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [3]:
conn = psycopg2.connect(database="db_sync", user="patrick", password="patrick")

In [4]:
def sql(sql: str):
    return pd.read_sql(sql, conn)

In [5]:
def db_query():
    
    soc = sql("""
    SELECT b.business_id, b.name, b.city, b.state, b.stars, c.categories, s.* FROM socio s
    JOIN business b ON CAST(s.postal AS VARCHAR) = b.postal
    JOIN has_categorie h ON b.business_id = h.business_id
    JOIN categorie c ON h.categorie_id = c.categorie_id;
    """)
    
    return soc

In [5]:
# Restaurants pro postal code
def count_restaurants_per_postal(soc):
  
    soc_res = soc.groupby(by = "postal").agg({"business_id" : "count"})
    soc_res = soc_res.reset_index()
    soc_res.columns = ["postal", "restaurant_count"]
    soc_res = soc_res.loc[soc_res["restaurant_count"] > 5]
    
    return soc_res

In [20]:
# Korrelationen zwischen Anteil der Restaurants mit bestimmten Kategorien und Bevölkerungsmerkmal
def corr_socio_category(soc, soc_res, cat_list):
    soc_cat = soc[soc["categories"].isin(cat_list)]
    socios = ["mean_income", "unemployment_rate", "native_ratio", "asian_ratio", "pacific_ratio", "other_ratio", "hispanic_ratio", "white_ratio",
               "below_poverty_ratio", "above_poverty_ratio", "no_highschool_ratio", "highschool_ratio", 
               "college_ratio", "bachelor_ratio", "male_ratio", "female_ratio"]
    dict_corr_coeff = {}
    
    for socio in socios:
        
        soc_cat_agg = soc_cat.groupby(by = "postal").agg({"categories" : ['count'] , socio : ['max']})
        soc_cat_agg = soc_cat_agg.reset_index()
        soc_cat_agg.columns = ["postal", "categories", socio]
    
        soc_cat_rate = pd.merge(soc_res, soc_cat_agg, on = "postal")
        soc_cat_rate["categories_ratio"] = soc_cat_rate.apply(lambda row: row.categories/row.restaurant_count, axis = 1)
    
        if socio == 'unemployment_rate':
            soc_cat_rate[[socio]] = soc_cat_rate[[socio]].astype(float)
            
        corr_coeff = soc_cat_rate["categories_ratio"].corr(soc_cat_rate[socio])
        
        dict_corr_coeff[socio] = corr_coeff
        
    df_corr_coeff = pd.DataFrame.from_dict(dict_corr_coeff, orient='index').reset_index()
    df_corr_coeff.columns = ['feature', 'coeff']
    
    return df_corr_coeff

In [7]:
def calculate_score(corr_coeff):
    weighted_matrix = corr_coeff["coeff"].to_numpy()
    soc_postal = sql("""SELECT postal, mean_income, unemployment_rate, native_ratio, asian_ratio, 
                        pacific_ratio, other_ratio, hispanic_ratio, white_ratio,
                        below_poverty_ratio, above_poverty_ratio, no_highschool_ratio, highschool_ratio, 
                        college_ratio, bachelor_ratio, male_ratio, female_ratio
                        FROM socio""")
    soc_postal.set_index("postal", inplace = True)
    soc_postal_normalize = (soc_postal - soc_postal.mean())/soc_postal.std()
    
    weighted_soc = soc_postal_normalize * weighted_matrix
    weighted_soc["total_score"]  = weighted_soc.sum(axis = 1)
    weighted_soc_sort = weighted_soc.sort_values(by = "total_score", ascending = False)
    
    return weighted_soc_sort

In [21]:
soc = db_query()
soc_res = count_restaurants_per_postal(soc)
categories = ["Thai", "Taiwanese", "Pan Asian", "Sushi Bars"]
corr_coeff = corr_socio_category(soc, soc_res, categories)
corr_coeff

Unnamed: 0,feature,coeff
0,mean_income,0.057123
1,unemployment_rate,-0.023846
2,native_ratio,-0.001141
3,asian_ratio,0.11219
4,pacific_ratio,-0.001302
5,other_ratio,0.131392
6,hispanic_ratio,0.007221
7,white_ratio,0.062132
8,below_poverty_ratio,-0.06308
9,above_poverty_ratio,-0.046696


In [22]:
score = calculate_score(corr_coeff)

In [23]:
score = score.reset_index()

In [28]:
from sklearn.preprocessing import MinMaxScaler

In [34]:
scaler = MinMaxScaler()
score[['total_score']] = scaler.fit_transform(score[['total_score']])

In [35]:
score

Unnamed: 0,postal,above_poverty_ratio,asian_ratio,bachelor_ratio,below_poverty_ratio,college_ratio,female_ratio,highschool_ratio,hispanic_ratio,male_ratio,mean_income,native_ratio,no_highschool_ratio,other_ratio,pacific_ratio,unemployment_rate,white_ratio,total_score
0,21705,-0.247546,0.00945,0.001509,1.250296,0.003244,1.136042,0.058406,-0.034179,0.263339,0.096686,-0.009678,0.098962,0.022267,-0.00004,,0.193502,1.000000
1,87032,-0.154402,0.00945,-0.000518,0.941914,-0.000402,1.136042,-0.014928,0.369486,0.263339,0.096686,-0.009678,0.098962,-0.211871,-0.00004,,0.193502,0.977843
2,41713,-0.247546,0.00945,0.001509,1.250296,0.003244,1.136042,0.058406,-0.034179,0.263339,0.096686,-0.009678,0.098962,0.022267,-0.00004,,-0.053408,0.956136
3,31081,-0.247546,0.00945,0.001509,1.250296,0.003244,1.136042,0.058406,-0.034179,0.263339,0.096686,-0.009678,0.098962,0.022267,-0.00004,,-0.053408,0.956136
4,26228,-0.247546,0.00945,0.001509,1.250296,0.003244,1.136042,0.058406,-0.034179,0.263339,0.096686,-0.009678,0.098962,0.022267,-0.00004,,-0.053408,0.956136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32630,36267,0.167552,0.00945,0.001509,-0.124015,0.003244,-0.603102,-0.014928,-0.034179,-0.477360,0.096686,-0.009678,-1.377848,0.022267,-0.00004,,-0.053408,0.069793
32631,43032,0.167552,0.00945,0.001509,-0.124015,0.003244,-0.603102,-0.014928,-0.034179,-0.477360,0.096686,-0.009678,-1.377848,0.022267,-0.00004,,-0.053408,0.069793
32632,40979,0.167552,0.00945,0.001509,-0.124015,0.003244,-0.603102,-0.014928,-0.034179,-0.477360,0.096686,-0.009678,-1.377848,0.022267,-0.00004,,-0.053408,0.069793
32633,40816,0.167552,0.00945,0.001509,-0.124015,0.003244,-0.603102,-0.014928,-0.034179,-0.477360,0.096686,-0.009678,-1.377848,0.022267,-0.00004,,-0.053408,0.069793


In [2]:
"test".to_list()

AttributeError: 'str' object has no attribute 'to_list'

In [15]:
[x.strip() for x in 'Mexican, Fast Food '.split(',')]

['Mexican', 'Fast Food']