# Libraries, functions, and globals.

In [1]:
import mysql.connector
import csv
import numpy as np
import pandas as pd
import math


# --- MySQL connection settings ---
# NOTE(review): credentials are hard-coded (root user, empty password); consider
# reading them from the environment before sharing this notebook.
host, un, pw = "localhost", "root", ""
db_name = "yelp"

# --- Location of the business profile CSV ---
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
fp = "C:/Users/Tolis/Documents/Data Analytics Cource/CKME136 X10/Project/data/final/profiles"
fn = "business_profiles.csv"
file_info = "/".join((fp, fn))

def mysql_result_to_df(result, cursor):
    """Build a DataFrame from fetched rows, using the cursor's column names.

    Parameters
    ----------
    result : sequence of row tuples
        Rows previously fetched from ``cursor`` (e.g. via ``fetchall()``).
    cursor : DB-API cursor
        The cursor the rows came from; the first item of each entry in
        ``cursor.description`` is used as the column name.

    Returns
    -------
    pandas.DataFrame
        One row per fetched row, one column per described field.
    """
    # Use the arguments that were passed in rather than notebook-level globals,
    # so the helper works for any cursor/result pair.
    # ``description`` is None when the statement produced no result set.
    field_names = [column[0] for column in (cursor.description or ())]
    return pd.DataFrame(result, columns=field_names)


def df_to_csv(df, fp, ext=".csv", na_rep=""):
    """Write ``df`` to ``fp + ext`` as a comma-separated UTF-8 file.

    The header row is written, the index is not, and missing values are
    rendered as ``na_rep``. Any exception raised while writing is caught and
    reported with ``print`` instead of being propagated.
    """
    try:
        df.to_csv(
            fp + ext,
            sep=",",
            header=True,
            index=False,
            na_rep=na_rep,
            doublequote=True,
            encoding="utf-8",
        )
    except Exception as err:
        print("Error: {}".format(str(err)))

# Load user table to obtain a list of users, businesses, and a binary class column (like/dislike)
<p>
The like/dislike field is calculated from the stars column in the review table (which represents a rating from 1 to 5).
<br>The like/dislike field contains two values: 1 for a like, and -1 for a dislike.
<br>To determine whether a user likes or dislikes a business, a threshold was created. If a user rated a business at least 3 stars, a like (1) is returned. Otherwise (fewer than 3 stars), a dislike (-1) is returned.
</p>

### Connect to mysql and create df.
<p><b>Note:</b> a limit of 500 rows was applied to the SQL query. This was done only to speed up my assignment submission, and the limit will be removed later. The dataset contains 1+ million rows and would have taken too long to process, which might have delayed my submission.</p>

In [2]:
# Open the connection using the settings defined in the configuration cell.
mydb = mysql.connector.connect(
  host=host,
  user=un,
  passwd=pw,
  database=db_name
)

# Map the 1-5 star rating onto a binary class: 1 (like) for 3+ stars, else -1.
q = """
SELECT user_id, business_id,
CASE
    WHEN stars >= 3 THEN 1
    ELSE -1
END AS "like/dislike"
FROM review LIMIT 500;
"""

# try/finally guarantees the cursor and connection are released even when the
# query or the DataFrame conversion raises.
try:
    mycursor = mydb.cursor()
    try:
        mycursor.execute(q)
        myresult = mycursor.fetchall()
        user_rating_df = mysql_result_to_df(myresult, mycursor)
    finally:
        mycursor.close()
finally:
    mydb.close()

print(user_rating_df.head())

                  user_id             business_id  like/dislike
0  zyp8SaRnZ94sWZpLrifS1Q  l6xZVTEtdZAvNpL1JhYGuw             1
1  697iJkhX1mkVF9RNhn114Q  XiXu6WHbDoopKpeg7DfKdQ             1
2  E6Aoz-3s4avfweIjziHjbA  cTbFJzHQzFSX-z3JF4abKQ             1
3  iVSuN8PrtKVtLzhNiu23uA  OumGHdbdp7WgyYMhcAdjhw            -1
4  iUdH5Sats3cF46CePD6CYw  vNoyICtClJd3pcMG96CXsA             1


# Load business profiles csv and normalize
<p>*Normalization is done because the fields are binary (1/0), which can cause a division-by-zero error in the TF formula.
<br>The formula used to normalize the binary fields is: bin_val (1/0) / sqrt(row_sum),
<br>where row_sum equals the total number of features (1's) that a business has.</p>

In [3]:
def normalize_bin_data(row):
    """Scale a row of binary (1/0) feature flags by 1 / sqrt(row sum).

    Parameters
    ----------
    row : pandas.Series
        The numeric feature values of one business.

    Returns
    -------
    pandas.Series
        ``row / sqrt(sum(row))``. A row whose values sum to zero (no features
        set) is returned as floats unchanged, because dividing by sqrt(0)
        would turn every entry into NaN.
    """
    total = sum(row)
    if total == 0:
        # Nothing to scale: keep the zeros instead of producing 0/0 = NaN.
        return row.astype(float)
    return row / math.sqrt(total)


business_profs_df = pd.read_csv(file_info)

# Pull the id column out of the frame so that only the numeric (1/0) feature
# columns remain; this lets the row-wise normalizer run over every column.
business_ids = business_profs_df.pop("business_id")

# Normalize each row, then put the ids back as the first column.
business_profs_df = business_profs_df.apply(normalize_bin_data, axis=1)
business_profs_df.insert(0, "business_id", business_ids)

# The temporary id holder is no longer needed.
business_ids = None

# Join business profiles with user data and calculate user profile vectors

In [4]:
# Inner join on business_id: keeps only ratings whose business has a profile,
# and attaches that business's normalized features to each rating row.
bus_profiles_with_user = business_profs_df.merge(
    user_rating_df,
    how="inner",
    on="business_id",
)
bus_profiles_with_user.head()

Unnamed: 0,business_id,lot,garage,valet,street,validated,lunch,dinner,brunch,breakfast,...,cuyahoga-fls-oh,walton-hills-oh,highland-hills-oh,tottenham-on,fairport-harbor-oh,russellton-pa,mcadenville-nc,litchfield-az,user_id,like/dislike
0,-1UMR00eXtwaeh59pEiDjA,0.0,0.0,0.0,0.0,0.0,0.229416,0.0,0.229416,0.229416,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kbiDmK1tFKHJx_rerKHjEg,1
1,-9nai28tnoylwViuJVrYEQ,0.0,0.235702,0.0,0.235702,0.0,0.0,0.235702,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TCkW0F4SSuxmCguyJ-_E5Q,1
2,-Ca6Hx-G1kOR8ycPRybr5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SqflUAwupx4d_wrwyO6GvA,1
3,-CfFjcCcGGDM9MVH_d42RQ,0.188982,0.0,0.188982,0.0,0.0,0.188982,0.188982,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bdw4E8jFVd6-CbhrNAJ_EA,1
4,-PbCfkydmvuNcG9VG_ixkQ,0.267261,0.0,0.0,0.0,0.0,0.267261,0.267261,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sVVgZBFwdNWVP7i91Jqnuw,-1


### Calculate scores for each feature by user id

In [5]:
# Distinct user ids, in order of first appearance in the ratings frame.
user_ids = user_rating_df["user_id"].drop_duplicates().tolist()
# Collects one [user_id, score, score, ...] record per user.
user_pref_scores = list()

def user_like_products(row):
    """Weight one row of feature values by its like/dislike flag.

    Parameters
    ----------
    row : pandas.Series
        Normalized feature values followed by the like/dislike value
        (1 or -1) in the last position.

    Returns
    -------
    pandas.Series
        Every element except the last, multiplied by the last element.
    """
    # Explicit positional access: the row is indexed by column labels, and
    # integer keys passed to ``row[...]`` are no longer reliably treated as
    # positions by pandas.
    return row.iloc[:-1] * row.iloc[-1]

# Length of one score vector: every numeric column except the trailing
# like/dislike flag. Measured on a zero-row slice so no feature data is copied.
n_features = bus_profiles_with_user.iloc[:0].select_dtypes(include=np.number).shape[1] - 1

# Group the merged frame by user once, instead of re-scanning the whole frame
# with a boolean mask for every user id.
scores_by_user = {}
for user_id, user_rows in bus_profiles_with_user.groupby("user_id", sort=False):
    # Obtain numeric cols and multiply feature values by the like/dislike value
    numeric_rows = user_rows.select_dtypes(include=np.number)
    weighted = numeric_rows.apply(user_like_products, axis=1)

    # Column-wise sum of feature value * like/dislike.
    # The greater the value here, the better the user likes a feature.
    scores_by_user[user_id] = weighted.sum(axis=0).tolist()

# One [user_id, score, score, ...] record per user, in the order of user_ids.
# Users whose rated businesses were all dropped by the inner join have no rows
# to sum, so they get an all-zero vector of the same length as everyone else.
no_ratings = [0.0] * n_features
user_pref_scores = [
    [user_id] + scores_by_user.get(user_id, no_ratings)
    for user_id in user_ids
]

print(len(user_pref_scores))

500


In [6]:
# Header row: user_id, followed by every business-profile column except the
# leading business_id.
feature_names = list(business_profs_df.columns[1:])
columns = ["user_id", *feature_names]

# Turn the list of per-user records into a DataFrame.
user_pref_scores = pd.DataFrame(user_pref_scores, columns=columns)
user_pref_scores.head()

Unnamed: 0,user_id,lot,garage,valet,street,validated,lunch,dinner,brunch,breakfast,...,oberlin-oh,pheonix-az,cuyahoga-fls-oh,walton-hills-oh,highland-hills-oh,tottenham-on,fairport-harbor-oh,russellton-pa,mcadenville-nc,litchfield-az
0,zyp8SaRnZ94sWZpLrifS1Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,697iJkhX1mkVF9RNhn114Q,0.242536,0.0,0.0,0.0,0.0,0.0,0.0,0.242536,0.242536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E6Aoz-3s4avfweIjziHjbA,0.0,0.0,0.0,0.0,0.0,0.0,0.377964,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,iVSuN8PrtKVtLzhNiu23uA,-0.208514,0.0,0.0,0.0,0.0,-0.208514,-0.208514,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,iUdH5Sats3cF46CePD6CYw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Sanity check on the assembled frame: one row per user id, and one column for
# user_id plus one per feature column.
user_pref_scores.shape

(500, 1805)

### Output to csv for later use.

In [8]:
# Write the user profiles next to the business profiles; df_to_csv appends the
# ".csv" extension to the path it is given.
fn = "user_profiles"
df_to_csv(user_pref_scores, "/".join((fp, fn)))

In [9]:
# Clean up: drop the references to the large frames so their memory can be reclaimed.
user_pref_scores = user_rating_df = business_profs_df = bus_profiles_with_user = None