<span style="font-size: 20px; font-weight: bold;">Project : Yelp Review Linear Regression</span>

<span style="font-size: 15px; font-weight: bold;">1. Import Pandas & Load Datasets</span>

In [1]:
import pandas as pd

In [2]:
# Raise pandas' display limits so wide frames and long text cells print in full.
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_colwidth', 500)

# lines=True makes read_json treat each line of a file as a separate JSON record.
businesses, reviews, users, checkins, tips, photos = (
    pd.read_json(json_path, lines=True)
    for json_path in (
        'yelp_business.json',
        'yelp_review.json',
        'yelp_user.json',
        'yelp_checkin.json',
        'yelp_tip.json',
        'yelp_photo.json',
    )
)

<span style="font-size: 15px; font-weight: bold;">2. Merge Datasets into single DataFrame</span>

In [3]:
# Left-join `reviews` onto `businesses` by business_id so every business row is kept.
yelp = businesses.merge(reviews, on='business_id', how='left')

In [4]:
# Left-join the remaining tables one after another, always keyed on business_id.
yelp = (
    yelp
    .merge(users, on='business_id', how='left')
    .merge(checkins, on='business_id', how='left')
    .merge(tips, on='business_id', how='left')
    .merge(photos, on='business_id', how='left')
)

<span style="font-size: 15px; font-weight: bold;">3. Clean Data using Pandas Drop function</span>

In [5]:
# Show every column present in the merged frame.
print(yelp.columns)

Index(['address', 'alcohol?', 'attributes', 'business_id', 'categories',
       'city', 'good_for_kids', 'has_bike_parking', 'has_wifi', 'hours',
       'is_open', 'latitude', 'longitude', 'name', 'neighborhood',
       'postal_code', 'price_range', 'review_count', 'stars', 'state',
       'take_reservations', 'takes_credit_cards', 'average_review_age',
       'average_review_length', 'average_review_sentiment',
       'number_funny_votes', 'number_cool_votes', 'number_useful_votes',
       'average_number_friends', 'average_days_on_yelp', 'average_number_fans',
       'average_review_count', 'average_number_years_elite', 'time',
       'weekday_checkins', 'weekend_checkins', 'average_tip_length',
       'number_tips', 'average_caption_length', 'number_pics'],
      dtype='object')


In [6]:
# Columns removed from the frame before modelling.
features_to_remove = [
    'address', 'attributes', 'business_id', 'categories', 'city',
    'hours', 'is_open', 'latitude', 'longitude', 'name',
    'neighborhood', 'postal_code', 'state', 'time',
]
# Rebind to the trimmed frame rather than mutating it in place.
yelp = yelp.drop(columns=features_to_remove)

In [7]:
# Show the columns that remain after the drop.
print(yelp.columns)

Index(['alcohol?', 'good_for_kids', 'has_bike_parking', 'has_wifi',
       'price_range', 'review_count', 'stars', 'take_reservations',
       'takes_credit_cards', 'average_review_age', 'average_review_length',
       'average_review_sentiment', 'number_funny_votes', 'number_cool_votes',
       'number_useful_votes', 'average_number_friends', 'average_days_on_yelp',
       'average_number_fans', 'average_review_count',
       'average_number_years_elite', 'weekday_checkins', 'weekend_checkins',
       'average_tip_length', 'number_tips', 'average_caption_length',
       'number_pics'],
      dtype='object')


In [8]:
# Per column: True when the column contains at least one missing value.
yelp.isna().any()

alcohol?                      False
good_for_kids                 False
has_bike_parking              False
has_wifi                      False
price_range                   False
review_count                  False
stars                         False
take_reservations             False
takes_credit_cards            False
average_review_age            False
average_review_length         False
average_review_sentiment      False
number_funny_votes            False
number_cool_votes             False
number_useful_votes           False
average_number_friends        False
average_days_on_yelp          False
average_number_fans           False
average_review_count          False
average_number_years_elite    False
weekday_checkins               True
weekend_checkins               True
average_tip_length             True
number_tips                    True
average_caption_length         True
number_pics                    True
dtype: bool

In [9]:
# Columns in which a missing value is replaced by 0.
zero_fill_columns = [
    'weekday_checkins',
    'weekend_checkins',
    'average_tip_length',
    'number_tips',
    'average_caption_length',
    'number_pics',
]
# Rebind to the filled frame rather than mutating it in place.
yelp = yelp.fillna({column: 0 for column in zero_fill_columns})

<span style="font-size: 15px; font-weight: bold;">4. Explore Feature Correlations</span>

In [10]:
# Correlation matrix over every remaining column; only the 'stars' column
# (each column's correlation with the star rating) is printed.
full_corr = yelp.corr()
print(full_corr.loc[:, 'stars'])

alcohol?                     -0.043332
good_for_kids                -0.030382
has_bike_parking              0.068084
has_wifi                     -0.039857
price_range                  -0.052565
review_count                  0.032413
stars                         1.000000
take_reservations            -0.024486
takes_credit_cards            0.037748
average_review_age           -0.125645
average_review_length        -0.277081
average_review_sentiment      0.782187
number_funny_votes            0.001320
number_cool_votes             0.043375
number_useful_votes          -0.000066
average_number_friends       -0.007629
average_days_on_yelp         -0.038061
average_number_fans          -0.031141
average_review_count         -0.066572
average_number_years_elite   -0.064419
weekday_checkins              0.004130
weekend_checkins              0.007863
average_tip_length           -0.052899
number_tips                   0.014038
average_caption_length        0.000040
number_pics              

Weakest Correlations : average_caption_length, number_useful_votes, weekday_checkins, weekend_checkins, number_pics

<span style="font-size: 15px; font-weight: bold;">5. Add candidate feature data to a new DataFrame to base predictions on</span>

In [11]:
# Predictor columns: every remaining column except the target, 'stars'.
all_features = [
    'alcohol?',
    'good_for_kids',
    'has_bike_parking',
    'has_wifi',
    'price_range',
    'review_count',
    'take_reservations',
    'takes_credit_cards',
    'average_review_age',
    'average_review_length',
    'average_review_sentiment',
    'number_funny_votes',
    'number_cool_votes',
    'number_useful_votes',
    'average_number_friends',
    'average_days_on_yelp',
    'average_number_fans',
    'average_review_count',
    'average_number_years_elite',
    'weekday_checkins',
    'weekend_checkins',
    'average_tip_length',
    'number_tips',
    'average_caption_length',
    'number_pics',
]
# Target first, then the predictor frame; the two selections are independent.
rating = yelp['stars']
featuresdf = yelp[all_features]

<span style="font-size: 15px; font-weight: bold;">6. Split data into Train & Test sets using SKLearn library</span>

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
feature_train, feature_test, rating_train, rating_test = train_test_split(
    featuresdf,
    rating,
    test_size=0.2,
    random_state=1,
)

<span style="font-size: 15px; font-weight: bold;">7. Create, Test, Train Model using Linear Regression</span>

In [13]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on every feature in the training split.
model = LinearRegression()
model.fit(X=feature_train, y=rating_train)

<span style="font-size: 15px; font-weight: bold;">Procedure : Train the model first using all features, then try to improve the R2 value by eliminating low-correlation features</span>

In [14]:
# Score (the R2 value referred to in this notebook) on the training split.
model.score(feature_train, rating_train)

0.6807828861895334

In [15]:
# Same score on the held-out test split.
model.score(feature_test, rating_test)

0.678212904586925

<span style="font-size: 13px; font-weight: bold;">All Feature R2 = 0.678212904586925
</span>

In [16]:
# Fitted coefficients; presumably one per column of feature_train, in column order.
model.coef_

array([-1.49914986e-01, -1.18078144e-01,  2.72969699e-02, -1.21553826e-01,
       -6.48673015e-02,  1.01122594e-04,  1.41345592e-02,  2.44518379e-02,
       -1.57765441e-04, -5.89625792e-04,  2.28084570e+00,  4.84793510e-05,
        9.72372273e-04, -2.71506413e-04,  2.06958404e-05,  1.23261477e-04,
        1.02679868e-03, -2.33983569e-04, -6.27893971e-02,  6.15390912e-05,
       -9.23961747e-05, -4.20521750e-04, -8.54656332e-04, -6.47274980e-04,
       -1.31336123e-03])

<span style="font-size: 13px; font-weight: bold;">Create a helper function to test different feature sets
</span>

In [17]:
def model_tester(features, data=None, target='stars', test_size=.2, random_state=1):
    """Fit a linear regression on the chosen feature columns and print its scores.

    The frame is split into train and test sets, a ``LinearRegression`` model
    is fitted on the training part, and the feature list, the fitted
    coefficients and the train/test scores (R2) are printed.

    Parameters
    ----------
    features : str or sequence of str
        Column name(s) to use as predictors. A single name is treated as a
        one-element list so the feature frame stays two-dimensional.
    data : pandas.DataFrame, optional
        Frame holding both the feature columns and the target column.
        Defaults to the notebook-level ``yelp`` frame.
    target : str, default 'stars'
        Name of the column to predict.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        Results are printed rather than returned.
    """
    # Fall back to the notebook's merged frame when no frame is supplied.
    if data is None:
        data = yelp

    # Normalise to a list: a bare string would select a 1-D Series and a tuple
    # would be looked up as a single column key instead of several columns.
    features = [features] if isinstance(features, str) else list(features)

    # create DataFrames for the target and the selected features
    ratingdf = data[target]
    featuresdf = data[features]

    # split data into train and test sets
    feature_train, feature_test, rating_train, rating_test = train_test_split(
        featuresdf, ratingdf, test_size=test_size, random_state=random_state
    )

    # create and fit model
    model = LinearRegression()
    model.fit(feature_train, rating_train)

    # print features and coefficients
    print(features)
    print(model.coef_)

    # print out scores; both labels share the same "Name: " format
    print('Training Score: ', model.score(feature_train, rating_train))
    print('Testing Score: ', model.score(feature_test, rating_test))

<span style="font-size: 13px; font-weight: bold;">Eliminate low-coefficient features to try to improve model accuracy
</span>

In [18]:
# Eliminated: number_funny_votes, weekday_checkins, weekend_checkins
eliminated_features = {'number_funny_votes', 'weekday_checkins', 'weekend_checkins'}
# Keep every other entry of all_features, in its original order.
first_test_features = [
    feature_name
    for feature_name in all_features
    if feature_name not in eliminated_features
]

In [19]:
# Refit on the reduced feature list; prints the features, coefficients and train/test scores.
model_tester(first_test_features)

['alcohol?', 'good_for_kids', 'has_bike_parking', 'has_wifi', 'price_range', 'review_count', 'take_reservations', 'takes_credit_cards', 'average_review_age', 'average_review_length', 'average_review_sentiment', 'number_cool_votes', 'number_useful_votes', 'average_number_friends', 'average_days_on_yelp', 'average_number_fans', 'average_review_count', 'average_number_years_elite', 'average_tip_length', 'number_tips', 'average_caption_length', 'number_pics']
[-1.49762019e-01 -1.16993057e-01  2.77349515e-02 -1.21672012e-01
 -6.51187302e-02  5.16426776e-05  1.43982195e-02  2.49631542e-02
 -1.57776352e-04 -5.90455991e-04  2.28123729e+00  9.28917625e-04
 -2.28959672e-04  2.14035921e-05  1.23485920e-04  1.02888512e-03
 -2.34188655e-04 -6.29747782e-02 -4.17623773e-04 -1.08106632e-03
 -6.05003168e-04 -1.24351879e-03]
Training Score:  0.6807270659310503
Testing Score 0.6781672353208771


<span style="font-size: 13px; font-weight: bold;">Our best model accuracy was with all quantifiable features, with an R2 value of 0.678212904586925, just below the .7 threshold, showing relatively accurate predictions from our model.
</span>