# Smart Travel App
## Notebook 4: Machine Learning - All Data

**Group 4, Project 4**  
Team: Dominique Villarreal, Enrique Garcia, Jose Santos  
Project Due Date: June 12, 2023


### Load Environment

In [43]:
import json
import os
import pandas as pd
import pprint as pp
import random
import requests
import time
import warnings
warnings.simplefilter("ignore")
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Import Data

In [44]:
# Import merged_df.csv from notebook 3
merged_file_path = '../G4_Project4/Resources/df_Extractions/merged_df.csv'
merged_df = pd.read_csv(merged_file_path)

# View df
merged_df.head(2)

Unnamed: 0,categories,name,rating,review_count,location,coordinates,city
0,"['Festivals', 'Arts & Crafts', 'Local Flavor']",Jackalope Arts,5.0,8,"{'address1': '5738 Olde Wadsworth Blvd', 'addr...","{'latitude': 39.800724136818026, 'longitude': ...",Denver
1,"['Venues & Event Spaces', 'Party & Event Plann...",Special Occasions Events,5.0,4,"{'address1': '3550 Federal Blvd', 'address2': ...","{'latitude': 39.76623, 'longitude': -105.02439}",Denver


In [45]:
# Process & Organize merged_df
# Remove columns we don't need for prediction process
columns_to_delete = ['location', 'coordinates']
merged_df.drop(columns_to_delete, axis=1, inplace=True)

# Arrange columns
merged_df = merged_df[['city','name', 'rating', 
                       'review_count', 'categories']]

# View updated df
merged_df.head(1)

Unnamed: 0,city,name,rating,review_count,categories
0,Denver,Jackalope Arts,5.0,8,"['Festivals', 'Arts & Crafts', 'Local Flavor']"


In [46]:
# Confirm merged_df information
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19627 entries, 0 to 19626
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   city          19627 non-null  object 
 1   name          19627 non-null  object 
 2   rating        19627 non-null  float64
 3   review_count  19627 non-null  int64  
 4   categories    19627 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 766.8+ KB


### Process Data

In [47]:
# Begin work for predicting city
# Create city_encoded_data df by copying merged_df
city_encoded_data = merged_df.copy()

# View df
city_encoded_data

Unnamed: 0,city,name,rating,review_count,categories
0,Denver,Jackalope Arts,5.0,8,"['Festivals', 'Arts & Crafts', 'Local Flavor']"
1,Denver,Special Occasions Events,5.0,4,"['Venues & Event Spaces', 'Party & Event Plann..."
2,Denver,Denver International Wine Festival,4.5,6,['Festivals']
3,Denver,"Denver Mineral, Fossil, Gem & Jewelry Show",4.5,10,['Festivals']
4,Denver,Colorado Festival of Horror,5.0,1,['Festivals']
...,...,...,...,...,...
19622,NewYork,Felice 56,4.0,94,"['Italian', 'Wine Bars']"
19623,NewYork,The Jeffrey Craft Beer & Bites,4.0,728,"['Bars', 'American (New)', 'Coffee & Tea']"
19624,NewYork,Saint Tuesday,4.0,30,"['Lounges', 'Speakeasies', 'Cocktail Bars']"
19625,NewYork,Harlem Nights Bar,4.0,185,"['Bars', 'Music Venues']"


In [48]:
# Explode the 'categories' column to create a row for each category listed in categories column
city_encoded_data['categories'] = city_encoded_data['categories'].apply(eval)
city_encoded_data = city_encoded_data.explode('categories')

# View df
city_encoded_data

Unnamed: 0,city,name,rating,review_count,categories
0,Denver,Jackalope Arts,5.0,8,Festivals
0,Denver,Jackalope Arts,5.0,8,Arts & Crafts
0,Denver,Jackalope Arts,5.0,8,Local Flavor
1,Denver,Special Occasions Events,5.0,4,Venues & Event Spaces
1,Denver,Special Occasions Events,5.0,4,Party & Event Planning
...,...,...,...,...,...
19625,NewYork,Harlem Nights Bar,4.0,185,Bars
19625,NewYork,Harlem Nights Bar,4.0,185,Music Venues
19626,NewYork,Cardiff Giant,4.0,56,Bars
19626,NewYork,Cardiff Giant,4.0,56,"Beer, Wine & Spirits"


In [49]:
# Categories - labelEncoder to encode categories and use in machine learning 
# Create a label encoder
label_encoder = LabelEncoder()

# Apply label encoding to the 'categories' column
city_encoded_data['categories_encoded'] = label_encoder.fit_transform(city_encoded_data['categories'])

city_encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44009 entries, 0 to 19626
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                44009 non-null  object 
 1   name                44009 non-null  object 
 2   rating              44009 non-null  float64
 3   review_count        44009 non-null  int64  
 4   categories          44008 non-null  object 
 5   categories_encoded  44009 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 2.4+ MB


In [50]:
# View table
city_encoded_data.head()

Unnamed: 0,city,name,rating,review_count,categories,categories_encoded
0,Denver,Jackalope Arts,5.0,8,Festivals,220
0,Denver,Jackalope Arts,5.0,8,Arts & Crafts,38
0,Denver,Jackalope Arts,5.0,8,Local Flavor,348
1,Denver,Special Occasions Events,5.0,4,Venues & Event Spaces,605
1,Denver,Special Occasions Events,5.0,4,Party & Event Planning,416


### Output Data

In [51]:
# Write out city_encoded_data to csv
pathname = '../G4_Project4/Resources/df_Extractions/city_encoded_data.csv'
city_encoded_data.to_csv(pathname, index=False)

In [52]:
# Create city_category_predict_df to avoid breaking city_encoded_data
city_category_predict_df = city_encoded_data[['city', 'name', 'rating', 'review_count', 'categories','categories_encoded']]

# View df
city_category_predict_df

Unnamed: 0,city,name,rating,review_count,categories,categories_encoded
0,Denver,Jackalope Arts,5.0,8,Festivals,220
0,Denver,Jackalope Arts,5.0,8,Arts & Crafts,38
0,Denver,Jackalope Arts,5.0,8,Local Flavor,348
1,Denver,Special Occasions Events,5.0,4,Venues & Event Spaces,605
1,Denver,Special Occasions Events,5.0,4,Party & Event Planning,416
...,...,...,...,...,...,...
19625,NewYork,Harlem Nights Bar,4.0,185,Bars,58
19625,NewYork,Harlem Nights Bar,4.0,185,Music Venues,384
19626,NewYork,Cardiff Giant,4.0,56,Bars,58
19626,NewYork,Cardiff Giant,4.0,56,"Beer, Wine & Spirits",70


### Random Forest ML Model - All Data

### Model Creation

In [53]:
# select X & y and reshape df
X = city_category_predict_df['categories_encoded'].values.reshape(-1, 1)
y = city_category_predict_df['city']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# View model
model

### Train & Test Our Model

In [54]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

0.5381600249950294

In [55]:
model.fit(X_test, y_test)

In [56]:
model.score(X_test, y_test)

0.5457850488525335

### Predict the City

In [57]:
# Generate a random sample of 25 categories. This process is mean to mimic anyone one persons 25 category selection
sample_categories = random.sample(list(city_category_predict_df['categories_encoded']), 25)

# Create df for random sample
sample_categories_encoded = pd.DataFrame({'categories_encoded': sample_categories})

# Assign the readable category names based on random sample of the 25 randomly selected categories above
category_names = city_category_predict_df.loc[city_category_predict_df['categories_encoded'].isin(sample_categories), 'categories'].unique()

In [58]:
# Print the readable categories being used to predict the city
print("Categories Used:")
for category in category_names:
    print(category)

Categories Used:
Barbeque
Music Venues
American (New)
Bars
Beer, Wine & Spirits
Cocktail Bars
Breakfast & Brunch
Tapas/Small Plates
Trainers
Fishing
Parks
Art Galleries
Batting Cages
Italian
Japanese
Ice Cream & Frozen Yogurt
Ramen
Cajun/Creole
Chicken Wings
Gastropubs
Hookah Bars
Beaches
Art Museums


In [59]:
# Predict the city based on the random group of 25 categories
prediction = model.predict(sample_categories_encoded)
predicted_city = prediction[0]
predicted_city

'Miami'

### List Experiences Within Predicted City

In [60]:
# List top 3 experiences for each category within predicted city 
# Set to keep track of predicted cities and their activities
predicted_data = []

# Set to keep track of unique experiences
unique_experiences = set()

# Get the top activities for each sample category
for category in sample_categories:
    # Convert category back to its name
    category_name = city_category_predict_df.loc[city_category_predict_df['categories_encoded'] == category, 'categories'].iloc[0]

    filtered_activities = city_category_predict_df[(city_category_predict_df['city'] == predicted_city) & (city_category_predict_df['categories_encoded'] == category)]
    sorted_activities = filtered_activities.sort_values(['rating', 'review_count'], ascending=[False, False])
    top_activities = sorted_activities[['name', 'rating', 'review_count']].head(3)

    # Create a new activities_data list for each category iteration
    activities_data = []

    for _, activity in top_activities.iterrows():
        activity_name = activity['name']
        rating = activity['rating']
        review_count = activity['review_count']
                
        # Create a unique experience key
        experience_key = (category_name, predicted_city, activity_name)

        # Check if the experience is unique
        if experience_key not in unique_experiences:
            # Add the experience to the set of unique experiences
            unique_experiences.add(experience_key)

            # Append the predicted activity to the list
            activities_data.append({
                'Sample Category': category_name,
                'Predicted City': predicted_city,
                'Experience Name': activity_name,
                'Rating': rating,
                'Review Count': review_count
            })

    # Append the activities_data to the predicted_data list for each category
    predicted_data.extend(activities_data)

# Create a DataFrame from the predicted data
predicted_df = pd.DataFrame(predicted_data)

# Print the final predicted data DataFrame
print("Predicted user experiences based on predicted city:")
predicted_df

Predicted user experiences based on predicted city:


Unnamed: 0,Sample Category,Predicted City,Experience Name,Rating,Review Count
0,Batting Cages,Miami,Swing Kings,5.0,1
1,Batting Cages,Miami,AllGolf at CB Smith,3.5,83
2,Batting Cages,Miami,Batter's Box - Miami,2.5,15
3,Hookah Bars,Miami,Red Buddha Bar & Hookah Lounge,5.0,20
4,Hookah Bars,Miami,SoBe Hookah Lounge At Zoi,5.0,6
5,Hookah Bars,Miami,Sports Tobacco And More,5.0,6
6,"Beer, Wine & Spirits",Miami,Primo Liquors Fine Wine and Cigars,5.0,106
7,Italian,Miami,Fratellino,5.0,1589
8,Italian,Miami,Mimmo's Mozzarella Italian Market,5.0,154
9,Italian,Miami,Rosalia's Kitchen,5.0,113


### Output Predictions to csv

In [61]:
# Write the DataFrame to a CSV file
pathname = '../G4_Project4/Resources/Insights/predicted_df.csv'
predicted_df.to_csv(pathname, index=False)

### Proceed to G4_P4_Notebook5_MLvSelectedData