In [2]:
import pickle

def load_collection_data_frames(filename='collection_data_frames.pkl'):
    """
    Loads the pickled DataFrame from the specified file.

    :param filename: Path to the pickle file.
    :return: Loaded Pandas DataFrame.
    """
    try:
        with open(filename, 'rb') as file:
            data_frames = pickle.load(file)
        print(f"Data successfully loaded from {filename}")
        return data_frames
    except Exception as e:
        print(f"Error loading pickle file: {e}")
        raise e
    
location_data_frames = load_collection_data_frames("location_data_frames.pkl")


Data successfully loaded from location_data_frames.pkl


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import ast

# Load the data
df = location_data_frames

# Preprocessing
def safe_len(x):
    try:
        if isinstance(x, str):
            return len(ast.literal_eval(x))
        elif isinstance(x, list):
            return len(x)
        else:
            return 0
    except:
        return 0

df['reviews_count'] = df['reviews_text'].apply(safe_len)
df['has_website'] = df['website'].notna()
df['polarity'] = pd.to_numeric(df['polarity'], errors='coerce')

# Display basic information about the dataset
print(df.info())
print("\nSample of the data:")
print(df.head())

# Basic statistics
print("\nBasic statistics:")
print(df.describe())

# Correlation matrix
numeric_columns = ['lat', 'lng', 'polarity', 'reviews_count']
correlation_matrix = df[numeric_columns].corr()
print("\nCorrelation matrix:")
print(correlation_matrix)

# Statistical tests

# 1. T-test: Compare polarity scores of accommodations with and without websites
with_website = df[df['has_website']]['polarity'].dropna()
without_website = df[~df['has_website']]['polarity'].dropna()
if len(with_website) > 0 and len(without_website) > 0:
    t_stat, p_value = stats.ttest_ind(with_website, without_website)
    print(f"\nT-test results (polarity scores for accommodations with/without websites):")
    print(f"t-statistic: {t_stat}, p-value: {p_value}")
else:
    print("\nNot enough data to perform T-test.")

# 2. Chi-square test: Association between having a website and having reviews
contingency_table = pd.crosstab(df['has_website'], df['reviews_count'] > 0)
if contingency_table.size > 0:
    chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    print(f"\nChi-square test results (association between having a website and having reviews):")
    print(f"chi2-statistic: {chi2}, p-value: {p_value}")
else:
    print("\nNot enough data to perform Chi-square test.")

# Data Visualizations

# 1. Scatter plot of latitude vs longitude, colored by polarity
plt.figure(figsize=(10, 8))
scatter = plt.scatter(df['lng'], df['lat'], c=df['polarity'], cmap='viridis', alpha=0.6)
plt.colorbar(scatter, label='Polarity')
plt.title('Accommodation Locations Colored by Polarity')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.savefig('location_polarity_scatter.png')
plt.close()

# 2. Box plot of polarity scores for accommodations with and without websites
plt.figure(figsize=(8, 6))
sns.boxplot(x='has_website', y='polarity', data=df)
plt.title('Polarity Scores for Accommodations With and Without Websites')
plt.savefig('website_polarity_boxplot.png')
plt.close()

# 3. Histogram of review counts
plt.figure(figsize=(10, 6))
plt.hist(df['reviews_count'], bins=20, edgecolor='black')
plt.title('Distribution of Review Counts')
plt.xlabel('Number of Reviews')
plt.ylabel('Frequency')
plt.savefig('review_count_histogram.png')
plt.close()

print("\nAnalysis complete. Visualizations saved as PNG files.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492880 entries, 0 to 492879
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   location                    492880 non-null  object 
 1   category                    492880 non-null  object 
 2   place_id                    492880 non-null  int64  
 3   place_name                  492880 non-null  object 
 4   reviews_text                492880 non-null  object 
 5   address                     411474 non-null  object 
 6   international_phone_number  260175 non-null  object 
 7   lat                         474222 non-null  float64
 8   lng                         474222 non-null  float64
 9   polarity                    200918 non-null  float64
 10  website                     214159 non-null  object 
 11  reviews_count               492880 non-null  int64  
 12  has_website                 492880 non-null  bool   
dtypes: bool(1), fl

In [20]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import folium
from folium.plugins import MarkerCluster

def map_location(df, location_name, max_locations=500):
    # Find the specified location
    location = df[df['place_name'] == location_name]
    
    if location.empty:
        return f"Location '{location_name}' not found."
    
    location = location.iloc[0]
    
    # Find nearby locations (within ~1km)
    nearby = df[
        (df['lat'].between(location['lat'] - 0.01, location['lat'] + 0.01)) &
        (df['lng'].between(location['lng'] - 0.01, location['lng'] + 0.01))
    ]
    
    # Limit to max_locations
    if len(nearby) > max_locations:
        nearby = nearby.sample(n=max_locations, random_state=42)
    
    # Create map
    m = folium.Map(location=[location['lat'], location['lng']], zoom_start=14)
    
    # Use MarkerCluster for better performance with many markers
    marker_cluster = MarkerCluster().add_to(m)
    
    # Add markers
    for _, place in nearby.iterrows():
        # Simple sentiment analysis
        reviews = eval(place['reviews_text']) if isinstance(place['reviews_text'], str) else place['reviews_text']
        sentiment = np.mean([TextBlob(review).sentiment.polarity for review in reviews]) if reviews else 0
        
        # Color based on sentiment (-1 to 1 scale)
        color = f'#{int(255 * (1 - sentiment))//2:02x}{int(255 * (1 + sentiment))//2:02x}00'
        
        folium.CircleMarker(
            location=[place['lat'], place['lng']],
            radius=5,
            popup=f"{place['place_name']}<br>Sentiment: {sentiment:.2f}",
            color=color,
            fill=True,
            fillColor=color
        ).add_to(marker_cluster)
    
    # Save map
    m.save(f"{location_name}_map.html")
    
    return f"Map created for {location_name} with {len(nearby)} nearby locations (max 500). Saved as '{location_name}_map.html'."

# Example usage
location_name = "Osteria dei Pazzi"
result = map_location(location_data_frames, location_name)
print(result)

Map created for Osteria dei Pazzi with 500 nearby locations (max 500). Saved as 'Osteria dei Pazzi_map.html'.


In [11]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
import re

class EnhancedTravelChatbot:
    def __init__(self, data):
        self.df = data
        self.preprocess_data()
        self.analyze_data()
    
    def preprocess_data(self):
        # Convert reviews to list if it's a string
        self.df['reviews_list'] = self.df['reviews_text'].apply(lambda x: eval(x) if isinstance(x, str) else x)
        
        # Calculate average polarity of reviews
        self.df['avg_polarity'] = self.df['reviews_list'].apply(self.get_average_sentiment)
        
        # Count number of reviews
        self.df['review_count'] = self.df['reviews_list'].apply(len)

    def analyze_data(self):
        # Perform K-means clustering on locations
        coords = self.df[['lat', 'lng']].values
        kmeans = KMeans(n_clusters=5, random_state=42)
        self.df['cluster'] = kmeans.fit_predict(coords)

        # Calculate price ranges (assuming you have a 'price' column)
        if 'price' in self.df.columns:
            self.df['price_category'] = pd.qcut(self.df['price'], q=3, labels=['Budget', 'Mid-range', 'Luxury'])

    def get_average_sentiment(self, reviews):
        if not reviews:
            return 0
        sentiments = [TextBlob(review).sentiment.polarity for review in reviews]
        return np.mean(sentiments)

    def find_best_places(self, city, top_n=5):
        city_df = self.df[self.df['location'] == city]
        return city_df.nlargest(top_n, 'avg_polarity')

    def find_places_in_range(self, min_lat, max_lat, min_lng, max_lng):
        return self.df[
            (self.df['lat'] >= min_lat) & (self.df['lat'] <= max_lat) &
            (self.df['lng'] >= min_lng) & (self.df['lng'] <= max_lng)
        ]

    def get_popular_amenities(self, city):
        city_df = self.df[self.df['location'] == city]
        all_amenities = [amenity for amenities in city_df['amenities'].dropna() for amenity in amenities.split(',')]
        return pd.Series(all_amenities).value_counts().head(10)

    def recommend_similar_places(self, place_id):
        place = self.df[self.df['place_id'] == place_id].iloc[0]
        cluster = place['cluster']
        similar_places = self.df[
            (self.df['cluster'] == cluster) & 
            (self.df['place_id'] != place_id)
        ].sort_values('avg_polarity', ascending=False)
        return similar_places.head(5)

    def answer_question(self, question):
        if "best places in" in question.lower():
            city = re.search(r"best places in (.+)", question, re.IGNORECASE).group(1)
            best_places = self.find_best_places(city)
            return f"The top 5 places in {city} based on reviews are:\n" + \
                   "\n".join([f"{i+1}. {row['place_name']} (Avg. Polarity: {row['avg_polarity']:.2f})" 
                              for i, (_, row) in enumerate(best_places.iterrows())])

        elif "between" in question.lower() and "latitude" in question.lower() and "longitude" in question.lower():
            coords = re.findall(r"(-?\d+\.?\d*)", question)
            if len(coords) == 4:
                places = self.find_places_in_range(*map(float, coords))
                return f"Found {len(places)} places in the specified range. Here are the top 5:\n" + \
                       "\n".join([f"{i+1}. {row['place_name']} ({row['lat']}, {row['lng']})" 
                                  for i, (_, row) in enumerate(places.head().iterrows())])

        elif "popular amenities" in question.lower():
            city = re.search(r"popular amenities in (.+)", question, re.IGNORECASE).group(1)
            amenities = self.get_popular_amenities(city)
            return f"The most popular amenities in {city} are:\n" + \
                   "\n".join([f"{i+1}. {amenity} ({count} places)" for i, (amenity, count) in enumerate(amenities.items())])

        elif "similar to" in question.lower():
            place_name = re.search(r"similar to (.+)", question, re.IGNORECASE).group(1)
            place_id = self.df[self.df['place_name'] == place_name]['place_id'].iloc[0]
            similar_places = self.recommend_similar_places(place_id)
            return f"Places similar to {place_name} are:\n" + \
                   "\n".join([f"{i+1}. {row['place_name']} (Avg. Polarity: {row['avg_polarity']:.2f})" 
                              for i, (_, row) in enumerate(similar_places.iterrows())])

        else:
            return "I'm sorry, I don't understand that question. Can you please rephrase it?"

# Usage
chatbot = EnhancedTravelChatbot(location_data_frames)
response = chatbot.answer_question("What are the best places in Rome?")
print(response)

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values