In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
DATA_PATH = Path() / "data"
DATA_PATH.mkdir(parents=True,exist_ok=True)

def load_data(filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    csv_path = data_path / filename
    return pd.read_csv(csv_path,encoding=encoding)

def save_data(data, filename, data_path=DATA_PATH):
    csv_path = data_path / filename
    data.to_csv(csv_path, index=False)


In [3]:
PLOT_PATH = Path() / "plot"
PLOT_PATH.mkdir(parents=True,exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = PLOT_PATH / f"{fig_id}.{fig_extension}"
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yanhuanhuang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yanhuanhuang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# Download NLTK resources (stopwords and tokenizer)
nltk.download('punkt')
nltk.download('stopwords')

# Initialize stemmer
stemmer = PorterStemmer()

# Get English stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yanhuanhuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanhuanhuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# stop_words = {
#     'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
#     'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
#     "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
#     'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
#     'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
#     'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
#     'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
#     'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
#     'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
#     'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
#     'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
#     'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
#     "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
# }

In [7]:
# def pre_process(text):
    
#     # lowercase
#     text=str(text).lower()

#     # remove numbers followed by dot (like, "1.", "2.", etc)
#     text=re.sub('((\d+)[\.])', '', text)
    
#     #remove tags
#     text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
#     # correct some misspellings and/or replace some text with others that might be easier to handle
#     text=text.replace('do not', "don't")
    
#     # remove special characters except spaces, apostrophes and dots
#     text=re.sub(r"[^a-zA-Z0-9.']+", ' ', text)
    
#     # remove stopwords
#     text=[word for word in text.split(' ') if word not in stop_words]

#     # Basic tokenization by splitting the text
#     tokens = text.split()

#     # Remove stop words and apply stemming (rudimentary by chopping off common suffixes)
#     filtered_tokens = [word if not word.endswith(('ing', 'ly', 'ed', 's')) else word[:-2] for word in tokens if word not in stop_words]
    
#     # lemmatize
#     lmtzr = WordNetLemmatizer()
#     text = ' '.join((lmtzr.lemmatize(i)) for i in filtered_tokens)
    
#     return text

In [8]:

def pre_process(text):
    # Convert to lowercase
    text = str(text).lower()

    # Remove numbers followed by dots
    text = re.sub(r'(\d+\.)', '', text)
    
    # Remove HTML-like tags
    text = re.sub("&lt;/?.*?&gt;", " ", text)
    
    # Replace specific text patterns
    text = text.replace('do not', "don't")
    
    # Remove special characters, keeping spaces, apostrophes, and dots
    text = re.sub(r"[^a-zA-Z0-9.']+", ' ', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Re-join tokens into a single string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text


In [9]:
data = load_data("McDonald_s_Reviews.csv")
data.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [13]:
data.dtypes

reviewer_id                 int64
store_name                 object
category                   object
store_address              object
latitude                  float64
longitude                 float64
rating_count               object
review_time                object
review                     object
rating                     object
processed_review_basic     object
rating_numeric              int64
dtype: object

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   reviewer_id             33396 non-null  int64  
 1   store_name              33396 non-null  object 
 2   category                33396 non-null  object 
 3   store_address           33396 non-null  object 
 4   latitude                32736 non-null  float64
 5   longitude               32736 non-null  float64
 6   rating_count            33396 non-null  object 
 7   review_time             33396 non-null  object 
 8   review                  33396 non-null  object 
 9   rating                  33396 non-null  object 
 10  processed_review_basic  33396 non-null  object 
 11  rating_numeric          33396 non-null  int64  
dtypes: float64(2), int64(2), object(8)
memory usage: 3.1+ MB


In [15]:
data[data.duplicated()].shape

(0, 12)

In [41]:
null_mask = data.isnull()
null_rows = data[null_mask.any(axis=1)]
null_rows

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review_basic,rating_numeric
22141,22142,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,3 months ago,Breakfast specials are good. The sausage burri...,4 stars,breakfast special good. sausage burrito sausag...,4
22142,22143,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,a year ago,This isn't your typical McDonald's. This place...,5 stars,typical mcdonald's. place located waikea beach...,5
22143,22144,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,2 weeks ago,This place was serving good quality breakfast ...,4 stars,place serving good quality breakfast item good...,4
22144,22145,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,a month ago,I understand this is a very busy location but ...,1 star,understand busy location time bee pie 2week pe...,1
22145,22146,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,2 months ago,"When I arrived at McDonald's, it was very crow...",4 stars,arrived mcdonald's crowded little le clean. al...,4
...,...,...,...,...,...,...,...,...,...,...,...,...
27719,27720,ýýýMcDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,3 years ago,This McDonald's is across the street from Waik...,5 stars,mcdonald's across street waikiki beach. forget...,5
27720,27721,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,11 months ago,"Seems like, they always makes some mistakes wh...",2 stars,seems like always make mistake get busy,2
27721,27722,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,11 months ago,Convenient to the east end of Kalakaua Ave. Lo...,4 stars,convenient east end kalakaua ave. long line mo...,4
27722,27723,McDonald's,Fast food restaurant,2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿...,,,2175,11 months ago,"Lost McDonald's in Honolulu, if you can avoid ...",1 star,lost mcdonald's honolulu avoid going would. st...,1


In [42]:
null_rows["store_address"].unique()

array(['2476 Kalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½'],
      dtype=object)

In [43]:
null_rows["review"][27719]

"This McDonald's is across the street from Waikiki Beach. Don't forget to try the Hawaii only meals with Spam, rice and eggs! Prices are slightly higher than other McDonald's but A LOT cheaper than surrounding restaurants! Enjoy!"

In [44]:
null_rows["store_address"] = "2476 Kalakaua Ave, Honolulu, HI 96815, United States"
null_rows

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows["store_address"] = "2476 Kalakaua Ave, Honolulu, HI 96815, United States"


Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review_basic,rating_numeric
22141,22142,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,3 months ago,Breakfast specials are good. The sausage burri...,4 stars,breakfast special good. sausage burrito sausag...,4
22142,22143,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,a year ago,This isn't your typical McDonald's. This place...,5 stars,typical mcdonald's. place located waikea beach...,5
22143,22144,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,2 weeks ago,This place was serving good quality breakfast ...,4 stars,place serving good quality breakfast item good...,4
22144,22145,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,a month ago,I understand this is a very busy location but ...,1 star,understand busy location time bee pie 2week pe...,1
22145,22146,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,2 months ago,"When I arrived at McDonald's, it was very crow...",4 stars,arrived mcdonald's crowded little le clean. al...,4
...,...,...,...,...,...,...,...,...,...,...,...,...
27719,27720,ýýýMcDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,3 years ago,This McDonald's is across the street from Waik...,5 stars,mcdonald's across street waikiki beach. forget...,5
27720,27721,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,11 months ago,"Seems like, they always makes some mistakes wh...",2 stars,seems like always make mistake get busy,2
27721,27722,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,11 months ago,Convenient to the east end of Kalakaua Ave. Lo...,4 stars,convenient east end kalakaua ave. long line mo...,4
27722,27723,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,,2175,11 months ago,"Lost McDonald's in Honolulu, if you can avoid ...",1 star,lost mcdonald's honolulu avoid going would. st...,1


In [45]:
import requests

def get_lat_lng(address, api_key):
    # Encode the address for URL
    address = requests.utils.quote(address)
    
    # Construct the API URL
    url = f"https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={api_key}"
    
    # Send the GET request
    response = requests.get(url)
    
    # Parse the response JSON
    json_response = response.json()
    
    if json_response.get("status") == "OK":
        # Extract latitude and longitude
        lat = json_response["results"][0]["geometry"]["location"]["lat"]
        lng = json_response["results"][0]["geometry"]["location"]["lng"]
        return lat, lng
    else:
        return None, None

api_key = "AIzaSyDxX-C8Gm6LmYkf5om-zXelcfInwU33bYs" 
address = "2476 Kalakaua Ave, Honolulu, HI 96815, United States"
latitude, longitude = get_lat_lng(address, api_key)
print(f"Latitude: {latitude}, Longitude: {longitude}")

Latitude: 21.2746579, Longitude: -157.8241212


In [46]:
null_rows["latitude_new"] = latitude
null_rows["longitude_new"] = longitude
null_rows = null_rows.drop(columns = ["latitude", "longitude"])
null_rows = null_rows.rename({"latitude_new": "latitude", "longitude_new": "longitude"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows["latitude"] = latitude
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_rows["longitude"] = longitude


Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review_basic,rating_numeric,latitude.1
22141,22142,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,3 months ago,Breakfast specials are good. The sausage burri...,4 stars,breakfast special good. sausage burrito sausag...,4,21.274658
22142,22143,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,a year ago,This isn't your typical McDonald's. This place...,5 stars,typical mcdonald's. place located waikea beach...,5,21.274658
22143,22144,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,2 weeks ago,This place was serving good quality breakfast ...,4 stars,place serving good quality breakfast item good...,4,21.274658
22144,22145,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,a month ago,I understand this is a very busy location but ...,1 star,understand busy location time bee pie 2week pe...,1,21.274658
22145,22146,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,2 months ago,"When I arrived at McDonald's, it was very crow...",4 stars,arrived mcdonald's crowded little le clean. al...,4,21.274658
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27719,27720,ýýýMcDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,3 years ago,This McDonald's is across the street from Waik...,5 stars,mcdonald's across street waikiki beach. forget...,5,21.274658
27720,27721,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,11 months ago,"Seems like, they always makes some mistakes wh...",2 stars,seems like always make mistake get busy,2,21.274658
27721,27722,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,11 months ago,Convenient to the east end of Kalakaua Ave. Lo...,4 stars,convenient east end kalakaua ave. long line mo...,4,21.274658
27722,27723,McDonald's,Fast food restaurant,"2476 Kalakaua Ave, Honolulu, HI 96815, United ...",,-157.824121,2175,11 months ago,"Lost McDonald's in Honolulu, if you can avoid ...",1 star,lost mcdonald's honolulu avoid going would. st...,1,21.274658


In [48]:
cleaned_data = data.dropna()
cleaned_data

reviewer_id                 int64
store_name                 object
category                   object
store_address              object
latitude                  float64
longitude                 float64
rating_count               object
review_time                object
review                     object
rating                     object
processed_review_basic     object
rating_numeric              int64
latitude                  float64
dtype: object

In [10]:
# Apply the basic preprocessing to the review column
data['processed_review_basic'] = data['review'].apply(pre_process)

# Display the first few rows of the dataset to show the processed reviews
data[['review', 'processed_review_basic']].head()

Unnamed: 0,review,processed_review_basic
0,Why does it look like someone spit on my food?...,look like someone spit food normal transaction...
1,It'd McDonalds. It is what it is as far as the...,it'd mcdonalds. far food atmosphere go. staff ...
2,Made a mobile order got to the speaker and che...,made mobile order got speaker checked in. line...
3,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,mc. crispy chicken sandwich customer service q...
4,"I repeat my order 3 times in the drive thru, a...",repeat order 3 time drive thru still manage me...


In [11]:
# data["review"][3]

In [12]:
data['rating_numeric'] = data['rating'].apply(lambda x: int(x.split()[0]) if pd.notnull(x) and x.split()[0].isdigit() else None)
data['rating_numeric'].value_counts().sort_index()

rating_numeric
1     9431
2     3086
3     4818
4     5787
5    10274
Name: count, dtype: int64

In [None]:
import matplotlib.pyplot as plt

# Plotting the distribution of numeric ratings
data['rating_numeric'].value_counts().sort_index().plot(kind='bar', color='skyblue', figsize=(10, 6))

plt.title('Distribution of McDonald\'s Ratings')
plt.xlabel('Rating (Stars)')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--')

plt.show()

In [None]:
# Total number of distinct stores
len(data["store_address"].value_counts())

In [None]:
# Review counts per store
import seaborn as sns
plt.figure(figsize=(14, 8))
reviews_per_store = data['store_address'].value_counts().head(10)  # Top 10 stores by review count
sns.barplot(x=reviews_per_store.values, y=reviews_per_store.index)
plt.title('Top 10 Stores by Review Count')
plt.xlabel('Review Count')
plt.ylabel('Store Address')

plt.show()

In [None]:
from datetime import datetime, timedelta
current_date = datetime.now()
def parse_review_time_updated(review_time):
    review_time = review_time.replace('a ', '1 ').replace('one ', '1 ')

    if 'years' in review_time or 'year' in review_time:
        years = int(review_time.split()[0])
        return current_date - timedelta(days=365*years)
    elif 'months' in review_time:
        months = int(review_time.split()[0])
        return current_date - timedelta(days=30*months) 
    elif 'month' in review_time:
        return current_date - timedelta(days=30)  # "1 month ago" 近似为30天
    elif 'days' in review_time or 'day' in review_time:
        days = int(review_time.split()[0])
        return current_date - timedelta(days=days)
    else:
        return current_date


In [None]:
data['approx_review_date'] = data['review_time'].apply(parse_review_time_updated)
data['approx_review_date'] = data['approx_review_date'].dt.date
daily_reviews_count = data.groupby('approx_review_date').size()

plt.figure(figsize=(14, 7))
daily_reviews_count.plot(kind='line', color='blue', marker='o', linestyle='-')
plt.title('Daily Review Counts Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Reviews')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()  # Adjust layout to not cut off labels
plt.show()

In [None]:
# from datetime import datetime, timedelta
# import pandas as pd
# import matplotlib.pyplot as plt

# # 假设 reviews_df 已经加载到 DataFrame 中
# # 定义解析评论时间字符串的函数
# def parse_review_time(review_time):
#     current_date = datetime.now()
#     review_time = review_time.replace('a ', '1 ').replace('one ', '1 ')
    
#     if 'years' in review_time or 'year' in review_time:
#         years = int(review_time.split()[0])
#         return current_date - timedelta(days=365 * years)
#     elif 'months' in review_time:
#         months = int(review_time.split()[0])
#         return current_date - timedelta(days=30 * months)
#     elif 'month' in review_time:
#         return current_date - timedelta(days=30)
#     elif 'days' in review_time or 'day' in review_time:
#         days = int(review_time.split()[0])
#         return current_date - timedelta(days=days)
#     else:
#         return current_date

# 应用函数转换评论时间到具体日期
data['approx_review_date'] = data['review_time'].apply(parse_review_time_updated)

# 设置转换后的日期为 DataFrame 的索引
data.set_index('approx_review_date', inplace=True)

# 使用 resample 方法以半年为单位聚合评论数量
semi_annual_reviews_count = data.resample('6M').size()

# 绘制评论数量随时间变化的图表
plt.figure(figsize=(14, 7))
semi_annual_reviews_count.plot(kind='line', color='blue', marker='o', linestyle='-')
plt.title('Semi-Annual Review Counts Over Time')
plt.xlabel('Semi-Annual Period')
plt.ylabel('Number of Reviews')
plt.grid(True)

# 设置X轴标签为详细日期
x_ticks_labels = semi_annual_reviews_count.index.strftime('%Y-%m')
plt.xticks(ticks=semi_annual_reviews_count.index, labels=x_ticks_labels, rotation=45)
plt.tight_layout()  # Adjust layout to not cut off labels
plt.show()



In [None]:
pip install matplotlib basemap

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

# 初始化地图
plt.figure(figsize=(10, 8))
m = Basemap(projection='merc', llcrnrlat=24.396308, urcrnrlat=49.384358, llcrnrlon=-125.755837, urcrnrlon=-66.93457, lat_ts=20, resolution='i')
m.drawcoastlines()
m.drawcountries()
m.fillcontinents(color='lightgray', lake_color='lightblue')
m.drawmapboundary(fill_color='lightblue')

# 绘制经纬度数据为地图上的点
# 假设 reviews_df 包含您的经纬度数据
x, y = m(data['longitude'].values, data['latitude '].values)
m.scatter(x, y, marker='o', color='red', zorder=5)

plt.title('Geographical Locations of Reviews')
plt.show()


In [None]:
# pip install IPython

In [None]:
json_data = data.to_json("", orient="records")
# data.to_json('data/geo.json', orient='records')
# json_data

In [None]:
%%javascript

// Use this cell to run JavaScript directly
element.append("JavaScript execution supported!");


In [None]:
# Leaflet Maps
from IPython.display import HTML

html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Geographical Locations of Reviews</title>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <link rel="stylesheet" href="https://unpkg.com/leaflet/dist/leaflet.css" />
</head>
<body>
    <div id="map" style="width: 100%; height: 800px;"></div>
    <script src="https://unpkg.com/leaflet/dist/leaflet.js"></script>
    <script>
        // Initialize the map
        var map = L.map('map').setView([37.0902, -95.7129], 4); // Centered on the US

        // Set up the OSM layer
        L.tileLayer('https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
            maxZoom: 19,
            attribution: '© OpenStreetMap contributors'
        }).addTo(map);


        json_data.forEach(function(item) {
            L.marker([item.latitude , item.longitude]).addTo(map) // Fixed marker creation
                .bindPopup('A review location');
            });
        })
        .catch(error => console.error('Error loading the JSON data:', error));
    </script>
</body>
</html>

"""

# Display the HTML content
HTML(html_content)