In [None]:
import pandas as pd
from google.colab import drive
import spacy
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab-only: `drive` comes from google.colab).
# The CSV paths used further down ("drive/My Drive/...") are relative, so they
# only resolve when the working directory is /content.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the two input tables from the mounted Drive folder.
# df_reviews: one row per review; this notebook reads its "branch", "tokens" and "label" columns.
# df_attractions: one row per attraction; this notebook reads its "branch" and "name" columns.
# No converters are passed, so list-like columns (e.g. "tokens") arrive as plain strings.
df_reviews = pd.read_csv("drive/My Drive/Colab Notebooks/TCC/dataset_completed.csv")
df_attractions = pd.read_csv("drive/My Drive/Colab Notebooks/TCC/attractions_list.csv")

In [None]:
# Split the reviews by park branch.
# `.copy()` makes each subset an independent frame: get_attractions() later adds
# an "attractions" column to these frames, and doing that on a filtered slice of
# df_reviews triggers pandas' SettingWithCopyWarning.
df_reviews_florida = df_reviews[df_reviews["branch"] == "Universal Studios Florida"].copy()
df_reviews_singapore = df_reviews[df_reviews["branch"] == "Universal Studios Singapore"].copy()
df_reviews_japan = df_reviews[df_reviews["branch"] == "Universal Studios Japan"].copy()

# Split the attraction list the same way (these subsets are only read, never modified).
df_attractions_florida = df_attractions[df_attractions["branch"] == "Universal Studios Florida"]
df_attractions_singapore = df_attractions[df_attractions["branch"] == "Universal Studios Singapore"]
df_attractions_japan = df_attractions[df_attractions["branch"] == "Universal Studios Japan"]

In [None]:
# spaCy English pipeline. In this notebook it is only used for vocabulary
# lookups (`lemma.vocab[word].is_stop` / `.is_alpha`) inside get_attractions().
lemma = spacy.load('en_core_web_sm')

# Extra words that must never be used, on their own, to link a review to an
# attraction (they are checked with a case-sensitive `word not in stop_words`).
# Kept as a list for compatibility; the duplicated "street" entry was removed.
stop_words = [
    "attraction", "park", "time", "guest", "ride", "studios", "coaster",
    "food", "universal", "orlando", "express", "adventure", "trip",
    "amazing", "studio", "fun", "shop", "street", "bar", "water",
    "movie", "space", "big", "little", "night", "day", "place",
]

In [None]:
# Plain Python iteration is used instead of np.nditer: nditer wraps every value
# in a 0-d array (which then leaks into the output column) and raises
# ValueError on empty columns.
def get_attractions(df_reviews, df_attractions):
  """Tag every review with the attractions whose name shares a keyword with its tokens.

  A keyword is a whitespace-separated word of an attraction name that
  is not a spaCy stop word, is purely alphabetic, and is not listed in the
  module-level `stop_words`. An attraction is attached to a review when at
  least one of its keywords satisfies `keyword in tokens`.

  NOTE(review): `in` is applied to the "tokens" value as stored. If that value
  is a string (e.g. read from CSV without converters) this is a case-sensitive
  substring test; if it is a list it is an exact-element test -- confirm which
  one is intended.

  Args:
    df_reviews: DataFrame with a "tokens" column. It is modified in place: an
      "attractions" column is added (or overwritten). Pass an independent
      frame (e.g. a `.copy()` of a filtered slice) to avoid pandas'
      SettingWithCopyWarning.
    df_attractions: DataFrame with a "name" column.

  Returns:
    The same `df_reviews` object, whose "attractions" column holds, per row, a
    list of attraction names in `df_attractions` order. Each attraction
    appears at most once per review, even when several of its keywords match.
  """
  # The keyword filter depends only on the attraction name, so compute it once
  # per attraction instead of once per (review, attraction) pair.
  keywords_by_name = {}
  for name in df_attractions["name"].tolist():
    if name in keywords_by_name:
      continue  # repeated attraction name: already handled
    keywords_by_name[name] = [
        word for word in str(name).split()
        if not lemma.vocab[word].is_stop
        and lemma.vocab[word].is_alpha
        and word not in stop_words
    ]

  attractions = []
  for tokens in df_reviews["tokens"].tolist():
    # any() stops at the first matching keyword, so an attraction is recorded
    # once per review no matter how many of its words match.
    attractions.append([
        name for name, keywords in keywords_by_name.items()
        if any(word in tokens for word in keywords)
    ])
  df_reviews["attractions"] = attractions
  return df_reviews

In [None]:
# Tag each branch's reviews using only the attractions of that same branch.
# Evaluation order is Florida, Singapore, Japan.
df_florida, df_singapore, df_japan = [
    get_attractions(branch_reviews, branch_attractions)
    for branch_reviews, branch_attractions in (
        (df_reviews_florida, df_attractions_florida),
        (df_reviews_singapore, df_attractions_singapore),
        (df_reviews_japan, df_attractions_japan),
    )
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [None]:
# Stack the three tagged branch frames (Florida, Singapore, Japan) into one.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra column next to the new 0..n-1 index.
df_final_with_attractions = (
    pd.concat([df_florida, df_singapore, df_japan])
    .reset_index()
)

In [None]:
# Persist the reviews with their "attractions" column to Drive.
# index=False leaves the DataFrame index out of the file.
df_final_with_attractions.to_csv("drive/My Drive/Colab Notebooks/TCC/dataset_completed_attractions.csv",index=False)

In [None]:
def to_1D(series):
  """Flatten an iterable of iterables into a single pandas Series.

  Items keep their original order: every element of the first inner
  iterable, then every element of the second one, and so on.
  """
  flattened = []
  for _list in series:
    flattened.extend(_list)
  return pd.Series(flattened)

def get_count_values_dataset(dataset):
  """Count how often each value occurs across the lists in dataset["attractions"].

  Args:
    dataset: DataFrame whose "attractions" column holds one iterable per row.

  Returns:
    DataFrame with two columns, "attractions" (the distinct values) and
    "count" (their number of occurrences), in the order produced by
    Series.value_counts().
  """
  occurrences = to_1D(dataset["attractions"])
  frequency_table = pd.DataFrame(occurrences.value_counts()).reset_index()
  frequency_table.columns = ["attractions", "count"]
  return frequency_table

def fill_attractions_count(df_attractions, df_reviews, column_name):
  """Build a table with the mention count of each known attraction.

  Only attractions that are both listed in df_attractions["name"] and present
  in at least one list of df_reviews["attractions"] are returned; attractions
  that are never mentioned are omitted (not zero-filled).

  Args:
    df_attractions: DataFrame with a "name" column (the known attractions).
    df_reviews: DataFrame with an "attractions" column holding one list per row.
    column_name: name of the output count column; the matching name column is
      called f"{column_name}_name".

  Returns:
    DataFrame with columns f"{column_name}_name" and `column_name`, one row
    per matched attraction, in the order returned by
    get_count_values_dataset(), with a fresh 0..n-1 index. It is empty (instead
    of raising) when nothing matches or when either input has no rows.
  """
  df_values = get_count_values_dataset(df_reviews)
  # One vectorised membership test replaces the nested np.nditer scan, which
  # raised ValueError on zero-sized columns and re-filtered df_values per match.
  is_known = df_values["attractions"].isin(df_attractions["name"])
  df_final = df_values.loc[is_known, ["attractions", "count"]].rename(
      columns={"attractions": f'{column_name}_name', "count": column_name}
  )
  return df_final.reset_index(drop=True)

In [None]:
# Values of the "label" column used to split reviews by sentiment.
POSITIVE = 1
NEGATIVE = 0

def _count_by_sentiment(reviews, prefix):
  """Return the (all, positive-only, negative-only) attraction count tables for `reviews`.

  The count columns are named "<prefix>_count", "<prefix>_count_pos" and
  "<prefix>_count_neg". The full attraction list `df_attractions` is always
  used as the set of known names.
  """
  positive_reviews = reviews[reviews["label"] == POSITIVE]
  negative_reviews = reviews[reviews["label"] == NEGATIVE]
  return (
      fill_attractions_count(df_attractions, reviews, f"{prefix}_count"),
      fill_attractions_count(df_attractions, positive_reviews, f"{prefix}_count_pos"),
      fill_attractions_count(df_attractions, negative_reviews, f"{prefix}_count_neg"),
  )

# All branches together.
(full_attractions_count,
 full_attractions_count_pos,
 full_attractions_count_neg) = _count_by_sentiment(df_final_with_attractions, "full")

# One triple per branch.
(florida_attractions_count,
 florida_attractions_count_pos,
 florida_attractions_count_neg) = _count_by_sentiment(df_florida, "florida")

(japan_attractions_count,
 japan_attractions_count_pos,
 japan_attractions_count_neg) = _count_by_sentiment(df_japan, "japan")

(singapore_attractions_count,
 singapore_attractions_count_pos,
 singapore_attractions_count_neg) = _count_by_sentiment(df_singapore, "singapore")

In [None]:
# Put the twelve count tables side by side (axis=1). concat aligns them on
# their row index, not on the attraction name, which is why every table
# carries its own "<column>_name" column. reset_index() then adds the row
# labels as one more column.
count_tables = [
    full_attractions_count, full_attractions_count_pos, full_attractions_count_neg,
    florida_attractions_count, florida_attractions_count_pos, florida_attractions_count_neg,
    japan_attractions_count, japan_attractions_count_pos, japan_attractions_count_neg,
    singapore_attractions_count, singapore_attractions_count_pos, singapore_attractions_count_neg,
]
df_final_attractions_count = pd.concat(count_tables, axis=1).reset_index()

In [None]:
# Persist the side-by-side attraction count tables to Drive.
# index=False leaves the DataFrame index out of the file.
df_final_attractions_count.to_csv("drive/My Drive/Colab Notebooks/TCC/dataset_attractions_count.csv",index=False)