# Text Mining

## Load Packages

In [1]:
# Import packages to use later
import pandas as pd
import nltk
import time
import json

## Load & Process Relationship Types

In [2]:
# Why should we set up the relationships data like this?
# 1) Prevents need for stemming each word in each review (time-consuming)
# 2) Allows arbitrary groupings of relationships beyond stemming equivalence
#    (e.g. "roommate" and "housemate", "children" and "kids", etc.)

with open("relationships/Relationships_ATUS_custom_v2.txt") as f:
    relationships = [line.strip().lower() for line in f.readlines()]

relationships_dict = dict()  ## From relationship category to all relevant relationship words, e.g. "spouse" --> ["spouse", "partner"]
relationships_dict_reverse = dict()  ## From any relationship word to its category, e.g. "partner" --> "spouse"

for line in relationships:
    relevant_words = line.split(",")
    category = relevant_words[0]
    relationships_dict[category] = relevant_words.copy()
    for word in relevant_words:
        relationships_dict_reverse[word] = category

full_relationship_set = set()
for relationship_list in relationships_dict.values():
    full_relationship_set.update(relationship_list)
        
print(relationships)
#print()
#print(relationships_dict)
#print()
#print(relationships_dict_reverse)

words_need_our = ["child", "children", "kid", "kids", "son", "sons", "daughter", "daughters"]

full_relationship_set_new = set()
for word in full_relationship_set:
    full_relationship_set_new.add("my_" + word)
    if word in words_need_our:
        full_relationship_set_new.add("our_" + word)
print(full_relationship_set_new)

['child,children,kid,kids', 'daughter,daughters', 'son,sons', 'parent,parents', 'mother,mom', 'father,dad', 'brother,brothers', 'sister,sisters', 'siblings', 'aunt,aunts', 'uncle,uncles', 'niece,nieces', 'nephew,nephews', 'cousin,cousins', 'grandchild,grandchildren', 'grandmother,grandma', 'grandfather,grandpa', 'grandparents', 'spouse', 'partner', 'husband', 'wife', 'bff', 'relationship', 'date', 'boo,bae,sweetheart', 'fiancee,fiance', 'girlfriend,gf', 'boyfriend,bf', 'friend,friends,buddy,buddies,pal,pals', 'housemate,housemates,roommate,roommates,flatmate,flatmates', 'neighbor,neighbors', 'classmate,classmates', 'professor,professors', 'teacher,teachers', 'coworker,coworkers,colleague,colleagues', 'client,clients', 'boss']
{'my_clients', 'my_grandpa', 'my_nephew', 'my_daughter', 'my_siblings', 'my_brothers', 'my_dad', 'my_flatmates', 'my_pals', 'my_classmate', 'my_boyfriend', 'my_professors', 'my_colleague', 'my_sweetheart', 'my_mother', 'my_sister', 'my_coworkers', 'my_cousins', 'm

## Text Mining

In [3]:
metros = ['Indianapolis', 'Philadelphia', 'Tucson', 'Tampa', 'Nashville']

metro = 'Nashville'
# metro_reviews = pd.read_csv("small_reviews/yelp_academic_dataset_reviews_" + metro + ".csv")
metro_reviews = pd.read_csv("small_reviews_urbcomp/yelp_academic_dataset_reviews_" + metro + ".csv")
metro_reviews.head(3)

Unnamed: 0.1,Unnamed: 0,funny,useful,review_id,text,business_id,stars,date,user_id,cool,datetime
0,29,0,0,elqRpX9T3YwL07uLNtN3Bg,I at least have to give this restaurant two st...,ltBBYdNzkeKdCNPDAsxwAA,2.0,2015-02-02 04:29:13,-sryo4gDYxbZ1T5Bz4l5Bw,0,2015-02-02 04:29:13
1,34,0,0,p198qZsKOMCUhgdtRWsOKQ,After my ROTD yesterday of a different Sweet ...,8QnuWGVNBhzyYXGSeRdi4g,4.0,2013-10-24 19:24:33,3MpDvy5gEdsbZh9-p92dHg,0,2013-10-24 19:24:33
2,39,0,0,E9AB7V4z8xrt2uPF7T55FQ,Amazing biscuits and (fill in the blank). Grea...,Zx7n8mdt8OzLRXVzolXNhQ,5.0,2018-04-27 23:03:21,iYY5Ii1LGpZCpXFkHlMefw,0,2018-04-27 23:03:21


In [4]:
t0 = time.time()

# Clean review text by making everything lowercase
metro_reviews["text_clean"] = metro_reviews["text"].str.lower()

metro_reviews["text_clean"] = metro_reviews["text_clean"].str.replace("my ", "my_")
metro_reviews["text_clean"] = metro_reviews["text_clean"].str.replace("our ", "our_")

#print(metro_reviews[metro_reviews["review_id"] == "k9vlSSUStwY2DcjM8Rinnw"].iloc[0, -1])

# Apply tokenizer and get rid of punctuation
tokenizer = nltk.RegexpTokenizer(r"\w+")
metro_reviews["text_clean"] = metro_reviews["text_clean"].fillna("0")
metro_reviews["text_clean"] = metro_reviews["text_clean"].apply(tokenizer.tokenize)

# Remove duplicate words in each review
metro_reviews["text_clean"] = metro_reviews["text_clean"].apply(set)

# Join tokens with spaces into strings for easy word counting
metro_reviews["text_clean"] = metro_reviews["text_clean"].apply(" ".join)

t1 = time.time()
print(t1-t0, "sec")

15.083009004592896 sec


In [5]:
metro_reviews.head(3)

Unnamed: 0.1,Unnamed: 0,funny,useful,review_id,text,business_id,stars,date,user_id,cool,datetime,text_clean
0,29,0,0,elqRpX9T3YwL07uLNtN3Bg,I at least have to give this restaurant two st...,ltBBYdNzkeKdCNPDAsxwAA,2.0,2015-02-02 04:29:13,-sryo4gDYxbZ1T5Bz4l5Bw,0,2015-02-02 04:29:13,had it see she and this restaurant dinner afte...
1,34,0,0,p198qZsKOMCUhgdtRWsOKQ,After my ROTD yesterday of a different Sweet ...,8QnuWGVNBhzyYXGSeRdi4g,4.0,2013-10-24 19:24:33,3MpDvy5gEdsbZh9-p92dHg,0,2013-10-24 19:24:33,had it my_rotd and toppings paying my_local af...
2,39,0,0,E9AB7V4z8xrt2uPF7T55FQ,Amazing biscuits and (fill in the blank). Grea...,Zx7n8mdt8OzLRXVzolXNhQ,5.0,2018-04-27 23:03:21,iYY5Ii1LGpZCpXFkHlMefw,0,2018-04-27 23:03:21,bit cocktails too biscuits amazing and highly ...


In [6]:
t0 = time.time()

relationship_categories = sorted(relationships_dict.keys())
relationship_categories_ungroup = sorted(relationships_dict_reverse.keys())

df_rows = []
df_rows_ungroup = []

for i, business_id in enumerate(metro_reviews.business_id.unique()):
    reviews_subset = metro_reviews[metro_reviews["business_id"] == business_id]
    reviews_subset_counts = reviews_subset.text_clean.str.split().explode().value_counts().reset_index()
    x = reviews_subset_counts[reviews_subset_counts["index"].isin(full_relationship_set_new)]
#     print(x)
#     break
    df_row = [business_id, len(reviews_subset)] + [0 for key in relationship_categories]
    df_row_ungroup = [business_id, len(reviews_subset)] + [0 for key in relationship_categories_ungroup]

    for row in x.itertuples():
        key = row.index.split("_")[-1]
        
        df_row_idx = 2 + relationship_categories.index(relationships_dict_reverse[key])
        df_row[df_row_idx] += row.text_clean
        
        df_row_idx_ungroup = 2 + relationship_categories_ungroup.index(key)
        df_row_ungroup[df_row_idx_ungroup] += row.text_clean
        
    df_rows.append(df_row)
    df_rows_ungroup.append(df_row_ungroup)

relationship_df = pd.DataFrame(df_rows,columns = ["business_id", "num_reviews"] + relationship_categories)
relationship_df['num_relationship_words'] = relationship_df[relationship_categories].sum(axis=1)
relationship_df = relationship_df[["business_id", "num_reviews", "num_relationship_words"] + relationship_categories]

relationship_df_ungroup = pd.DataFrame(df_rows_ungroup,columns = ["business_id", "num_reviews"] + relationship_categories_ungroup)
relationship_df_ungroup['num_relationship_words'] = relationship_df_ungroup[relationship_categories_ungroup].sum(axis=1)
relationship_df_ungroup = relationship_df_ungroup[["business_id", "num_reviews", "num_relationship_words"] + relationship_categories_ungroup]

t1 = time.time()
print(t1-t0, "sec")

98.14146780967712 sec


In [7]:
relationship_df.to_csv("output_urbcomp/" + metro + "_counts_v2.csv", index=False)
relationship_df_ungroup.to_csv("output_urbcomp/" + metro + "_counts_ungrouped_v2.csv", index=False)
# relationship_df.to_csv("output/" + metro + "_counts_v2.csv", index=False)
# relationship_df_ungroup.to_csv("output/" + metro + "_counts_ungrouped_v2.csv", index=False)