# Import Libraries

In [1]:
import json
import csv
import ast
import pandas as pd 
import re
import datetime
import time
import numpy as np
import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.sentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt
%matplotlib inline

# Import Files

The imported files will use json.loads to load, while will output list of dictionaries.

Each dictionary will represent one post inclusive of the following details:
- Post title
- Post score
- Post ID
- Post URL
- Post comments (list)

In [71]:

sg_bus = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_bus.json', 'r', encoding="utf8").read())
sg_crent = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_car rental.json', 'r', encoding="utf8").read())
sg_mrt = json.loads(open('../..//Data Collection and Preprocessing/Scrapping/Reddit/Singapore_mrt.json', 'r', encoding="utf8").read())
sg_ptrans = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_public transport.json', 'r', encoding="utf8").read())
sg_rentacar = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_rent-a-car.json', 'r', encoding="utf8").read())
sg_taxi = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_taxi.json', 'r', encoding="utf8").read())
sg_transdemand = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_transport demand.json', 'r', encoding="utf8").read())
sg_trans = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_transport.json', 'r', encoding="utf8").read())
sg_demand = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/Singapore_demand.json', 'r', encoding="utf8").read())

asksg_bus = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_bus.json', 'r', encoding="utf8").read())
asksg_crent = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_car rental.json', 'r', encoding="utf8").read())
asksg_cdg = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_comfortdelgro.json', 'r', encoding="utf8").read())
asksg_mrt = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_mrt.json', 'r', encoding="utf8").read())
asksg_ptrans = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_public transport.json', 'r', encoding="utf8").read())
asksg_rentacar = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_rent-a-car.json', 'r', encoding="utf8").read())
asksg_taxi = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_taxi.json', 'r', encoding="utf8").read())
asksg_transdemand = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_transport demand.json', 'r', encoding="utf8").read())
asksg_trans = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_transport.json', 'r', encoding="utf8").read())
asksg_demand = json.loads(open('../../Data Collection and Preprocessing/Scrapping/Reddit/askSingapore_demand.json', 'r', encoding="utf8").read())


Let's preview one of the file

In [72]:
sg_transdemand

[{'title': 'Why are NTU students so outraged/pissy/strawberry/entitled?',
  'time created': '03/07/2021',
  'score': 3062,
  'id': 'ocpmau',
  'url': 'https://www.reddit.com/r/singapore/comments/ocpmau/why_are_ntu_students_so/',
  'comments': [{'time created': '04/07/2021',
    'author': 'AutoModerator',
    'score': 1,
    'comment': 'OP has flaired the post with the "Serious Discussion" flair. We will be placing this thread in contest mode in addition to exercising greater moderation in the comments section against joke, irrelevant or off-topic comments. These comments will be removed and offenders may face restrictions in accessing /r/singapore. Please report such posts and comments. OPs must also engage in a bona fide discussion, i.e. the post should not be one just to incite outrage. \n\nNote that the \'report\' button is not an \'I disagree\' button or a super-downvote, it will simply flag it for the mods\' attention, and the comment will be evaluated for removal then.\n\n\n*I am

# Data Cleaning

Data cleaning is the process of fixing or removing incorrect, corrupted, incorrectly formatted, duplicate, or incomplete data within a dataset. 

The following are the relevant steps needed to clean the data:
1. Unify text (small letter, white space)
2. Check for removed comments
3. Remove irrelevant post
4. Check for duplicates
5. Combine data of similar category into 1 list

# Save Title and Comment into 1 data frame

These are the relevant columns:
1. Text
2. Author
3. Date
4. Score

In [74]:
keyword = ['bus', 'mrt', 'taxi', 'car rent', 'car rental', 'rental car', 'rent car', 'bus rent', 'bus rental', 'rental bus', 'rent bus', 'chartered bus', 'charter bus', 'rent', 'transport', 'grab', 'uber', 'lyft', 'bluesg', 'comfortdelgro', 'delgro', 'gojek', 'transportation', 'demand', 'private hire']

## Define Data Cleaning Function

The following are the functions defined to help clean the dataset:
1. lower_case() - standadize all text to lower case
2. remove_space() - Remove \n and other spaces
3. remove_removed_deleted() - Remove comments that are 'removed' and 'deleted'
4. remove_irrelevant() - Remove post that do not talk about the relevant topics
5. remove_duplicate() - Remove duplicated post

In [76]:
def lower_case(dataset):
  for post in dataset:
    # Lowercase the title
    post['title'] = str(post['title']).lower()

    for com in post['comments']:
      # Lowercase the comments
      com['comment'] = com['comment'].lower()

In [78]:
def remove_space(dataset):
    for post in dataset:
      # Remove whitespace for title
      post['title'] = str(post['title']).strip(" ")
      post['title'] = re.sub(r'\\n', '', str(post['title']))
      post['title'] = re.sub(r'\n\n', '', str(post['title']))
      post['title'] = re.sub(r'\n', '', str(post['title']))
      post['title'] = str(post['title']).replace('\s+', ' ')

      # Remove whitespace for comments
      for com in post['comments']:
        com['comment'] = com['comment'].strip(" ")
        com['comment'] = re.sub(r'\\n', '', com['comment'])
        com['comment'] = re.sub(r'\n\n', '', com['comment'])
        com['comment'] = re.sub(r'\n', '', com['comment'])
        com['comment'] = com['comment'].replace('\s+', ' ')

In [80]:
def remove_removed_deleted(dataset):
  for post in dataset:
    for i in range(len(post['comments'])):
      post['comments'][:] = [d for d in post['comments'] if d.get('comment') != '[removed]']
      post['comments'][:] = [d for d in post['comments'] if d.get('comment') != '[deleted]']

In [82]:
def remove_irrelevant(dataset, keyword):
  new_list = []

  for word in keyword:    
    for post in dataset:
      if word in post['title']:
        new_list.append(post)
  return new_list

In [84]:
def remove_duplicate(dataset):
  check_list = []
  check_title = []

  for post in dataset:
    if post['title'] not in check_title:
      check_list.append(post)
      check_title.append(post['title'])
  return check_list

## Run the Data Cleaning Functions

1. lower_case()

In [86]:
lower_case(sg_bus)
lower_case(sg_crent)
lower_case(sg_mrt)
lower_case(sg_ptrans)
lower_case(sg_rentacar)
lower_case(sg_taxi)
lower_case(sg_transdemand)
lower_case(sg_trans)
lower_case(sg_demand)

lower_case(asksg_bus)
lower_case(asksg_crent)
lower_case(asksg_cdg)
lower_case(asksg_mrt)
lower_case(asksg_ptrans)
lower_case(asksg_rentacar)
lower_case(asksg_taxi)
lower_case(asksg_transdemand)
lower_case(asksg_trans)
lower_case(asksg_demand)

2. remove_space()

In [88]:
remove_space(sg_bus)
remove_space(sg_crent)
remove_space(sg_mrt)
remove_space(sg_ptrans)
remove_space(sg_rentacar)
remove_space(sg_taxi)
remove_space(sg_transdemand)
remove_space(sg_trans)
remove_space(sg_demand)

remove_space(asksg_bus)
remove_space(asksg_crent)
remove_space(asksg_cdg)
remove_space(asksg_mrt)
remove_space(asksg_ptrans)
remove_space(asksg_rentacar)
remove_space(asksg_taxi)
remove_space(asksg_transdemand)
remove_space(asksg_trans)
remove_space(asksg_demand)

3. remove_removed_deleted()

In [90]:
remove_removed_deleted(sg_bus)
remove_removed_deleted(sg_crent)
remove_removed_deleted(sg_mrt)
remove_removed_deleted(sg_ptrans)
remove_removed_deleted(sg_rentacar)
remove_removed_deleted(sg_taxi)
remove_removed_deleted(sg_transdemand)
remove_removed_deleted(sg_trans)
remove_removed_deleted(sg_demand)

remove_removed_deleted(asksg_bus)
remove_removed_deleted(asksg_crent)
remove_removed_deleted(asksg_cdg)
remove_removed_deleted(asksg_mrt)
remove_removed_deleted(asksg_ptrans)
remove_removed_deleted(asksg_rentacar)
remove_removed_deleted(asksg_taxi)
remove_removed_deleted(asksg_transdemand)
remove_removed_deleted(asksg_trans)
remove_removed_deleted(asksg_demand)

4. remove _irrelevant()

In [92]:
sg_bus = remove_irrelevant(sg_bus, keyword)
sg_crent = remove_irrelevant(sg_crent, keyword)
sg_mrt = remove_irrelevant(sg_mrt, keyword)
sg_ptrans = remove_irrelevant(sg_ptrans, keyword)
sg_rentacar = remove_irrelevant(sg_rentacar, keyword)
sg_taxi = remove_irrelevant(sg_taxi, keyword)
sg_transdemand = remove_irrelevant(sg_transdemand, keyword)
sg_trans = remove_irrelevant(sg_trans, keyword)
sg_demand = remove_irrelevant(sg_demand, keyword)

asksg_bus = remove_irrelevant(asksg_bus, keyword)
asksg_crent = remove_irrelevant(asksg_crent, keyword)
asksg_cdg = remove_irrelevant(asksg_cdg, keyword)
asksg_mrt = remove_irrelevant(asksg_mrt, keyword)
asksg_ptrans = remove_irrelevant(asksg_ptrans, keyword)
asksg_rentacar = remove_irrelevant(asksg_rentacar, keyword)
asksg_taxi = remove_irrelevant(asksg_taxi, keyword)
asksg_transdemand = remove_irrelevant(asksg_transdemand, keyword)
asksg_trans = remove_irrelevant(asksg_trans, keyword)
asksg_demand = remove_irrelevant(asksg_demand, keyword)

5. remove_duplicate()

In [94]:
sg_bus = remove_duplicate(sg_bus)
sg_crent = remove_duplicate(sg_crent)
sg_mrt = remove_duplicate(sg_mrt)
sg_ptrans = remove_duplicate(sg_ptrans)
sg_rentacar = remove_duplicate(sg_rentacar)
sg_taxi = remove_duplicate(sg_taxi)
sg_transdemand = remove_duplicate(sg_transdemand)
sg_trans = remove_duplicate(sg_trans)
sg_demand = remove_duplicate(sg_demand)

asksg_bus = remove_duplicate(asksg_bus)
asksg_crent = remove_duplicate(asksg_crent)
asksg_cdg = remove_duplicate(asksg_cdg)
asksg_mrt = remove_duplicate(asksg_mrt)
asksg_ptrans = remove_duplicate(asksg_ptrans)
asksg_rentacar = remove_duplicate(asksg_rentacar)
asksg_taxi = remove_duplicate(asksg_taxi)
asksg_transdemand = remove_duplicate(asksg_transdemand)
asksg_trans = remove_duplicate(asksg_trans)
asksg_demand = remove_duplicate(asksg_demand)

# Save Title and Comment into 1 data frame

These are the relevant columns:
1. Text
2. Author
3. Date
4. Score

In [None]:
def extract_title(json_file, )

## Categorize Relevant Post into Groups

The relevant post will be categorized into the following groups:
1. Bus
2. MRT
3. Taxi
4. Private Hire
5. Car Rental

In [96]:
data_lst = [sg_bus, sg_crent, sg_mrt, sg_ptrans, sg_rentacar, sg_taxi, sg_transdemand, sg_trans, sg_demand,
            asksg_bus, asksg_crent, asksg_cdg, asksg_mrt, asksg_ptrans, asksg_rentacar, asksg_taxi, asksg_transdemand, asksg_trans, asksg_demand]


### Define Function for Categorization

In [98]:
def categorize(list, dataset, group_keyword, stop):
  
  for group in group_keyword:
    if len(stop) > 0:
      for s in stop:
        for post in dataset:
          if group in post['title'] and s not in post['title']:
            list.append(post)
    else:
      for post in dataset:
        if group in post['title']:
          list.append(post)

### Run Categorization Function

1. Bus

In [100]:
bus = []

b_stop = ['rent', 'charter', 'demands']

categorize(bus, sg_bus, ['bus'], b_stop)
categorize(bus, sg_crent, ['bus'], b_stop)
categorize(bus, sg_mrt, ['bus'], b_stop)
categorize(bus, sg_ptrans, ['bus'], b_stop)
categorize(bus, sg_rentacar, ['bus'], b_stop)
categorize(bus, sg_taxi, ['bus'], b_stop)
categorize(bus, sg_transdemand, ['bus'], b_stop)
categorize(bus, sg_trans, ['bus'], b_stop)
categorize(bus, sg_demand, ['bus'], b_stop)

categorize(bus, asksg_bus, ['bus'], b_stop)
categorize(bus, asksg_crent, ['bus'], b_stop)
categorize(bus, asksg_cdg, ['bus'], b_stop)
categorize(bus, asksg_mrt, ['bus'], b_stop)
categorize(bus, asksg_ptrans, ['bus'], b_stop)
categorize(bus, asksg_rentacar, ['bus'], b_stop)
categorize(bus, asksg_taxi, ['bus'], b_stop)
categorize(bus, asksg_transdemand, ['bus'], b_stop)
categorize(bus, asksg_trans, ['bus'], b_stop)
categorize(bus, asksg_demand, ['bus'], b_stop)

2. Taxi

In [102]:
taxi = []

t_stop = ['grab', 'gojek', 'delgro', 'ryde', 'tada']

categorize(taxi, sg_bus, ['taxi'], t_stop)
categorize(taxi, sg_crent, ['taxi'], t_stop)
categorize(taxi, sg_mrt, ['taxi'], t_stop)
categorize(taxi, sg_ptrans, ['taxi'], t_stop)
categorize(taxi, sg_rentacar, ['taxi'], t_stop)
categorize(taxi, sg_taxi, ['taxi'], t_stop)
categorize(taxi, sg_transdemand, ['taxi'], t_stop)
categorize(taxi, sg_trans, ['taxi'], t_stop)
categorize(taxi, sg_demand, ['taxi'], t_stop)

categorize(taxi, asksg_bus, ['taxi'], t_stop)
categorize(taxi, asksg_crent, ['taxi'], t_stop)
categorize(taxi, asksg_cdg, ['taxi'], t_stop)
categorize(taxi, asksg_mrt, ['taxi'], t_stop)
categorize(taxi, asksg_ptrans, ['taxi'], t_stop)
categorize(taxi, asksg_rentacar, ['taxi'], t_stop)
categorize(taxi, asksg_taxi, ['taxi'], t_stop)
categorize(taxi, asksg_transdemand, ['taxi'], t_stop)
categorize(taxi, asksg_trans, ['taxi'], t_stop)
categorize(taxi, asksg_demand, ['taxi'], t_stop)

3. Grab

In [104]:
grab = []

g_stop = ['taxi', 'gojek', 'delgro', 'ryde', 'tada']

categorize(grab, sg_bus, ['grab'], g_stop)
categorize(grab, sg_crent, ['grab'], g_stop)
categorize(grab, sg_mrt, ['grab'], g_stop)
categorize(grab, sg_ptrans, ['grab'], g_stop)
categorize(grab, sg_rentacar, ['grab'], g_stop)
categorize(grab, sg_taxi, ['grab'], g_stop)
categorize(grab, sg_transdemand, ['grab'], g_stop)
categorize(grab, sg_trans, ['grab'], g_stop)
categorize(grab, sg_demand, ['grab'], g_stop)

categorize(grab, asksg_bus, ['grab'], g_stop)
categorize(grab, asksg_crent, ['grab'], g_stop)
categorize(grab, asksg_cdg, ['grab'], g_stop)
categorize(grab, asksg_mrt, ['grab'], g_stop)
categorize(grab, asksg_ptrans, ['grab'], g_stop)
categorize(grab, asksg_rentacar, ['grab'], g_stop)
categorize(grab, asksg_taxi, ['grab'], g_stop)
categorize(grab, asksg_transdemand, ['grab'], g_stop)
categorize(grab, asksg_trans, ['grab'], g_stop)
categorize(grab, asksg_demand, ['grab'], g_stop)

4. gojek

In [106]:
gojek = []

gj_stop = ['taxi', 'grab', 'delgro', 'ryde', 'tada']

categorize(gojek, sg_bus, ['gojek'], gj_stop)
categorize(gojek, sg_crent, ['gojek'], gj_stop)
categorize(gojek, sg_mrt, ['gojek'], gj_stop)
categorize(gojek, sg_ptrans, ['gojek'], gj_stop)
categorize(gojek, sg_rentacar, ['gojek'], gj_stop)
categorize(gojek, sg_taxi, ['gojek'], gj_stop)
categorize(gojek, sg_transdemand, ['gojek'], gj_stop)
categorize(gojek, sg_trans, ['gojek'], gj_stop)
categorize(gojek, sg_demand, ['gojek'], gj_stop)

categorize(gojek, asksg_bus, ['gojek'], gj_stop)
categorize(gojek, asksg_crent, ['gojek'], gj_stop)
categorize(gojek, asksg_cdg, ['gojek'], gj_stop)
categorize(gojek, asksg_mrt, ['gojek'], gj_stop)
categorize(gojek, asksg_ptrans, ['gojek'], gj_stop)
categorize(gojek, asksg_rentacar, ['gojek'], gj_stop)
categorize(gojek, asksg_taxi, ['gojek'], gj_stop)
categorize(gojek, asksg_transdemand, ['gojek'], gj_stop)
categorize(gojek, asksg_trans, ['gojek'], gj_stop)
categorize(gojek, asksg_demand, ['gojek'], gj_stop)

In [108]:
len(gojek)

15

5. Comfort Delgro

In [110]:
c_delgro = []

cdg_stop = ['taxi', 'grab', 'gojek', 'ryde', 'tada']

categorize(c_delgro, sg_bus, ['delgro'], gj_stop)
categorize(c_delgro, sg_crent, ['delgro'], gj_stop)
categorize(c_delgro, sg_mrt, ['delgro'], gj_stop)
categorize(c_delgro, sg_ptrans, ['delgro'], gj_stop)
categorize(c_delgro, sg_rentacar, ['delgro'], gj_stop)
categorize(c_delgro, sg_taxi, ['delgro'], gj_stop)
categorize(c_delgro, sg_transdemand, ['delgro'], gj_stop)
categorize(c_delgro, sg_trans, ['delgro'], gj_stop)
categorize(c_delgro, sg_demand, ['delgro'], gj_stop)

categorize(c_delgro, asksg_bus, ['delgro'], gj_stop)
categorize(c_delgro, asksg_crent, ['delgro'], gj_stop)
categorize(c_delgro, asksg_cdg, ['delgro'], gj_stop)
categorize(c_delgro, asksg_mrt, ['delgro'], gj_stop)
categorize(c_delgro, asksg_ptrans, ['delgro'], gj_stop)
categorize(c_delgro, asksg_rentacar, ['delgro'], gj_stop)
categorize(c_delgro, asksg_taxi, ['delgro'], gj_stop)
categorize(c_delgro, asksg_transdemand, ['delgro'], gj_stop)
categorize(c_delgro, asksg_trans, ['delgro'], gj_stop)
categorize(c_delgro, asksg_demand, ['delgro'], gj_stop)

6. Ryde

In [112]:
ryde = []

ryde_stop = ['taxi', 'grab', 'gojek', 'delgro', 'tada']

categorize(ryde, sg_bus, ['ryde'], ryde_stop)
categorize(ryde, sg_crent, ['ryde'], ryde_stop)
categorize(ryde, sg_mrt, ['ryde'], ryde_stop)
categorize(ryde, sg_ptrans, ['ryde'], ryde_stop)
categorize(ryde, sg_rentacar, ['ryde'], ryde_stop)
categorize(ryde, sg_taxi, ['ryde'], ryde_stop)
categorize(ryde, sg_transdemand, ['ryde'], ryde_stop)
categorize(ryde, sg_trans, ['ryde'], ryde_stop)
categorize(ryde, sg_demand, ['ryde'], ryde_stop)

categorize(ryde, asksg_bus, ['ryde'], ryde_stop)
categorize(ryde, asksg_crent, ['ryde'], ryde_stop)
categorize(ryde, asksg_cdg, ['ryde'], ryde_stop)
categorize(ryde, asksg_mrt, ['ryde'], ryde_stop)
categorize(ryde, asksg_ptrans, ['ryde'], ryde_stop)
categorize(ryde, asksg_rentacar, ['ryde'], ryde_stop)
categorize(ryde, asksg_taxi, ['ryde'], ryde_stop)
categorize(ryde, asksg_transdemand, ['ryde'], ryde_stop)
categorize(ryde, asksg_trans, ['ryde'], ryde_stop)
categorize(ryde, asksg_demand, ['ryde'], ryde_stop)

7. Tada Mobility

In [114]:
tada = []

tada_stop = ['taxi', 'grab', 'gojek', 'delgro', 'ryde']

categorize(tada, sg_bus, ['tada'], tada_stop)
categorize(tada, sg_crent, ['tada'], tada_stop)
categorize(tada, sg_mrt, ['tada'], tada_stop)
categorize(tada, sg_ptrans, ['tada'], tada_stop)
categorize(tada, sg_rentacar, ['tada'], tada_stop)
categorize(tada, sg_taxi, ['tada'], tada_stop)
categorize(tada, sg_transdemand, ['tada'], tada_stop)
categorize(tada, sg_trans, ['tada'], tada_stop)
categorize(tada, sg_demand, ['tada'], tada_stop)

categorize(tada, asksg_bus, ['tada'], tada_stop)
categorize(tada, asksg_crent, ['tada'], tada_stop)
categorize(tada, asksg_cdg, ['tada'], tada_stop)
categorize(tada, asksg_mrt, ['tada'], tada_stop)
categorize(tada, asksg_ptrans, ['tada'], tada_stop)
categorize(tada, asksg_rentacar, ['tada'], tada_stop)
categorize(tada, asksg_taxi, ['tada'], tada_stop)
categorize(tada, asksg_transdemand, ['tada'], tada_stop)
categorize(tada, asksg_trans, ['tada'], tada_stop)
categorize(tada, asksg_demand, ['tada'], tada_stop)

8. Mrt

In [116]:
mrt = []

mrt_stop = []

categorize(mrt, sg_bus, [' mrt '], mrt_stop)
categorize(mrt, sg_crent, [' mrt '], mrt_stop)
categorize(mrt, sg_mrt, [' mrt '], mrt_stop)
categorize(mrt, sg_ptrans, [' mrt '], mrt_stop)
categorize(mrt, sg_rentacar, [' mrt '], mrt_stop)
categorize(mrt, sg_taxi, [' mrt '], mrt_stop)
categorize(mrt, sg_transdemand, [' mrt '], mrt_stop)
categorize(mrt, sg_trans, [' mrt '], mrt_stop)
categorize(mrt, sg_demand, [' mrt '], mrt_stop)

categorize(mrt, asksg_bus, [' mrt '], mrt_stop)
categorize(mrt, asksg_crent, [' mrt '], mrt_stop)
categorize(mrt, asksg_cdg, [' mrt '], mrt_stop)
categorize(mrt, asksg_mrt, [' mrt '], mrt_stop)
categorize(mrt, asksg_ptrans, [' mrt '], mrt_stop)
categorize(mrt, asksg_rentacar, [' mrt '], mrt_stop)
categorize(mrt, asksg_taxi, [' mrt '], mrt_stop)
categorize(mrt, asksg_transdemand, [' mrt '], mrt_stop)
categorize(mrt, asksg_trans, [' mrt '], mrt_stop)
categorize(mrt, asksg_demand, [' mrt '], mrt_stop)

9. Car rental

In [118]:
car_r = []

car_r_stop = []

categorize(car_r, sg_bus, ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_crent,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_mrt,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_ptrans,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_rentacar,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_taxi,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_transdemand,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_trans,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, sg_demand,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)

categorize(car_r, asksg_bus,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_crent,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_cdg,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_mrt,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_ptrans,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_rentacar,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_taxi,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_transdemand,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_trans,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)
categorize(car_r, asksg_demand,  ['car rental', 'rental', 'rent car', 'rental car'], car_r_stop)

10. Bus Chartered

In [120]:
bus_c = []

bus_c_stop = []

categorize(bus_c, sg_bus, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_crent,  ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_mrt,  ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_ptrans, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_rentacar, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_taxi, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_transdemand, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_trans, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, sg_demand, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)

categorize(bus_c, asksg_bus, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_crent, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_cdg, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_mrt, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_ptrans, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_rentacar, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_taxi, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_transdemand, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_trans, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)
categorize(bus_c, asksg_demand, ['bus charter', 'chartered bus', 'charter bus', 'charter', 'rent bus', 'rental bus', 'bus rent'], bus_c_stop)

In [122]:
for i in bus_c:
    print(i['title'])

11. Transport Demand (Everything)

In [125]:
t_demand = []

t_demand.extend(bus)
t_demand.extend(taxi)
t_demand.extend(grab)
t_demand.extend(gojek)
t_demand.extend(c_delgro)
t_demand.extend(ryde)
t_demand.extend(tada)
t_demand.extend(mrt)
t_demand.extend(car_r)
t_demand.extend(bus_c)

### Remove Duplicates from New List

In [129]:
bus = remove_duplicate(bus)
taxi = remove_duplicate(taxi)
grab = remove_duplicate(grab)
gojek = remove_duplicate(gojek)
c_delgro = remove_duplicate(c_delgro)
ryde = remove_duplicate(ryde)
tada = remove_duplicate(tada)
mrt = remove_duplicate(mrt)
car_r = remove_duplicate(car_r)
bus_c = remove_duplicate(bus_c)
t_demand = remove_duplicate(t_demand)

# Export the Final Clean Data

In [139]:
with open('../../Data Collection and Preprocessing/Scrapping/Reddit/bus(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(bus, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/taxi(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(taxi, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/grab(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(grab, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/gojek(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(gojek, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/c_delgro(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(c_delgro, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/ryde(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(ryde, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/tada(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(tada, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/mrt(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(mrt, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/car_r(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(car_r, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/bus_c(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(bus_c, indent=4))

with open('../../Data Collection and Preprocessing/Scrapping/Reddit/t_demand(clean).json', 'w', encoding = 'utf-8') as f:
    f.write(json.dumps(t_demand, indent=4))

# Data Analysis

Area of consideration:
1. Sentiment Analysis
2. 