# Find the topics about Seoul

### Topic Modeling(LDA) with KOT POI data

Which topic is the most relevant with Seoul?
Let's find it from overview text from KOT POI data about Seoul

In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
import time

from matplotlib import pyplot as plt
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from pprint import pprint
# from text_cleaning import lemmatize_and_stem, preprocess, get_aggregate_score, replace_periods

from functions import remove_html, remove_newline

In [2]:
# set database connection
from pymongo import MongoClient 
client = MongoClient('localhost', 27017)

In [3]:
# set database
db = client['cp_seoul']

In [4]:
# set collection
coll = db['seoul_poi']

In [5]:
# test db connection (get first record from db)
cursor = coll.find({})
cursor.next()

{'_id': ObjectId('5db26b80c36b1f23ca16b1d9'),
 'addr1': '38, Donggyo-ro 29-gil, Mapo-gu, Seoul',
 'areacode': '1',
 'cat1': 'A05',
 'cat2': 'A0502',
 'cat3': 'A05020900',
 'contentid': '2480979',
 'contenttypeid': '82',
 'createdtime': '20170214145634',
 'firstimage': 'http://tong.visitkorea.or.kr/cms/resource/87/2479687_image2_1.jpg',
 'firstimage2': 'http://tong.visitkorea.or.kr/cms/resource/87/2479687_image3_1.jpg',
 'mapx': '126.9230428662',
 'mapy': '37.5621620943',
 'masterid': '2479328',
 'modifiedtime': '20190125151640',
 'readcount': '3832',
 'sigungucode': '13',
 'title': '17℃ (17도씨)',
 'zipcode': '03984',
 'directions': '[Subway]<br />\nHongik Univ. Station (Seoul Subway Line 2, Gyeongui-Jungang Line, Airport Railroad), Exit 3.<br />\nWalk for approx. 5 min.<br><br>\n\n[Bus]<br />\nKolon Apt. 106-dong Entrance Bus Stop<br />\nKolon Apt. 104-dong Entrance Bus Stop<br />\nDaemyeong Apt. Bus Stop<br />\nMaeul Mapo 05',
 'dongcode': '27',
 'homepage': '<a title="open in new wind

cf)  
    82: "음식(82)Dining",
    76: "관광지(76)TouristAttractions",
    79: "쇼핑(79)Shopping",
    78: "문화시설(78)Cultural Facilities",
    80: "숙박(80)Accommodation",
    85: "축제/공연/행사(85)Festivals/Events/Performances",
    75: "레포츠(75)Leisure/Sports",
    77: "교통(77)Transportation",    

In [6]:
# load data to dataframe (only Tourist attraction and Culutral Facilities)
cursor = coll.find({'contenttypeid': {"$in": ['76', '78', ]}})
df = pd.DataFrame(list(cursor))

In [7]:
df.head().T

Unnamed: 0,0,1,2,3,4
_id,5db26b80c36b1f23ca16b1da,5db26b80c36b1f23ca16b1de,5db26b80c36b1f23ca16b1e0,5db26b80c36b1f23ca16b1e1,5db26b80c36b1f23ca16b1e4
addr1,"31, Daehak-ro 12-gil, Jongno-gu, Seoul","50, 63-ro, Yeongdeungpo-gu, Seoul","50, 63-ro, Yeongdeungpo-gu, Seoul","20-1, Samil-daero 8-gil, Jung-gu, Seoul","21, Apgujeong-ro 29-gil, Gangnam-gu, Seoul"
addr2,,,,,
areacode,1,1,1,1,1
cat1,A02,A02,A02,A02,A02
cat2,A0206,A0206,A0205,A0203,A0202
cat3,A02060600,A02060500,A02050600,A02030400,A02020300
contentid,1240735,621155,264122,2590011,1295190
contenttypeid,78,78,76,76,76
createdtime,20110324102837,20080908171332,20021228150750,20190211130745,20110601095512


In [8]:
# get overview text of POIs
overviews = list(df['overview'])

In [9]:
overviews[:5]

['1m Classic Art Hall, located in Daehang-ro is a classic art experience center exclusively for children. The center provides children with the opportunity to enjoy classical music in a fun and interesting way. In the ‘1m Experience Classic’ program (the oldest children’s classical program in the nation), teachers not only provide interesting mini-music learns, but also assist children in trying out a variety of instruments.<br><br>\n\nAt the performance halls at the 1m Classic Art Hall, the stage is just 1m away from the surrounding seats, giving young audience members a more intimate musical experience.',
 '63 City in Yeouido stands 264 meters above sea level and is considered one of Seoul’s most well known landmarks. The 60th floor ‘Sky Deck’ of 63 City Building was remodeled into an art museum, offering a place for art exhibitions, magic shows, and various cultural experiences. Visitors will also be able to enjoy the beautiful scenery of Hangang River through the observatory’s larg

In [10]:
# need to clean the markup characters (<Br>, \n, ...)
overviews = list(map(remove_html, overviews))
overviews = list(map(remove_newline, overviews))

In [11]:
overviews[:5]

['1m Classic Art Hall, located in Daehang-ro is a classic art experience center exclusively for children. The center provides children with the opportunity to enjoy classical music in a fun and interesting way. In the ‘1m Experience Classic’ program (the oldest children’s classical program in the nation), teachers not only provide interesting mini-music learns, but also assist children in trying out a variety of instruments.\n\nAt the performance halls at the 1m Classic Art Hall, the stage is just 1m away from the surrounding seats, giving young audience members a more intimate musical experience.',
 '63 City in Yeouido stands 264 meters above sea level and is considered one of Seoul’s most well known landmarks. The 60th floor ‘Sky Deck’ of 63 City Building was remodeled into an art museum, offering a place for art exhibitions, magic shows, and various cultural experiences. Visitors will also be able to enjoy the beautiful scenery of Hangang River through the observatory’s large window

#### Tokenize with Lemmatization and Stemming

In [12]:
from gensim.utils import simple_preprocess

def lemmatize_and_stem(text):
    """Return Lemmatized and stemmed text"""
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos="v"))


In [13]:
# NLTK Stop words
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
#extends stopwords for this dataset
stopwords.extend([
    "korea",
    "korean",
    "dong",
    "visit",
    "include",
    "area",
    "food",
    "serve",
    "dish",
    "restaurant",
    "offer",
    "center",
    "hotel",
    "guest",
    "product"
])


In [14]:
# preprocess function
def preprocess(text):
    """Return lowercase, lemmatize, and stem text"""
    result = []

    for token in simple_preprocess(text):
        if token not in stopwords:
            result.append(lemmatize_and_stem(token))
    return result

In [15]:
# perform lemmatize_and_stem and remove stop words
overviews_token = list(map(preprocess, overviews))

In [16]:
len(overviews_token)

423

In [17]:
# Create dictionary
id2word = gensim.corpora.Dictionary(overviews_token)

In [18]:
# filtering extreme words
id2word.filter_extremes(no_below=5, no_above=0.30)

In [19]:
# make Term Document Frequency (corpus)
corpus = [id2word.doc2bow(overview) for overview in overviews_token]

In [20]:
len(corpus)

423

In [21]:
# build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, 
                                            num_topics=5,
                                            id2word=id2word,
                                            passes=20,
                                            alpha='auto')


In [22]:
# print topics
pprint(lda_model.print_topics())

[(0,
  '0.028*"perform" + 0.026*"art" + 0.025*"hall" + 0.022*"theater" + '
  '0.015*"visitor" + 0.013*"experi" + 0.013*"exhibit" + 0.011*"program" + '
  '0.010*"music" + 0.010*"floor"'),
 (1,
  '0.043*"museum" + 0.034*"art" + 0.023*"exhibit" + 0.015*"tradit" + '
  '0.014*"galleri" + 0.010*"open" + 0.010*"shop" + 0.010*"street" + '
  '0.009*"display" + 0.009*"modern"'),
 (2,
  '0.034*"templ" + 0.013*"mountain" + 0.012*"church" + 0.012*"build" + '
  '0.012*"tower" + 0.011*"place" + 0.011*"spa" + 0.010*"year" + 0.009*"namsan" '
  '+ 0.008*"buddhist"'),
 (3,
  '0.061*"park" + 0.029*"mountain" + 0.017*"river" + 0.016*"hangang" + '
  '0.010*"tomb" + 0.010*"bridg" + 0.010*"natur" + 0.009*"forest" + '
  '0.009*"visitor" + 0.009*"fortress"'),
 (4,
  '0.033*"build" + 0.032*"palac" + 0.030*"gate" + 0.027*"king" + 0.016*"hall" '
  '+ 0.014*"joseon" + 0.013*"hous" + 0.013*"dynasti" + 0.011*"royal" + '
  '0.009*"nation"')]


*Overview text of POIs have 5 topics*

* Namsan [3]
* Park and Nature (Hangang(River)) [5]
* Museum (Traditional and Historical) [2]
* Art (Exhibition and Performs) [1]
* Interesting Area (Hongdae, Gangnam, Garosugil) [4]


#### Visualize the topics-keywords

In [23]:
# import Plotting tools (pyLDAvis)
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
# Draw plot with LDA Model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


*5 topics*

* 1 - Art (Exhibition and Performs)
* 2 - Museum (Traditional and Historical)
* 3 - Namsan
* 4 - Interesting Area (Hongdae, Gangnam, Garosugil)
* 5 - Park and Nature (Hangang(River))






# Conclusion

## LDA modeling can find meaningful topics from overview text