In [35]:
import pandas as pd

# load data
df = pd.read_csv('content/data_prep_2805_3.csv', usecols=['place', 'full_text'])
df.head()

Unnamed: 0,place,full_text
0,Waikiki,"Built on a reclaimed swamp, two miles east of ..."
1,The Florida Keys,"Folklore, films and widespread hearsay have gi..."
2,Yellowstone National Park,America’s oldest and easily its most famous na...
3,The Big Island,Although the Big Island of Hawaii could hold a...
4,The Great Plains,The rolling hills and vast grasslands of the G...


In [36]:
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation \
                                        , preprocess_string, strip_short, stem_text

# preprocess given text
def preprocess(text):

    # clean text based on given filters
    CUSTOM_FILTERS = [lambda x: x.lower(),
                                remove_stopwords,
                                strip_punctuation,
                                strip_short,
                                stem_text]
    text = preprocess_string(text, CUSTOM_FILTERS)

    return text

# apply function to all reviews
df['Text (Clean)'] = df['full_text'].apply(lambda x: preprocess(x))

In [28]:
# preview of dataset
df.head()

Unnamed: 0,place,full_text,Text (Clean)
0,Waikiki,"Built on a reclaimed swamp, two miles east of ...","[built, reclaim, swamp, mile, east, downtown, ..."
1,The Florida Keys,"Folklore, films and widespread hearsay have gi...","[folklor, film, widespread, hearsai, given, fl..."
2,Yellowstone National Park,America’s oldest and easily its most famous na...,"[america’, oldest, easili, famou, nation, park..."
3,The Big Island,Although the Big Island of Hawaii could hold a...,"[big, island, hawaii, hold, island, room, spar..."
4,The Great Plains,The rolling hills and vast grasslands of the G...,"[roll, hill, vast, grassland, great, plain, ho..."


In [29]:
# convert these processed reviews into a document-term matrix with the bag of words model
from gensim import corpora

# create a dictionary with the corpus
corpus = df['Text (Clean)']
dictionary = corpora.Dictionary(corpus)

# convert corpus into a bag of words
bow = [dictionary.doc2bow(text) for text in corpus]


In [30]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# find the coherence score with a different number of topics
for i in range(2,11):
    lsi = LsiModel(bow, num_topics=i, id2word=dictionary)
    coherence_model = CoherenceModel(model=lsi, texts=df['Text (Clean)'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Coherence score with 2 clusters: 0.36221930499264315
Coherence score with 3 clusters: 0.3592032540776124
Coherence score with 4 clusters: 0.37950650628888916
Coherence score with 5 clusters: 0.3668776376896482
Coherence score with 6 clusters: 0.37730942565183817
Coherence score with 7 clusters: 0.3515149180704032
Coherence score with 8 clusters: 0.40015528487641583
Coherence score with 9 clusters: 0.32717267859903676
Coherence score with 10 clusters: 0.31403255693468285


In [31]:
# perform SVD on the bag of words with the LsiModel to extract 9 topics because of the highest coherence score
lsi = LsiModel(bow, num_topics=9, id2word=dictionary)

In [32]:
# find the 5 words with the srongest association to the derived topics
for topic_num, words in lsi.print_topics(num_words=5):
    print('Words in {}: {}.'.format(topic_num, words))

Words in 0: 0.283*"citi" + 0.197*"the" + 0.163*"town" + 0.150*"place" + 0.138*"it’".
Words in 1: 0.368*"citi" + -0.211*"town" + -0.199*"beach" + 0.151*"build" + 0.151*"mexico".
Words in 2: -0.280*"berlin" + 0.234*"mexico" + -0.179*"centuri" + 0.144*"site" + -0.142*"museum".
Words in 3: 0.307*"rio" + -0.172*"lake" + 0.155*"beach" + 0.155*"manau" + 0.147*"amazon".
Words in 4: 0.235*"madrid" + 0.213*"san" + -0.202*"berlin" + 0.201*"plaza" + -0.176*"manau".
Words in 5: 0.219*"beach" + -0.168*"chiapa" + -0.164*"maya" + -0.163*"berlin" + -0.142*"river".
Words in 6: 0.514*"lake" + 0.239*"trail" + 0.179*"vallei" + 0.166*"hike" + 0.156*"park".
Words in 7: 0.313*"beach" + -0.201*"river" + -0.161*"manau" + -0.159*"madrid" + -0.151*"amazon".
Words in 8: 0.206*"san" + 0.201*"berlin" + 0.160*"cabo" + 0.128*"beach" + 0.124*"lo".


In [None]:
# find the scores given between the place and each topic
corpus_lsi = lsi[bow]
score1 = []
score2 = []
score3 = []
score4 = []
score5 = []
score6 = []
score7 = []
score8 = []
score9 = []
for doc in corpus_lsi:
    score1.append(round(doc[0][1],2))
    score2.append(round(doc[1][1],2))
    score3.append(round(doc[2][1],2))
    score4.append(round(doc[3][1],2))
    score5.append(round(doc[4][1],2))
    score6.append(round(doc[5][1],2))
    score7.append(round(doc[6][1],2))
    score8.append(round(doc[7][1],2))
    score9.append(round(doc[8][1],2))


# create data frame that shows scores assigned for both topics for each review
df_topic = pd.DataFrame()
df_topic['Text'] = df['full_text']
df_topic['Topic 0 score'] = score1
df_topic['Topic 1 score'] = score2
df_topic['Topic 2 score'] = score3
df_topic['Topic 3 score'] = score4
df_topic['Topic 4 score'] = score5
df_topic['Topic 5 score'] = score6
df_topic['Topic 6 score'] = score7
df_topic['Topic 7 score'] = score8
df_topic['Topic 8 score'] = score9
df_topic['Topic']= df_topic[['Topic 0 score', 'Topic 1 score', 'Topic 2 score', 'Topic 3 score', 'Topic 4 score', 'Topic 5 score', 'Topic 6 score', 'Topic 7 score', 'Topic 8 score']].apply(lambda x: x.argmax(), axis=1)
df_topic.head(50)

In [34]:
# find a sample review from each topic
df_topic0 = df_topic[df_topic['Topic'] == 0]
df_topic1 = df_topic[df_topic['Topic']==1]
print('Sample text from topic 0:\n {}'.format(df_topic0.sample(1, random_state=2)['Text'].values))
print('\nSample text from topic 1:\n {}'.format(df_topic1.sample(1, random_state=2)['Text'].values))

Sample text from topic 0:
 ['A sprawling, hot and dusty city with over 400,000 inhabitants, PUCALLPA holds little of interest to travellers, most of whom get straight into a mototaxi or a local bus for Lago Yarinacocha. If you stay a while, though, it’s difficult not to appreciate Pucallpa’s relaxed feel – or the entrepreneurial optimism of this burgeoning jungle frontier city. Pucallpa’s annual festival for visitors – the Semana Turística de la Region Ucayali – is usually held in the last week of September, offering mostly artesanía and forest-produce markets, as well as folklore, music and dance.If you have an hour or so to while away in the town itself, both the downtown food market on Jirón Independencia and the older central market on Dos de Mayo are worth checking out; the latter in particular comprises varied stalls full of jungle produce. The port of La Hoyada and the older, nearby Puerto Italia are also bustling with activity by day. For craft shopping, artesanía can be found 

ValueError: a must be greater than 0 unless no samples are taken