In [1]:
import pandas as pd

# load data
df = pd.read_csv('content/data_prep_2805_3.csv', usecols=['place', 'full_text'])
df.head()

Unnamed: 0,place,full_text
0,Waikiki,"Built on a reclaimed swamp, two miles east of ..."
1,The Florida Keys,"Folklore, films and widespread hearsay have gi..."
2,Yellowstone National Park,America’s oldest and easily its most famous na...
3,The Big Island,Although the Big Island of Hawaii could hold a...
4,The Great Plains,The rolling hills and vast grasslands of the G...


In [2]:
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation \
                                        , preprocess_string, strip_short, stem_text

# preprocess given text
def preprocess(text):

    # clean text based on given filters
    CUSTOM_FILTERS = [lambda x: x.lower(),
                                remove_stopwords,
                                strip_punctuation,
                                strip_short,
                                stem_text]
    text = preprocess_string(text, CUSTOM_FILTERS)

    return text

# apply function to all reviews
df['Text (Clean)'] = df['full_text'].apply(lambda x: preprocess(x))

In [3]:
# preview of dataset
df.head()

Unnamed: 0,place,full_text,Text (Clean)
0,Waikiki,"Built on a reclaimed swamp, two miles east of ...","[built, reclaim, swamp, mile, east, downtown, ..."
1,The Florida Keys,"Folklore, films and widespread hearsay have gi...","[folklor, film, widespread, hearsai, given, fl..."
2,Yellowstone National Park,America’s oldest and easily its most famous na...,"[america’, oldest, easili, famou, nation, park..."
3,The Big Island,Although the Big Island of Hawaii could hold a...,"[big, island, hawaii, hold, island, room, spar..."
4,The Great Plains,The rolling hills and vast grasslands of the G...,"[roll, hill, vast, grassland, great, plain, ho..."


In [4]:
# convert these processed reviews into a document-term matrix with the bag of words model
from gensim import corpora

# create a dictionary with the corpus
corpus = df['Text (Clean)']
dictionary = corpora.Dictionary(corpus)

# convert corpus into a bag of words
bow = [dictionary.doc2bow(text) for text in corpus]


In [18]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# find the coherence score with a different number of topics
for i in range(2,5):
    lsi = LsiModel(bow, num_topics=i, id2word=dictionary)
    coherence_model = CoherenceModel(model=lsi, texts=df['Text (Clean)'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Coherence score with 2 clusters: 0.36221930499264315
Coherence score with 3 clusters: 0.38205883213620023
Coherence score with 4 clusters: 0.3623648227449483


In [12]:
# perform SVD on the bag of words with the LsiModel to extract 9 topics because of the highest coherence score
lsi = LsiModel(bow, num_topics=3, id2word=dictionary)

In [13]:
# find the 5 words with the srongest association to the derived topics
for topic_num, words in lsi.print_topics(num_words=5):
    print('Words in {}: {}.'.format(topic_num, words))

Words in 0: 0.283*"citi" + 0.197*"the" + 0.163*"town" + 0.150*"place" + 0.138*"it’".
Words in 1: 0.368*"citi" + -0.211*"town" + -0.199*"beach" + 0.151*"build" + 0.151*"mexico".
Words in 2: -0.280*"berlin" + 0.234*"mexico" + -0.179*"centuri" + 0.144*"site" + -0.142*"museum".


In [16]:
# find the scores given between the place and each topic
corpus_lsi = lsi[bow]
score1 = []
score2 = []
score3 = []
#score4 = []
#score5 = []
#score6 = []
#score7 = []
#score8 = []
#score9 = []
for doc in corpus_lsi:
    score1.append(round(doc[0][1],2))
    score2.append(round(doc[1][1],2))
    score3.append(round(doc[2][1],2))
    #score4.append(round(doc[3][1],2))
    #score5.append(round(doc[4][1],2))
    #score6.append(round(doc[5][1],2))
    #score7.append(round(doc[6][1],2))
    #score8.append(round(doc[7][1],2))
    #score9.append(round(doc[8][1],2))


# create data frame that shows scores assigned for both topics for each review
df_topic = pd.DataFrame()
df_topic['Text'] = df['full_text']
df_topic['Topic 0 score'] = score1
df_topic['Topic 1 score'] = score2
df_topic['Topic 2 score'] = score3
#df_topic['Topic 3 score'] = score4
#df_topic['Topic 4 score'] = score5
#df_topic['Topic 5 score'] = score6
#df_topic['Topic 6 score'] = score7
#df_topic['Topic 7 score'] = score8
#df_topic['Topic 8 score'] = score9
df_topic['Topic']= df_topic[['Topic 0 score', 'Topic 1 score', 'Topic 2 score']].apply(lambda x: x.argmax(), axis=1)
# , 'Topic 3 score', 'Topic 4 score', 'Topic 5 score', 'Topic 6 score', 'Topic 7 score', 'Topic 8 score'
df_topic.head(50)

Unnamed: 0,Text,Topic 0 score,Topic 1 score,Topic 2 score,Topic
0,"Built on a reclaimed swamp, two miles east of ...",2.59,-0.5,-0.25,0
1,"Folklore, films and widespread hearsay have gi...",4.06,-1.03,-0.6,0
2,America’s oldest and easily its most famous na...,2.09,-0.88,-0.41,0
3,Although the Big Island of Hawaii could hold a...,6.88,-4.31,-0.71,0
4,The rolling hills and vast grasslands of the G...,5.32,-1.37,0.06,0
5,Only when you traverse the Rocky Mountain stat...,6.03,-2.79,0.4,0
6,One of America’s oldest and most beautiful cit...,5.99,1.13,-0.8,0
7,"The city of Washington, in the District of Col...",6.09,-1.2,-0.64,0
8,The classic southwestern landscape of stark sa...,3.19,-1.46,-0.49,0
9,"With its soaring cliffs, riverine forests and ...",2.98,-1.62,-0.17,0


In [9]:
# find a Text from each topic
df_topic0 = df_topic[df_topic['Topic'] == 0]
df_topic1 = df_topic[df_topic['Topic']==1]
print('Sample text from topic 0:\n {}'.format(df_topic0.sample(1, random_state=2)['Text'].values))
print('\nSample text from topic 1:\n {}'.format(df_topic1.sample(1, random_state=2)['Text'].values))

Sample text from topic 0:
 ['Volcanic in origin, the Aeolian Islands are named after Aeolus, the Greek god who kept the winds he controlled shut tight in one of the islands’ many caves. According to Homer, Odysseus put into the Aeolians and was given a bag of wind to help him home, but his sailors opened it too soon and the ship was blown straight back to port. More verifiably, the islands were coveted for their mineral wealth, the mining of obsidian (hard, glass-like lava) providing the basis for early prosperity, because it was the sharpest material available until people learned the art of smelting metals. Later their strategic importance attracted the Greeks, who settled on Lipari in 580 BC, but they later became a haven for pirates and a place of exile, a state of affairs that continued right into the twentieth century with the Fascists exiling their political opponents to Lipari. The twentieth century saw mass emigration, mostly to Australia, and even now islands such as Panarea 

ValueError: a must be greater than 0 unless no samples are taken