# LIBRARIES

Import the required libraries

In [None]:
import pandas as pd
import gensim
import heapq
import numpy as np
import nltk
import io
import enchant
import sentlex
import sentlex.sentanalysis
import re
import preprocess1

# DATA IMPORT AND CORPUS CREATION

Read the dataset

In [None]:
# Load the raw reviews. A forward slash keeps the relative path portable: the
# previous "Data\Reviews.csv" only resolved on Windows and depended on "\R"
# not being a recognised escape sequence.
hotel = pd.read_csv("Data/Reviews.csv")
print(hotel.head(1))

Sort the imported data frame by HOTEL NAME

In [None]:
# Order the rows by hotel name and renumber the index, so that row labels
# 0..n-1 follow the sorted order for any label-based lookup downstream.
hotel = hotel.sort_values('name', ascending=True).reset_index(drop=True)

Split all the reviews into sentences.

Loop over all rows of the dataset, split the text of the Review column into sentences, and collect one row per sentence (hotel name, sentence). Each sentence is later written to its own txt file.

In [None]:
# Pre-allocate an all-NaN frame of 500000 rows that will receive one review
# sentence (with its hotel name) per row.
columns = ['name', 'review_sent']
row_labels = list(range(500000))
hotel_sents_df = pd.DataFrame(columns=columns, index=row_labels)

In [None]:
# Counters used by the sentence-splitting loop.
i = 0
j = 0
k = 0
# Missing reviews become a single space so the sentence tokenizer always
# receives a string. The result is assigned back because fillna(inplace=True)
# on a column selection may operate on a temporary copy and leave `hotel`
# unchanged.
hotel['review'] = hotel['review'].fillna(" ")
# Every row has to be visited: with len(hotel)-1 the range(0, nrows) loop
# never reached the last review.
nrows = len(hotel)
nrows

In [None]:
# Split every review into sentences and keep one (hotel name, sentence) record
# per sentence.
# The records are gathered in a list and converted to a frame once. The former
# chained assignment (hotel_sents_df['name'][k] = ...) may write to a temporary
# copy, and it depended on the pre-allocated frame being large enough.
sentence_records = []
for i in range(0, nrows):
    # .iloc reads by position, so rows are visited in the frame's current
    # (name-sorted) order whatever the index labels are.
    hotel_name = hotel['name'].iloc[i]
    for sentence in nltk.tokenize.sent_tokenize(hotel['review'].iloc[i]):
        sentence_records.append((hotel_name, sentence))
hotel_sents_df = pd.DataFrame(sentence_records, columns=['name', 'review_sent'])
# Total number of sentences collected (kept under the original counter name).
k = len(hotel_sents_df)

In [None]:
# Discard, in place, every row whose 'name' is still missing.
hotel_sents_df.dropna(how='all', subset=['name'], inplace=True)

In [None]:
# State for the file-writing loop: `j` numbers the sentences of the current
# hotel and `hold_hotel` remembers which hotel is being processed.
i, j = 0, 1
hold_hotel = ' '
len(hotel_sents_df)

In [None]:
# Write each sentence to "<hotel name> - <n>.txt" in the working directory;
# n restarts at 1 whenever the hotel name changes.
# NOTE(review): files are written with the platform default encoding; confirm
# this matches what the corpus loader expects.
for i in range(0, len(hotel_sents_df)):  # was len(...)-1, which dropped the last sentence
    current_hotel = hotel_sents_df['name'].iloc[i]
    if hold_hotel != current_hotel:
        j = 1
        hold_hotel = current_hotel
    # The context manager closes the file even when the write raises.
    with open(str(current_hotel) + " - " + str(j) + '.txt', 'w+') as f:
        f.write(str(hotel_sents_df['review_sent'].iloc[i]))
    j += 1

All the text files will be generated where ipynb file is placed. 

Create a folder called `Hotel_Corpus_Sents` next to the notebook (this is the path the next cell loads) and move all the generated text files into it.

Read the created corpus folder of 100k review sentence files

In [None]:
# Load the sentence files from the corpus folder.
hotel_corpus = preprocess1.load_corpus('./Hotel_Corpus_Sents')
# Preview only the first few ids; echoing every file id floods the output.
hotel_corpus.fileids()[:10]

# DATA PREPARATION

Perform the pre processing steps on the corpus:
    
    1. Tokenization
    2. Case conversion to lower
    3. Removal of non alphabetic characters
    4. Stop word removal
    5. Non English words removal
    6. Stemming

In [None]:
# Stemmer and English dictionary used by the preprocessing steps.
stemmer = nltk.stem.porter.PorterStemmer()
d = enchant.Dict("en_US")
# Stop words: NLTK's English list followed by the extra words given here.
new_stop_words = ['hotel','room','negative','good','great','love','recommend','grove']
stop_list = nltk.corpus.stopwords.words('english')
stop_list.extend(new_stop_words)

In [None]:
fids = hotel_corpus.fileids()
# 1. Tokenise the raw text of every file, in fids order.
docs1 = [nltk.word_tokenize(hotel_corpus.raw(fid)) for fid in fids]
# 2. Lower-case every token.
docs2 = [[w.lower() for w in doc] for doc in docs1]
# 3. Keep purely alphabetic tokens (pattern compiled once, outside the loops).
alpha_only = re.compile('^[a-z]+$')
docs3 = [[w for w in doc if alpha_only.search(w)] for doc in docs2]
# 4. Drop stop words; a set gives constant-time membership tests instead of
#    scanning the whole stop list for every token.
stop_set = set(stop_list)
docs4 = [[w for w in doc if w not in stop_set] for doc in docs3]
# 5. Keep only words the en_US dictionary accepts.
docs5 = [[w for w in doc if d.check(w)] for doc in docs4]
# 6. Stem what is left.
hotel_docs = [[stemmer.stem(w) for w in doc] for doc in docs5]

1. Create the corpus dictionary.
2. Create the document sparse matrix representations for all the documents in the corpus.
3. Store the file ids of the corpus documents.

In [None]:
# Preview the first few processed documents instead of echoing the whole corpus.
hotel_docs[:5]

In [None]:
# File ids of the corpus documents.
fids = hotel_corpus.fileids()
# Token <-> id mapping built from the processed documents.
hotel_dictionary = gensim.corpora.Dictionary(documents=hotel_docs)
# Sparse vector for each document, produced by the project helper.
hotel_vecs = preprocess1.docs2vecs(hotel_docs, hotel_dictionary)

Validate the number of files in the corpus

In [None]:
# Number of file ids (one per document) in the corpus.
len(fids)

# LATENT DIRICHLET ALLOCATION - TOPIC MODELLING

Run LDA for topic modelling with number of topics to identify = 7.

In [None]:
# Train LDA with 7 topics. The fixed random_state makes the run reproducible,
# which matters because the topic labels are assigned by column position
# further down.
hotel_lda = gensim.models.ldamodel.LdaModel(
    corpus=hotel_vecs,
    id2word=hotel_dictionary,
    num_topics=7,
    random_state=42,
)

View the top 20 words by TOPIC-WORD distribution for each identified topic

In [None]:
# Show 20 words per topic, matching the description above and the check run
# on the reloaded model (previously only 10 were requested here).
topics = hotel_lda.show_topics(num_topics=7, num_words=20)

for topic in topics:
    print(topic)

Save the generated LDA model

In [None]:
# Persist the trained model to 'lda.model' in the working directory.
hotel_lda.save('lda.model')

Reload the saved model and view the top 20 words of each topic to validate the consistency.

In [None]:
# Load the model back from disk and list 20 words for each of the 7 topics.
model = gensim.models.LdaModel.load('lda.model')
model.show_topics(num_topics=7, num_words=20)

Create an empty (all-NaN) dataframe with N rows, where N is the number of documents in the corpus, to store the DOCUMENT-TOPIC distribution

In [None]:
# One all-NaN row per corpus document; only the File_ID column exists so far.
columns = ['File_ID']
doc_positions = list(range(len(fids)))
Doc_Topic_Dist = pd.DataFrame(columns=columns, index=doc_positions)

Generate and store the document topic distribution for each document

In [None]:
# Zero the loop variables before filling the document-topic table.
i = j = k = 0

In [None]:
# Fill Doc_Topic_Dist with one row per document: its file id plus one column
# per topic id (0 .. num_topics-1, always in that order).
num_topics = model.num_topics
topic_probs = np.full((len(fids), num_topics), np.nan)
for i, val in enumerate(fids):
    # Position i in fids is also the document's position in hotel_vecs, which
    # the earlier fids.index(fids[i]) lookup recomputed with a full scan.
    vec = hotel_vecs[i]
    # minimum_probability=0.0 asks gensim to report low-probability topics too,
    # so a document does not end up with missing topic scores.
    vec_lda = model.get_document_topics(vec, minimum_probability=0.0)
    for j, k in vec_lda:
        # Each entry is (topic id, probability). The probability is stored
        # under its topic id; vec_lda[j][1] wrongly used the topic id as a
        # position in this sparse list.
        topic_probs[i, j] = k
Doc_Topic_Dist["File_ID"] = list(fids)
# Creating the columns in topic-id order keeps the positional renaming that
# follows tied to the right topics.
for topic_id in range(num_topics):
    Doc_Topic_Dist[topic_id] = topic_probs[:, topic_id]

Rename columns with the topic labels

In [None]:
# Human-readable topic labels, applied by column position after File_ID.
topic_labels = ["Location","Staff","Value For Money","Food","NA","NA2","Amenities"]
Doc_Topic_Dist.columns = ["File_ID"] + topic_labels
print(Doc_Topic_Dist)

Create a null dataframe to store the top two topics by distribution for each document

In [None]:
# All-NaN frame, one row per document, that will hold the two highest topic scores.
columns = ['File_ID']
top2_row_labels = list(range(len(fids)))
Doc_Topic_Dist_Top2 = pd.DataFrame(columns=columns, index=top2_row_labels)

Remove the FILE ID column temporarily to identify the top topics for each document

In [None]:
# Column positions 1..7 are kept; position 0 (File_ID) is left out.
topic_column_slice = slice(1, 8)
Doc_Topic_Dist_Sub = Doc_Topic_Dist.iloc[:, topic_column_slice]
Doc_Topic_Dist

In [None]:
# Print both sample sentences: written as bare expressions, only the last one
# in the cell is echoed and the first result is silently discarded.
for sample_file in ('1785 Inn - 10.txt', '1785 Inn - 32.txt'):
    print(hotel_corpus.raw(fileids=sample_file))

Rank the topic distributions and pick the top 2 to represent the document

In [None]:
# Two largest topic scores of every document, computed for all rows at once.
topic_scores = Doc_Topic_Dist_Sub.to_numpy(dtype=float)
# Missing scores (NaN) are ranked last. heapq.nlargest compares with "<",
# which gives an unreliable order as soon as a NaN is present.
sortable_scores = np.where(np.isnan(topic_scores), -np.inf, topic_scores)
# Sorting the negated scores ascending yields the original scores descending.
top_two = -np.sort(-sortable_scores, axis=1)[:, :2]
# Turn the placeholder back into NaN where a document had fewer than two scores.
top_two[np.isneginf(top_two)] = np.nan
Doc_Topic_Dist_Top2[1] = top_two[:, 0]
Doc_Topic_Dist_Top2[2] = top_two[:, 1]

Re-attach the FILE ID column for readability

In [None]:
# Copy the file ids next to the two scores, then print the table.
Doc_Topic_Dist_Top2 = Doc_Topic_Dist_Top2.assign(File_ID=Doc_Topic_Dist['File_ID'])
print(Doc_Topic_Dist_Top2)

Identify the labels of the top two topics for each document, create a null dataframe to store it

In [None]:
# All-NaN frame, one row per document, for the labels of its two leading topics.
columns = ['File_ID', 'Top1_Topic', 'Top2_Topic']
label_row_ids = list(range(len(fids)))
Doc_Topic_Top2 = pd.DataFrame(columns=columns, index=label_row_ids)
Doc_Topic_Top2

Remove the FILE ID column temporarily to identify the top topics for each document

In [None]:
# Keep column positions 1..7 only, leaving out File_ID at position 0.
score_columns = slice(1, 8)
Doc_Topic_Dist_Sub2 = Doc_Topic_Dist.iloc[:, score_columns]

Rank the topic labels and pick the top 2 to represent the document

In [None]:
# Labels of the two highest-scoring topics of every document.
topic_scores = Doc_Topic_Dist_Sub2.to_numpy(dtype=float)
# Missing scores (NaN) are ranked last instead of disturbing the comparison.
sortable_scores = np.where(np.isnan(topic_scores), -np.inf, topic_scores)
# Rank column positions by descending score. The sort is stable, so tied
# scores keep their left-to-right order and the two picks are always two
# different columns. Looking each score up by value returned the same first
# column twice whenever the two top scores were equal.
ranked_columns = np.argsort(-sortable_scores, axis=1, kind='stable')
topic_names = Doc_Topic_Dist_Sub2.columns.to_numpy()
Doc_Topic_Top2["Top1_Topic"] = topic_names[ranked_columns[:, 0]]
Doc_Topic_Top2["Top2_Topic"] = topic_names[ranked_columns[:, 1]]

Re-attach the FILE ID column for readability

In [None]:
# Copy the File_ID column from Doc_Topic_Dist (aligned on the row index).
Doc_Topic_Top2['File_ID'] = Doc_Topic_Dist['File_ID']

In [None]:
# Show the file id and the two topic labels chosen for each document.
print(Doc_Topic_Top2)

# SENTIMENT ANALYSIS 

In [None]:
# Lexicon object passed to the classifier as `L` below
# (presumably SentiWordNet 3, going by the class name - TODO confirm).
SWN = sentlex.SWN3Lexicon()
# Document-level scorer; its classify_document() result is read below as a
# (positive, negative) pair.
classifier = sentlex.sentanalysis.BasicDocSentiScore()

In [None]:
# All-NaN frame, one row per document, for the sentiment scores, plus the
# starting values used by the scoring loop.
columns = ['File_ID', 'Positive Score', 'Negative Score']
senti_row_labels = list(range(len(fids)))
Doc_Senti = pd.DataFrame(columns=columns, index=senti_row_labels)
classifier_result = (0, 0)
idx = 0

In [None]:
# Score every document and collect (file id, positive, negative) records.
# The frame is built once from the records: the former chained assignment
# (Doc_Senti['File_ID'][idx] = ...) may write to a temporary copy, and it left
# the scores in object-typed columns.
senti_records = []
for fil in fids:
    classifier_result = classifier.classify_document(
        hotel_corpus.raw(fileids=fil),
        tagged=False, L=SWN, a=True, v=True, n=False, r=False,
        negation=True, verbose=False,
    )
    # Element 0 is stored as the positive score, element 1 as the negative one.
    senti_records.append((fil, classifier_result[0], classifier_result[1]))
Doc_Senti = pd.DataFrame(
    senti_records, columns=['File_ID', 'Positive Score', 'Negative Score']
)
# Number of documents scored (kept under the original counter name).
idx = len(senti_records)

In [None]:
# Display the sentiment table. The frame is named Doc_Senti; the previous
# `Doc_Sent` is not defined anywhere and raised NameError.
Doc_Senti

In [None]:
# Documents whose negative score exceeds their positive score. Uses Doc_Senti,
# the frame filled above; the previous `Doc_Sent` is undefined.
print(Doc_Senti[Doc_Senti['Positive Score'] < Doc_Senti['Negative Score']])