In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

reviews_df = pd.read_csv('Data/books_and_reviews.csv', usecols=['title', 'review_summary'])
reviews_df.head(10)

Unnamed: 0,title,review_summary
0,Its Only Art If Its Well Hung!,Nice collection of Julie Strain images
1,Dr. Seuss: American Icon,Really Enjoyed It
2,Dr. Seuss: American Icon,Essential for every personal and Public Library
3,Dr. Seuss: American Icon,Phlip Nel gives silly Seuss a serious treatment
4,Dr. Seuss: American Icon,Good academic overview
5,Dr. Seuss: American Icon,One of America's greatest creative talents
6,Dr. Seuss: American Icon,A memorably excellent survey of Dr. Seuss' man...
7,Dr. Seuss: American Icon,Academia At It's Best
8,Dr. Seuss: American Icon,And to think that I read it on the tram!
9,Dr. Seuss: American Icon,Fascinating account of a genius at work


In [6]:
# we need topics for each review which summarise them. We have the function below which takes the model which we will define later, 
# the kinds of words or terms that correspond to topics and a count of representative words or topics we want. It then iterates through the 
# model giving an index to each topic and assigning each item in the model a topic and finally printing these results.

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [9]:
# our main function for performing topic modelling

def main_topics(reviews_df):
    # CountVectorizer converts the text into a term-document matrix. Rows correspond to documents and columns to words and looks at frequency of words.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(reviews_df['review_summary'])
    # LDA is the topic modelling function which finds topics in documents - we have specified 5 topics per document
    lda = LatentDirichletAllocation(n_components=5, random_state=0) 
    lda.fit(doc_term_matrix)

    # we then use the previous function to display the top 10 words given to each of the 5 topics
    no_top_words = 10
    display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)
    # This function finds the probability of each review belonging to a topic in a matrix with each review being a row and each column a topic
    topic_values = lda.transform(doc_term_matrix)
    reviews_df['Main_Topic'] = topic_values.argmax(axis=1)

    reviews_df.to_csv('Data/Books_with_Topics.csv', index=False)

    print(reviews_df.head())

In [10]:
# testing our function on the top 1000 reviews as our file is currently too large to run quickly
test = reviews_df.head(1000)

review_main_topics = main_topics(test)

print(review_main_topics)

Topic 0:
book read best awesome world night books ve favorite way
Topic 1:
good excellent book reference mystery complete interesting life ok medicine
Topic 2:
great book unusual cruel sourdough just fun fabulous reading review
Topic 3:
time quot don waste love food loved little information useful
Topic 4:
love series wonderful story best guide curse scarletti smith scarpetta
                            title  \
0  Its Only Art If Its Well Hung!   
1        Dr. Seuss: American Icon   
2        Dr. Seuss: American Icon   
3        Dr. Seuss: American Icon   
4        Dr. Seuss: American Icon   

                                    review_summary  Main_Topic  
0           Nice collection of Julie Strain images           0  
1                                Really Enjoyed It           4  
2  Essential for every personal and Public Library           3  
3  Phlip Nel gives silly Seuss a serious treatment           4  
4                           Good academic overview           1  
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['Main_Topic'] = topic_values.argmax(axis=1)
