In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import NMF


class get_concerns:
    
    def __init__(self, data_path):
        
        self.trip_review_data_path  = data_path
        self.read_data()
        self.pre_process()
        self.analyze_sentiment()
        self.feature_extraction()
        self.model_training_and_evaluation()
        self.topic_modelling()
        
    def preprocess_text(self, text):
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'\d+','', text)
        text = text.lower()
        text = text.strip()
        tokens = text.split()
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]
        return ' '.join(tokens)
    
    def create_sentiment(self, text):
        
        analysis=TextBlob(text)
        if analysis.sentiment.polarity>0:
            return 'Positive'
        elif analysis.sentiment.polarity<0:
            return 'Negative'
        else:
            return 'Neutral'
        
    def read_data(self):
        
        self.trip_review_data = pd.read_csv(self.trip_review_data_path)
        
    def pre_process(self):
        
        self.trip_review_data['cleaned_review'] = [str(self.preprocess_text(text))
                                                   for text in self.trip_review_data.Review]
        
    def analyze_sentiment(self):
        
        self.trip_review_data['sentiment']=[str(self.create_sentiment(x))
                           for x in self.trip_review_data.Review]
        print(self.trip_review_data['sentiment'].value_counts())
        
    def feature_extraction(self):
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.trip_review_data['cleaned_review'], 
                                                                                self.trip_review_data['sentiment'], 
                                                                                test_size=0.2, random_state=42)
        # Vectorize text
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.X_test = self.vectorizer.transform(self.X_test)

    def model_training_and_evaluation(self):
        
        model = SVC(kernel= 'linear')
        model.fit(self.X_train, self.y_train)

        # Evaluate model
        self.y_pred = model.predict(self.X_test)
        print(classification_report(self.y_test, self.y_pred))
        
    def topic_modelling(self):
        print('--------------------------The Overall Key Aspects are:----------------------------------------')
        nmf = NMF(n_components=10, random_state=42)
        X_train_nmf = nmf.fit_transform(self.X_train)
        
        def display_topics(model, feature_names, no_top_words):
            for topic_idx, topic in enumerate(model.components_):
                print("Topic %d:" % (topic_idx))
                print(" ".join([feature_names[i]
                                for i in topic.argsort()[:-no_top_words - 1:-1]]))

        no_top_words = 10
        feature_names = self.vectorizer.get_feature_names_out()
        display_topics(nmf, feature_names, no_top_words)

        print('--------------------------The negative key aspects are:-------------------------------------')
        negative_reviews = self.trip_review_data[self.trip_review_data['sentiment'] == 'Negative']
        if not negative_reviews.empty:
            # Apply TF-IDF vectorization
            X_tfidf = self.vectorizer.transform(negative_reviews['cleaned_review'])

            # Topic modeling with NMF
            nmf = NMF(n_components=10, random_state=42)
            X_nmf = nmf.fit_transform(X_tfidf)

            # Get top keywords for each topic
            feature_names = self.vectorizer.get_feature_names_out()
            no_top_words = 10
            for topic_idx, topic in enumerate(nmf.components_):
                print(f"Topic #{topic_idx}:")
                print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        else:
            print("No negative reviews found.")


In [27]:
data_path = r"C:\Users\sadin\Desktop\Prudent\NLP\tripadvisor_hotel_reviews.csv"
get_concerns(data_path)

Positive    19112
Negative     1356
Neutral        23
Name: sentiment, dtype: int64


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    Negative       0.82      0.55      0.66       256
     Neutral       0.00      0.00      0.00        12
    Positive       0.97      0.99      0.98      3831

    accuracy                           0.96      4099
   macro avg       0.60      0.51      0.55      4099
weighted avg       0.96      0.96      0.96      4099

--------------------------The Overall Key Aspects are:----------------------------------------
Topic 0:
service room day desk told time check arrived got asked
Topic 1:
resort beach pool food beautiful vacation restaurant people drink water
Topic 2:
great location staff service time really value place view stay
Topic 3:
walk station minute metro bus city train euro street min
Topic 4:
room bed floor view bathroom small night shower window large
Topic 5:
nt ca really place like people want thing time review
Topic 6:
hotel star stayed best recommend staying located city booked business
Topic 7:
good value location

<__main__.get_concerns at 0x1a320988f40>