In [120]:
import pandas as pd

# Folder that holds the four review CSV files. Change this single value when
# the data lives somewhere else (the default looks like a Colab Google Drive
# mount -- confirm for your environment).
DATA_DIR = '/content/drive/MyDrive'

# One DataFrame per review dataset.
seat_df = pd.read_csv(f'{DATA_DIR}/seat.csv')
lounge_df = pd.read_csv(f'{DATA_DIR}/lounge.csv')
airline_df = pd.read_csv(f'{DATA_DIR}/airline.csv')
airport_df = pd.read_csv(f'{DATA_DIR}/airport.csv')




#### Please run this section before running the all-categories section below


###### This is a single-category test I did to figure things out

In [None]:

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# Words that tokenize() discards; kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Names of the rating categories that the counts and priors are keyed by.
categories = [
    "overall",
    "queuing",
    "seat_comfort",
    "cabin",
    "staff",
    "value_for_money",
    "food",
    "shopping",
    "wifi",
    "hygiene",
    "entertainment",
]


def tokenize(text):
    """Split a review into lowercase tokens with stop words removed.

    Selected multi-word sentiment phrases (for example "not comfortable") are
    joined with an underscore so that they are kept as a single token
    ("not_comfortable").

    Args:
        text: Review text. A value that is not a string (for example the NaN
            pandas uses for an empty CSV cell) is treated as an empty review.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    phrases_to_capture = [
        'not comfortable', 'not clean', 'not satisfied', 'not friendly',
        'very comfortable', 'excellent service', 'highly recommend', 'great experience',
        'poor quality', 'bad experience', 'disappointed'
    ]

    # Normalise first: drop everything except letters/whitespace, lowercase,
    # and collapse whitespace runs so the lowercase phrases can match.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    text = re.sub(r'\s+', ' ', text)

    # Join phrases only AFTER the clean-up above; the non-letter filter would
    # otherwise delete the underscore and fuse the phrase into one word.
    for phrase in phrases_to_capture:
        text = text.replace(phrase, phrase.replace(" ", "_"))

    return [token for token in text.split() if token not in stop_words]



In [141]:

# For each dataset: rating column name in that dataset -> shared category name.
mappings = dict(
    lounge=dict(
        overall_rating='overall',
        comfort_rating='seat_comfort',
        staff_service_rating='staff',
        bar_beverages_rating='food',
        wifi_connectivity_rating='wifi',
        cleanliness_rating='hygiene',
    ),
    airport=dict(
        overall_rating='overall',
        queuing_rating='queuing',
        terminal_seating_rating='seat_comfort',
        airport_staff_rating='staff',
        food_beverages_rating='food',
        airport_shopping_rating='shopping',
        wifi_connectivity_rating='wifi',
        terminal_cleanliness_rating='hygiene',
    ),
    airline=dict(
        overall_rating='overall',
        seat_comfort_rating='seat_comfort',
        cabin_staff_rating='staff',
        value_money_rating='value_for_money',
        food_beverages_rating='food',
        wifi_connectivity_rating='wifi',
        inflight_entertainment_rating='entertainment',
    ),
    seat=dict(
        overall_rating='overall',
        seat_legroom_rating='seat_comfort',
        viewing_tv_rating='entertainment',
    ),
)


In [None]:

def fill_missing_values(dataframe, relevant_columns):
    """Return a copy of ``dataframe`` with missing values in the given columns set to "Unknown".

    The input frame is not modified, so the raw data stays available and the
    returned frame is a genuinely separate object.

    Args:
        dataframe: pandas DataFrame to clean.
        relevant_columns: Iterable of column names whose missing values are
            replaced by the string "Unknown".

    Returns:
        A new DataFrame with the replacements applied; other columns are
        copied unchanged.

    Raises:
        KeyError: If a name in ``relevant_columns`` is not a column.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    filled = dataframe.copy()
    for column in relevant_columns:
        filled[column] = filled[column].fillna("Unknown")
    return filled


# Replace missing values in the two rating columns used for each dataset.
lounge_df_filled = fill_missing_values(
    lounge_df, ['comfort_rating', 'overall_rating']
)
airport_df_filled = fill_missing_values(
    airport_df, ['terminal_seating_rating', 'overall_rating']
)
airline_df_filled = fill_missing_values(
    airline_df, ['seat_comfort_rating', 'overall_rating']
)
seat_df_filled = fill_missing_values(
    seat_df, ['seat_legroom_rating', 'overall_rating']
)

# Report the number of rows in each filled dataset.
print("Filled Lounge Dataset Rows:", len(lounge_df_filled.index))
print("Filled Airport Dataset Rows:", len(airport_df_filled.index))
print("Filled Airline Dataset Rows:", len(airline_df_filled.index))
print("Filled Seat Dataset Rows:", len(seat_df_filled.index))


In [None]:

# Imported in this cell because both names are needed right here, while the
# notebook's own import cell for them only appears further down; without this
# line the cell raises NameError on a fresh kernel.
from collections import Counter, defaultdict

# word_counts[category][word][rating] -> weighted occurrences of `word` in
# reviews that carry `rating` for `category`.
word_counts = {cat: defaultdict(Counter) for cat in categories}
# rating_counts[category][rating] -> number of reviews counted with that rating.
rating_counts = {cat: Counter() for cat in categories}

def process_reviews_all_categories(dataframe, text_column, mapping):
    """Accumulate rating counts and per-rating word counts from one dataset.

    For every row, and for every ``rating column -> category`` pair in
    ``mapping`` whose value converts to an integer between 1 and 5, this
    increments ``rating_counts[category][rating]`` and, for each token of the
    review text, ``word_counts[category][token][rating]``. Both tables are
    module-level dictionaries and are updated in place.

    Args:
        dataframe: pandas DataFrame with the rating columns and the text column.
        text_column: Name of the column holding the review text.
        mapping: Dict mapping rating column names to category names.

    Returns:
        None.
    """
    for _, row in dataframe.iterrows():
        # The review text is the same for every category of a row, so it is
        # tokenized lazily and at most once per row.
        words = None
        for actual_col, category in mapping.items():
            raw_rating = row[actual_col]
            if raw_rating == "Unknown":
                continue
            try:
                rating = int(raw_rating)
            except (ValueError, TypeError, OverflowError):
                # NaN, None, inf or non-numeric text: no usable rating here.
                continue
            if rating < 1 or rating > 5:
                continue

            rating_counts[category][rating] += 1

            if words is None:
                text = row[text_column]
                # A missing review (NaN) still counts towards the rating
                # totals but contributes no words.
                words = tokenize(text) if isinstance(text, str) else []
            for word in words:
                word_counts[category][word][rating] += 1


# Feed every dataset through the counter, each with its own column mapping.
process_reviews_all_categories(
    lounge_df_filled, 'content', mappings['lounge']
)
process_reviews_all_categories(
    airport_df_filled, 'content', mappings['airport']
)
process_reviews_all_categories(
    airline_df_filled, 'content', mappings['airline']
)
process_reviews_all_categories(
    seat_df_filled, 'content', mappings['seat']
)

# Show what was accumulated for the 'overall' category.
print(
    "Word Counts for 'overall' category after processing:",
    word_counts['overall'],
    sep="\n",
)
print(
    "\nRating Counts for 'overall' category after processing:",
    rating_counts['overall'],
    sep="\n",
)


In [None]:
# prior_probs[category][rating] = share of the category's counted reviews
# that carry that rating.
prior_probs = {}
for cat in categories:
    total_reviews = sum(rating_counts[cat].values())
    prior_probs[cat] = {}
    for rating, count in rating_counts[cat].items():
        prior_probs[cat][rating] = count / total_reviews

print(
    "Prior probabilities for 'overall' category:",
    prior_probs['overall'],
    sep="\n",
)


In [None]:
def compute_likelihood(word, rating, category):
    """Score how strongly ``word`` is tied to ``rating`` within ``category``.

    The score is the count of ``word`` under ``rating`` divided by the count
    of ``word`` under all ratings, both read from ``word_counts``. When the
    word has no count for this rating (including words never seen at all) the
    fallback ``1 / (reviews counted in the category + 1e-6)`` is returned.

    NOTE(review): the ratio is the share of the word's occurrences that fall
    under the rating, which reads more like P(rating | word) than
    P(word | rating) -- confirm this is the intended quantity.

    Args:
        word: Token to look up.
        rating: Rating value used as key in the count tables.
        category: Category name used as key in the count tables.

    Returns:
        float: The score described above.
    """
    # .get() instead of indexing: word_counts[category] is a defaultdict, and
    # indexing it would silently add an empty entry for every unseen word.
    per_rating = word_counts[category].get(word)
    word_count_for_rating = per_rating[rating] if per_rating else 0

    if word_count_for_rating == 0:
        total_reviews_in_category = sum(rating_counts[category].values())
        return 1 / (total_reviews_in_category + 1e-6)

    return word_count_for_rating / sum(per_rating.values())


# Spot-check the likelihood of one captured phrase for one rating/category.
test_phrase, test_rating, test_category = "not_comfortable", 1, "seat_comfort"

likelihood = compute_likelihood(
    word=test_phrase, rating=test_rating, category=test_category
)
print(f"Likelihood of phrase '{test_phrase}' for rating {test_rating} in '{test_category}': {likelihood}")


FOR ALL CATEGORIES (PLEASE RUN THE PREVIOUS CELLS BEFORE RUNNING THIS)

------------------------------------------------

In [148]:
def tokenize(text):
    """Tokenize a review, separating what comes before and after "but".

    The text is stripped of non-letters, lowercased, and selected sentiment
    phrases (for example "not comfortable") are joined with an underscore so
    they are kept as a single token. Stop words are removed.

    Args:
        text: Review text. A value that is not a string (for example the NaN
            pandas uses for an empty CSV cell) is treated as an empty review.

    Returns:
        tuple[list[str], list[str]]: ``(before_but, after_but)`` when the text
            contains the word "but" surrounded by whitespace. Everything after
            the first "but" (including text after later "but"s) ends up in
            ``after_but``.
        list[str]: A flat token list otherwise, with the word "and"
            (surrounded by whitespace) removed.
    """
    if not isinstance(text, str):
        return []

    phrases_to_capture = [
        'not comfortable', 'not clean', 'not satisfied', 'not friendly',
        'uncomfortable', 'very comfortable', 'excellent service', 'highly recommend',
        'great experience', 'poor quality', 'bad experience', 'disappointed', 'good'
    ]

    # Normalise first: drop everything except letters/whitespace, lowercase,
    # and collapse whitespace runs so the lowercase phrases can match.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    text = re.sub(r'\s+', ' ', text)

    # Join phrases only AFTER the clean-up above; the non-letter filter would
    # otherwise delete the underscore and fuse the phrase into one word.
    for phrase in phrases_to_capture:
        text = text.replace(phrase, phrase.replace(" ", "_"))

    def _content_tokens(fragment):
        # Tokens of one text fragment, without stop words.
        return [token for token in fragment.split() if token not in stop_words]

    parts_but = re.split(r'\s+but\s+', text)
    if len(parts_but) > 1:
        before_but = _content_tokens(parts_but[0])
        # Keep every later segment so text after a second "but" is not lost.
        after_but = [
            token for part in parts_but[1:] for token in _content_tokens(part)
        ]
        return before_but, after_but

    combined_tokens = []
    for part in re.split(r'\s+and\s+', text):
        combined_tokens.extend(_content_tokens(part))
    return combined_tokens


In [149]:


# For each dataset: rating column name in that dataset -> shared category name.
mappings = dict(
    lounge=dict(
        overall_rating='overall',
        comfort_rating='seat_comfort',
        staff_service_rating='staff',
        bar_beverages_rating='food',
        wifi_connectivity_rating='wifi',
        cleanliness_rating='hygiene',
    ),
    airport=dict(
        overall_rating='overall',
        queuing_rating='queuing',
        terminal_seating_rating='seat_comfort',
        airport_staff_rating='staff',
        food_beverages_rating='food',
        airport_shopping_rating='shopping',
        wifi_connectivity_rating='wifi',
        terminal_cleanliness_rating='hygiene',
    ),
    airline=dict(
        overall_rating='overall',
        seat_comfort_rating='seat_comfort',
        cabin_staff_rating='staff',
        value_money_rating='value_for_money',
        food_beverages_rating='food',
        wifi_connectivity_rating='wifi',
        inflight_entertainment_rating='entertainment',
    ),
    seat=dict(
        overall_rating='overall',
        seat_legroom_rating='seat_comfort',
        viewing_tv_rating='entertainment',
    ),
)


In [None]:

def fill_missing_values(dataframe):
    """Return a copy of ``dataframe`` with every missing value set to "Unknown".

    The input frame is not modified, so the raw data stays available and the
    returned frame is a genuinely separate object.

    Args:
        dataframe: pandas DataFrame to clean.

    Returns:
        A new DataFrame in which each column that contained missing values
        has them replaced by the string "Unknown"; columns without missing
        values are copied unchanged.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    filled = dataframe.copy()
    for column in filled.columns:
        if filled[column].isnull().any():
            filled[column] = filled[column].fillna("Unknown")
    return filled


# Replace missing values in every column of each dataset.
lounge_df_filled = fill_missing_values(dataframe=lounge_df)
airport_df_filled = fill_missing_values(dataframe=airport_df)
airline_df_filled = fill_missing_values(dataframe=airline_df)
seat_df_filled = fill_missing_values(dataframe=seat_df)

# Report the number of rows in each filled dataset.
print("Filled Lounge Dataset Rows:", len(lounge_df_filled.index))
print("Filled Airport Dataset Rows:", len(airport_df_filled.index))
print("Filled Airline Dataset Rows:", len(airline_df_filled.index))
print("Filled Seat Dataset Rows:", len(seat_df_filled.index))


In [153]:
from collections import Counter
from collections import defaultdict

In [155]:

def process_reviews_all_categories(dataframe, text_column, mapping):
    """Accumulate rating counts and weighted word counts from one dataset.

    For every row, and for every ``rating column -> category`` pair in
    ``mapping`` whose value converts to an integer between 1 and 5, this
    increments ``rating_counts[category][rating]`` and adds the review's
    tokens to ``word_counts[category][token][rating]``. When ``tokenize``
    returns a ``(before_but, after_but)`` tuple, tokens after "but" are added
    with weight 2 and tokens before it with weight 1; a flat token list is
    added with weight 1. Both tables are module-level dictionaries and are
    updated in place.

    Args:
        dataframe: pandas DataFrame with the rating columns and the text column.
        text_column: Name of the column holding the review text.
        mapping: Dict mapping rating column names to category names.

    Returns:
        None.
    """
    for _, row in dataframe.iterrows():
        # The review text is the same for every category of a row, so it is
        # tokenized lazily and at most once per row.
        weighted_groups = None
        for actual_col, category in mapping.items():
            raw_rating = row[actual_col]
            if raw_rating == "Unknown":
                continue
            try:
                rating = int(raw_rating)
            except (ValueError, TypeError, OverflowError):
                # NaN, None, inf or non-numeric text: no usable rating here.
                continue
            if rating < 1 or rating > 5:
                continue

            rating_counts[category][rating] += 1

            if weighted_groups is None:
                text = row[text_column]
                # A missing review (NaN) still counts towards the rating
                # totals but contributes no words.
                tokens = tokenize(text) if isinstance(text, str) else []
                if isinstance(tokens, tuple):
                    before_but, after_but = tokens
                    weighted_groups = ((before_but, 1), (after_but, 2))
                else:
                    weighted_groups = ((tokens, 1),)

            for words, weight in weighted_groups:
                for word in words:
                    word_counts[category][word][rating] += weight


In [None]:

# prior_probs[category][rating] = share of the category's counted reviews
# that carry that rating (0 when the category has no counted reviews).
# NOTE(review): this reads rating_counts as it currently stands; no call to
# process_reviews_all_categories is visible between its redefinition and this
# cell -- confirm the counts were refreshed with the new tokenizer.
prior_probs = {}
for cat in categories:
    total_reviews = sum(rating_counts[cat].values())
    prior_probs[cat] = {}
    for rating, count in rating_counts[cat].items():
        if total_reviews > 0:
            prior_probs[cat][rating] = count / total_reviews
        else:
            prior_probs[cat][rating] = 0

print(
    "Prior probabilities for 'overall' category:",
    prior_probs['overall'],
    sep="\n",
)


In [158]:

def compute_likelihood(word, rating, category):
    """Score how strongly ``word`` is tied to ``rating`` within ``category``.

    Returns the count of ``word`` under ``rating`` divided by the count of
    ``word`` under all ratings, read from ``word_counts``. A floor of 1e-6 is
    returned instead when the word is unknown for the category, when its
    total count is zero, or when the ratio itself is not positive.
    """
    floor = 1e-6
    category_table = word_counts[category]

    if word not in category_table:
        return floor

    per_rating = category_table[word]
    occurrences = sum(per_rating.values())
    hits = per_rating[rating]

    if occurrences == 0:
        return floor

    share = hits / occurrences
    if share > 0:
        return share
    return floor


In [159]:
import math

In [161]:

def predict_rating(review_text, category):
    """Pick the rating with the highest log-posterior for a review.

    For each rating that has a prior in ``prior_probs[category]`` the score is
    ``log(prior)`` plus the sum of ``log(compute_likelihood(...))`` over the
    review's tokens. When ``tokenize`` returns a ``(before_but, after_but)``
    tuple, the log-likelihood of each token after "but" is counted twice.

    Returns:
        The best-scoring rating, or None when no rating scores above
        negative infinity (for example when the category has no priors).
    """
    tokens = tokenize(review_text)

    # Flatten both tokenizer output shapes into (word, weight) pairs.
    if isinstance(tokens, tuple):
        before_but, after_but = tokens
        weighted_words = [(word, 1) for word in before_but]
        weighted_words += [(word, 2) for word in after_but]
    else:
        weighted_words = [(word, 1) for word in tokens]

    neg_inf = -float('inf')
    best_rating, max_log_posterior = None, neg_inf

    for rating, prior in prior_probs[category].items():
        log_prior = math.log(prior) if prior > 0 else neg_inf

        log_likelihood = 0
        for word, weight in weighted_words:
            likelihood = compute_likelihood(word, rating, category)
            if likelihood > 0:
                log_likelihood += weight * math.log(likelihood)

        log_posterior = log_prior + log_likelihood
        # Strict comparison: the first rating reaching the maximum wins ties.
        if log_posterior > max_log_posterior:
            best_rating, max_log_posterior = rating, log_posterior

    return best_rating


In [None]:

def predict_ratings_for_all_categories(review_text):
    """Predict a rating per category and derive an aggregate 'overall' rating.

    ``predict_rating`` is called once for every name in ``categories``. The
    'overall' entry of the result is then replaced by the mean of all
    predictions that are not None, rounded to the nearest rating with the
    cut-offs 4.5 / 3.5 / 2.5 / 1.5. The mean is taken on the same 1-5 scale
    that the cut-offs use, so uniformly neutral predictions give 3.

    Args:
        review_text: Review text passed on to ``predict_rating``.

    Returns:
        dict: Category name -> predicted rating (None when no prediction was
        possible), plus the aggregated 'overall' rating. 'overall' is 3 when
        no category produced a prediction.
    """
    predicted_ratings = {
        category: predict_rating(review_text, category) for category in categories
    }

    available = [rating for rating in predicted_ratings.values() if rating is not None]
    if not available:
        # Nothing to aggregate: fall back to the neutral rating.
        predicted_ratings['overall'] = 3
        return predicted_ratings

    overall_score = sum(available) / len(available)

    if overall_score >= 4.5:
        predicted_ratings['overall'] = 5
    elif overall_score >= 3.5:
        predicted_ratings['overall'] = 4
    elif overall_score >= 2.5:
        predicted_ratings['overall'] = 3
    elif overall_score >= 1.5:
        predicted_ratings['overall'] = 2
    else:
        predicted_ratings['overall'] = 1

    return predicted_ratings


# Sample review(s) used to try out the all-category prediction.
mixed_reviews = [
    "The food was poor quality, and the wifi didn't work at all. The staff was not friendly.",
]

for review in mixed_reviews:
    predicted_ratings = predict_ratings_for_all_categories(review)
    print(
        f"Review: {review}",
        f"Predicted Ratings: {predicted_ratings}\n",
        sep="\n",
    )
