In [9]:
import pandas as pd
import numpy as np
from nltk.stem import LancasterStemmer  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [None]:
# Load the cleaned hotel-review dataset from the working directory.
# NOTE(review): the saved output of this cell shows a ParserError, yet later
# cells display loaded rows - the saved output looks stale; re-run to confirm.
df = pd.read_csv(filepath_or_buffer="hotel_reviews_clean.csv")


ParserError: Error tokenizing data. C error: Expected 3 fields in line 17, saw 4


In [None]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Review,Score
0,"The hotel was fantastic, I had an amazing expe...",5
1,Terrible stay. The staff was rude and unhelpful.,3
2,Loved the location! The hotel was clean and co...,4
3,The worst experience ever. Will never book again.,4
4,"Amazing service, very friendly staff, and grea...",2


In [None]:
# Peek at the last five rows to check the file was read through to the end.
df.tail(n=5)

Unnamed: 0,Review,Score
995,"Noisy, uncomfortable, and overpriced. Very dis...",3
996,"The breakfast was delicious, and the staff was...",4
997,Never coming back. Bad customer service and un...,1
998,"Spacious rooms, comfortable beds, and a great ...",2
999,"The check-in process was slow, and the recepti...",2


In [None]:
# Stem every review so inflected forms of a word share one vocabulary entry.
stemmer = LancasterStemmer()

# Coerce to str first so non-string cells (e.g. NaN) do not break .split().
# Tokens are produced by whitespace splitting only, so punctuation stays
# attached to the neighbouring word when it is handed to the stemmer.
reviews_as_text = df["Review"].astype(str)
df["stemmed_reviews"] = reviews_as_text.map(
    lambda text: " ".join(stemmer.stem(token) for token in text.split())
)


In [None]:
# Target variable: the score attached to each review.
y = df["Score"]

# Features: bag-of-words counts learned from the stemmed review text.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["stemmed_reviews"])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares regressor on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:

# Score the held-out reviews with the fitted model.
y_pred = model.predict(X=X_test)

In [None]:
# Average absolute gap between true and predicted scores on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

Mean Absolute Error: 1.30


In [None]:
def predict_review(review):
    """Predict a rating for a single review using the fitted model.

    The input goes through the same preparation as the training reviews:
    it is coerced to ``str``, split on whitespace, each token is stemmed with
    the module-level ``stemmer``, the stems are re-joined with single spaces,
    and the result is vectorized with the module-level ``vectorizer`` before
    being passed to ``model``.

    Args:
        review: The review text. Non-string values (for example a NaN coming
            from a missing cell) are converted with ``str()``, mirroring the
            ``.astype(str)`` applied to the training column, instead of
            raising ``AttributeError`` on ``.split()``.

    Returns:
        The model's prediction for the review, rounded to one decimal place.
    """
    # Coerce up front: a float NaN or other non-string value has no .split().
    text = str(review)
    stemmed_review = " ".join(stemmer.stem(word) for word in text.split())
    # transform() expects an iterable of documents, hence the one-item list.
    review_vectorized = vectorizer.transform([stemmed_review])
    predicted_score = model.predict(review_vectorized)[0]
    return round(predicted_score, 1)

In [None]:
# Hand-written sample reviews (positive, negative, mixed) for a quick smoke test.
new_reviews = list(
    (
        "The hotel was fantastic, very clean and comfortable!",
        "Terrible experience, the staff was rude and unhelpful.",
        "Decent hotel, but overpriced for what you get.",
    )
)

In [None]:
# Print each sample review together with the rating the model assigns to it.
for review in new_reviews:
    message = f"Review: {review}\nPredicted Rating: {predict_review(review)}\n"
    print(message)

Review: The hotel was fantastic, very clean and comfortable!
Predicted Rating: 3.0

Review: Terrible experience, the staff was rude and unhelpful.
Predicted Rating: 3.0

Review: Decent hotel, but overpriced for what you get.
Predicted Rating: 3.2



In [None]:
# Predict ratings for all reviews in the dataset.
# The column is coerced to str first, exactly as it was before training, so a
# missing review (NaN) is scored as the text "nan" rather than failing on
# .split() inside predict_review.
df["predicted_rating"] = df["Review"].astype(str).apply(predict_review)

# NOTE(review): this scores every row, including the rows the model was fitted
# on, so the comparison below is not a held-out evaluation.
df_result = df[["Review", "Score", "predicted_rating"]]


print(df_result.head())  # Displaying the first few rows
print(df_result.tail())  # Displaying the last few rows


                                              Review  Score  predicted_rating
0  The hotel was fantastic, I had an amazing expe...      5               3.6
1   Terrible stay. The staff was rude and unhelpful.      3               2.8
2  Loved the location! The hotel was clean and co...      4               2.7
3  The worst experience ever. Will never book again.      4               2.6
4  Amazing service, very friendly staff, and grea...      2               2.9
                                                Review  Score  \
995  Noisy, uncomfortable, and overpriced. Very dis...      3   
996  The breakfast was delicious, and the staff was...      4   
997  Never coming back. Bad customer service and un...      1   
998  Spacious rooms, comfortable beds, and a great ...      2   
999  The check-in process was slow, and the recepti...      2   

     predicted_rating  
995               3.1  
996               3.1  
997               2.8  
998               3.0  
999               2.8