In [1]:
import math
from pathlib import Path

import pandas as pd
import xgboost as xgb
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Read the preprocessed train and test splits (in that order).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/train_preprocessed_v4.csv",
        "./dataset/test_preprocessed_v4.csv",
    )
)

# Stack both splits row-wise (train rows first) so one vectorizer can be
# fitted on all review text. The original row labels of each split are kept.
df = pd.concat([df_train, df_test])

In [3]:
# Summarise the combined frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556871 entries, 0 to 278435
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0.2       556871 non-null  int64  
 1   Unnamed: 0.1       556871 non-null  int64  
 2   Unnamed: 0         556871 non-null  int64  
 3   hotel_name         556871 non-null  object 
 4   hotel_location     540465 non-null  object 
 5   review_id          556871 non-null  object 
 6   review_date        556593 non-null  object 
 7   review_text        556871 non-null  object 
 8   review_language    556871 non-null  object 
 9   review_rating      278435 non-null  float64
 10  hotel_rating_mean  556382 non-null  float64
dtypes: float64(2), int64(3), object(6)
memory usage: 51.0+ MB


In [4]:
# TF-IDF text vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [5]:
# Learn the vocabulary / IDF weights from the review text and encode every
# review as one row of a sparse TF-IDF matrix.
# NOTE(review): `df` holds train AND test rows, so test-set text influences the
# fitted vocabulary and IDF statistics — confirm this is intended.
X = vectorizer.fit_transform(df['review_text'])

In [6]:
# Inspect the learned term -> column-index mapping.
# NOTE(review): this displays the entire dictionary; something like
# `len(vectorizer.vocabulary_)` would give a more compact summary.
vectorizer.vocabulary_

{'thi': 81401,
 'new': 56540,
 'hotel': 41981,
 'great': 38740,
 'staff': 76859,
 'love': 50479,
 'interact': 44487,
 'peggi': 61492,
 'wish': 98180,
 'salad': 70801,
 'veget': 94895,
 'option': 59293,
 'good': 38245,
 'menu': 52987,
 'room': 69752,
 'clean': 20199,
 'nice': 56669,
 'decor': 25211,
 'anyth': 9299,
 'el': 29864,
 'say': 71376,
 'trip': 83661,
 'advisor': 6765,
 'make': 51389,
 'write': 98851,
 'least': 48491,
 '200': 1817,
 'word': 98528,
 'attent': 10892,
 'welcom': 97014,
 'comfort': 21275,
 'locat': 49709,
 'cozi': 23557,
 'space': 75900,
 'chill': 19390,
 'much': 55147,
 'varieti': 94757,
 'food': 35225,
 'avail': 11271,
 'ha': 39496,
 'certainli': 18540,
 'benefit': 13663,
 'invest': 44758,
 'owner': 60245,
 'longer': 50166,
 'mousetrap': 55001,
 'wa': 95966,
 'onc': 58864,
 'small': 74792,
 'side': 73759,
 'expect': 32455,
 'long': 50156,
 'lobbi': 49619,
 'marvel': 52164,
 'champagn': 18719,
 'bar': 12234,
 'romant': 69668,
 'stop': 77867,
 'friendli': 36150,
 'h

In [6]:
# Undo the earlier train/test concatenation: the rows of X follow the order of
# pd.concat([df_train, df_test]), so the first len(df_train) rows are the
# training reviews and everything after them is the test set.
# The boundary is derived from the data rather than hard-coded, so the split
# stays correct if the input files change size.
n_train_rows = len(df_train)
X_train = X[:n_train_rows]
X_test = X[n_train_rows:]

In [8]:
# Append the per-hotel mean rating as one extra column to the right of the
# TF-IDF features (double brackets keep the column 2-D: one value per row).
# Note: X_train / X_test are rebound to the widened matrices, so re-running
# this cell would append the column a second time.
X_train = hstack([X_train, df_train[['hotel_rating_mean']].to_numpy()])
X_test = hstack([X_test, df_test[['hotel_rating_mean']].to_numpy()])

In [9]:
# Regression target: the review rating column of the training frame.
y_train = df_train['review_rating']

In [10]:
# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible. X_train / y_train are rebound to the remaining 80%.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [11]:
# XGBoost regressor trained with a squared-error objective.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

In [12]:
# Fit the regressor on the training portion of the split.
xg_reg.fit(X_train, y_train)

In [13]:
# Predict ratings for the held-out validation rows.
pred = xg_reg.predict(X_valid)

In [14]:
# Evaluate the model on the validation split.
# RMSE is computed explicitly as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form runs on both older and current releases.
rmse = math.sqrt(mean_squared_error(y_valid, pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 1.2327800394713297


In [15]:
# Predict ratings for the test feature matrix.
pred_test = xg_reg.predict(X_test)

In [16]:
# Assemble the submission table: one row per test review, pairing its id with
# the predicted rating (predictions are in the same row order as df_test).
submission = pd.DataFrame(
    {
        'review_id': df_test['review_id'],
        'review_rating': pred_test,
    }
)

submission.head()

Unnamed: 0,review_id,review_rating
0,9p62eIN5NkEadRe7,7.817492
1,WfQozacJ1lzXMFQg,9.447377
2,oahzyhSR3ZNHJBuI,9.228675
3,lcO2pUMVQ09RmbHZ,6.63496
4,DmMu1z2SozPxykUS,8.797159


In [17]:
# Write the submission file without the index column.
# The output folder is created first so the write does not fail with an
# OSError when `submission/` does not exist yet (e.g. on a fresh checkout).
submission_path = Path(r'submission/submission_4.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)