In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from scipy.sparse import hstack 

In [2]:
# Load the preprocessed (v6) training and test tables from the local dataset folder.
df_train = pd.read_csv("./dataset/train_preprocessed_v6.csv")
df_test = pd.read_csv("./dataset/test_preprocessed_v6.csv")

# Stack the two tables row-wise, training rows first and test rows after them.
# Each frame keeps its own index labels, so labels repeat in the combined frame.
df = pd.concat([df_train, df_test])

In [3]:
# Print the combined frame's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556871 entries, 0 to 278435
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0.1       556871 non-null  int64  
 1   Unnamed: 0         556871 non-null  int64  
 2   hotel_name         556871 non-null  object 
 3   hotel_location     540465 non-null  object 
 4   review_id          556871 non-null  object 
 5   review_date        556593 non-null  object 
 6   review_text        556871 non-null  object 
 7   review_language    556871 non-null  object 
 8   review_rating      278435 non-null  float64
 9   hotel_rating_mean  556382 non-null  float64
dtypes: float64(2), int64(2), object(6)
memory usage: 46.7+ MB


In [4]:
# TF-IDF text vectorizer created with scikit-learn's default settings (no arguments overridden).
vectorizer = TfidfVectorizer()

In [5]:
# Fit the TF-IDF vectorizer on every review in `df` (training and test rows together)
# and vectorise them in the same pass; X has one row per row of `df`, in the same order.
# NOTE(review): because the fit includes test text, the vocabulary/IDF weights are not
# learned from training data alone — confirm this is intended.
review_corpus = df['review_text']
X = vectorizer.fit_transform(review_corpus)

In [6]:
# Display the fitted token -> column-index mapping.
# NOTE(review): this shows the entire mapping; consider len(vectorizer.vocabulary_) or a small sample.
# NOTE(review): tokens in the recorded output such as 'menyou', 'byout', 'myouch' and 'helpfyoul'
# suggest the upstream preprocessing replaced the letter 'u' with 'you' inside words — verify that step.
vectorizer.vocabulary_

{'new': 68632,
 'hotel': 51060,
 'great': 47210,
 'staff': 93962,
 'loved': 61290,
 'interacting': 54379,
 'peggy': 74574,
 'wish': 116225,
 'salad': 86627,
 'vegetable': 112428,
 'option': 71888,
 'good': 46618,
 'menyou': 64430,
 'room': 85376,
 'clean': 23641,
 'nicely': 68827,
 'decorated': 30274,
 'trip': 102305,
 'advisor': 7477,
 'yoyou': 119862,
 'write': 117014,
 '200': 1911,
 'word': 116625,
 'lovely': 61313,
 'attentive': 12512,
 'welcoming': 114933,
 'comfortable': 25011,
 'location': 60550,
 'cozy': 28192,
 'space': 92770,
 'chill': 22640,
 'myouch': 67074,
 'variety': 112273,
 'food': 43023,
 'available': 12745,
 'ha': 48124,
 'certainly': 21624,
 'benefited': 15782,
 'investment': 54802,
 'owner': 72748,
 'longer': 61022,
 'moyousetrap': 66897,
 'wa': 113675,
 'small': 91439,
 'byout': 19603,
 'expect': 39441,
 'long': 61010,
 'lobby': 60395,
 'marveloyous': 63436,
 'champagne': 21833,
 'bar': 14129,
 'romantic': 85285,
 'stop': 95171,
 'friendly': 44140,
 'helpfyoul': 4

In [6]:
from scipy.sparse import csr_matrix  # NOTE(review): not used in the visible cells; kept in case later cells rely on it

# `df` was built as concat([df_train, df_test]), so the first len(df_train) rows of X
# belong to the training reviews and the remaining rows to the test reviews.
# Deriving the boundary from the frame avoids a hard-coded row count that silently
# goes stale when the input files change.
n_train = len(df_train)
assert X.shape[0] == n_train + len(df_test), "X rows do not match train + test row counts"

X_train = X[:n_train]
X_test = X[n_train:]

In [7]:
# Append each hotel's mean rating as one extra column to the right of the TF-IDF features.
# NOTE(review): this cell rebinds X_train / X_test, so running it twice appends the column twice.
train_hotel_mean = df_train['hotel_rating_mean'].values.reshape(-1, 1)
test_hotel_mean = df_test['hotel_rating_mean'].values.reshape(-1, 1)

X_train = hstack([X_train, train_hotel_mean])
X_test = hstack([X_test, test_hotel_mean])

In [8]:
# Regression target: the review_rating column of the training frame.
y_train = df_train['review_rating']

In [9]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): X_train / y_train are rebound to the remaining 80%, so re-running this cell
# without re-running the cells above would split the already reduced set again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [10]:
# XGBoost regression model (squared-error objective); hyper-parameters gathered in one place.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100,
}
xg_reg = xgb.XGBRegressor(**xgb_params)

In [11]:
# Train the regressor on the training part of the train/validation split.
xg_reg.fit(X_train, y_train)

In [12]:
# Predict ratings for the held-out validation rows.
pred = xg_reg.predict(X_valid)

In [13]:
# Evaluate the model on the validation split.
# RMSE is computed as the square root of the MSE rather than via `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas this
# form gives the same value on every scikit-learn version.
rmse = mean_squared_error(y_valid, pred) ** 0.5
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 1.2392319084709522


In [14]:
# Predict ratings for the test feature matrix; these values go into the submission file.
pred_test = xg_reg.predict(X_test)

In [15]:
# Assemble the submission table: one row per test review, pairing its id with the predicted rating.
# The frame takes its index from the review_id Series of df_test.
submission = pd.DataFrame({
    'review_id': df_test['review_id'],
    'review_rating': pred_test,
})

submission.head()

Unnamed: 0,review_id,review_rating
0,9p62eIN5NkEadRe7,8.031348
1,WfQozacJ1lzXMFQg,9.167491
2,oahzyhSR3ZNHJBuI,9.076857
3,lcO2pUMVQ09RmbHZ,6.949605
4,DmMu1z2SozPxykUS,8.801266


In [16]:
# Write the predictions to disk without the DataFrame index.
# to_csv does not create missing parent folders, so make sure the output directory
# exists first; otherwise a fresh checkout fails on this last cell.
submission_path = r'submission/submission_6.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)