In [1]:
from pathlib import Path

import pandas as pd
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Read the preprocessed train and test CSVs, then stack them (train rows
# first, test rows second) into one frame.
train_path = "./dataset/train_preprocessed_v2.csv"
test_path = "./dataset/test_preprocessed_v2.csv"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

df = pd.concat((df_train, df_test))

In [3]:
# Summarize the combined frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556871 entries, 0 to 278435
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0.1     556871 non-null  int64  
 1   Unnamed: 0       556871 non-null  int64  
 2   hotel_name       556871 non-null  object 
 3   hotel_location   540465 non-null  object 
 4   review_id        556871 non-null  object 
 5   review_date      556593 non-null  object 
 6   review_text      556871 non-null  object 
 7   review_language  556871 non-null  object 
 8   review_rating    278435 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 42.5+ MB


In [4]:
# TF-IDF text vectorizer; no arguments are passed, so library defaults apply.
vectorizer = TfidfVectorizer()

In [5]:
# Fit the vectorizer on, and transform, the `review_text` column of the
# combined train+test frame; rows of X follow the row order of `df`.
# NOTE(review): the vocabulary/IDF weights are learned from test (and later
# validation) text as well as training text — confirm this is intended.
X = vectorizer.fit_transform(df['review_text'])

In [6]:
# Rows of X follow the concat order used to build `df`: every training row
# first, then every test row. Split at len(df_train) rather than a hard-coded
# row count so the slices stay correct if the input files change.
n_train = len(df_train)
X_train = X[:n_train]
X_test = X[n_train:]

In [7]:
# Target values: the `review_rating` column of the training frame.
y_train = df_train['review_rating']

In [8]:
# Hold out 20% of the labelled rows for validation (fixed seed).
# The split reads straight from `X` and `df_train` rather than from the
# `X_train` / `y_train` names it rebinds, so re-running this cell always
# produces the same split instead of re-splitting an already reduced set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X[:len(df_train)],
    df_train['review_rating'],
    test_size=.2,
    random_state=42,
)

In [9]:
# XGBoost regression model.
# Hyper-parameters are gathered in one mapping so they are easy to scan and
# tweak, then unpacked into the estimator.
xgb_params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
    "n_estimators": 100,
}
xg_reg = xgb.XGBRegressor(**xgb_params)

In [10]:
# Train the regressor on the training features and targets.
xg_reg.fit(X_train, y_train)

In [11]:
# Predict ratings for the held-out validation features.
pred = xg_reg.predict(X_valid)

In [12]:
# Evaluating the model
# RMSE is taken as the square root of the MSE: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises
# a TypeError. This form works on old and new versions alike.
rmse = mean_squared_error(y_valid, pred) ** 0.5
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 1.2917907226126313


In [13]:
# Predict ratings for the test-set feature matrix.
pred_test = xg_reg.predict(X_test)

In [14]:
# Assemble the submission table: each test review id paired with the
# rating predicted for it.
submission = pd.DataFrame({
    'review_id': df_test['review_id'],
    'review_rating': pred_test,
})

submission.head()

Unnamed: 0,review_id,review_rating
0,9p62eIN5NkEadRe7,7.021558
1,WfQozacJ1lzXMFQg,9.329128
2,oahzyhSR3ZNHJBuI,9.444663
3,lcO2pUMVQ09RmbHZ,6.981236
4,DmMu1z2SozPxykUS,8.561797


In [15]:
# Write the predictions to disk. The output folder is created first so the
# cell does not fail when `submission/` does not exist yet.
submission_path = Path(r'submission/submission_2.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)