In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,classification_report
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


def preprocess_text(text):
    """Tokenize a review and drop English stop words and punctuation tokens.

    Args:
        text: Raw review string to clean.

    Returns:
        The surviving tokens joined by single spaces ('' when nothing
        survives). Token casing is preserved; only the stop-word comparison
        is case-insensitive.

    Note:
        Relies on NLTK's `word_tokenize` and the English `stopwords` corpus,
        so the corresponding NLTK data must be available at call time.
    """
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    kept = [
        word for word in tokens
        if word.lower() not in stop_words
        # Drop tokens consisting solely of punctuation characters. A plain
        # `word in string.punctuation` is a substring test against one long
        # string, so multi-character tokens such as "..." or "--" would be
        # kept by mistake.
        and not all(char in punctuation_chars for char in word)
    ]

    return ' '.join(kept)

# --- Load data ---------------------------------------------------------------
# The 'Review' column is used as the text feature and 'Rating' as the label.
df = pd.read_csv("preprocessed_dataset.csv")

# --- Train / test split ------------------------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Review'], df['Rating'], test_size=0.2, random_state=42
)

# --- Vectorize ---------------------------------------------------------------
# Fit the TF-IDF vocabulary on the training reviews only, then reuse that
# fitted vocabulary for the held-out reviews.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- Train -------------------------------------------------------------------
# NOTE(review): the classification report for this cell lists labels 1-5.
# Newer XGBoost releases are reported to require 0-based class labels, so this
# fit may raise there -- confirm against the installed XGBoost version.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train_tfidf, y_train)

# Persist the fitted classifier for reuse outside this notebook.
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgb_classifier, f)

# --- Evaluate ----------------------------------------------------------------
y_pred = xgb_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# --- Single-review smoke test ------------------------------------------------
# NOTE(review): training text comes straight from the CSV while this review
# goes through preprocess_text; presumably the CSV was produced by the same
# cleaning steps -- verify, otherwise training and inference inputs differ.
test_review = "adding some review from data base for testing"
preprocessed_review = preprocess_text(test_review)
test_review_tfidf = tfidf_vectorizer.transform([preprocessed_review])
predicted_rating = xgb_classifier.predict(test_review_tfidf)[0]

# Previously the prediction was computed and silently discarded; show it so
# the smoke test actually reports a result.
print("Predicted rating:", predicted_rating)




Accuracy: 0.748059280169372
Classification Report:
              precision    recall  f1-score   support

           1       0.79      0.47      0.59       256
           2       0.69      0.39      0.50       251
           3       0.64      0.46      0.54       589
           4       0.63      0.40      0.48      1597
           5       0.78      0.95      0.86      4392

    accuracy                           0.75      7085
   macro avg       0.70      0.54      0.59      7085
weighted avg       0.73      0.75      0.72      7085



In [3]:
import pickle

# Write the training TF-IDF matrix to disk ...
with open('X_train_tfidf.pkl', mode='wb') as matrix_out:
    pickle.dump(X_train_tfidf, matrix_out)

# ... and read it straight back to confirm the file round-trips.
# Only unpickle files you trust: this one was written just above.
with open('X_train_tfidf.pkl', mode='rb') as matrix_in:
    X_train_tfidf_loaded = pickle.load(matrix_in)

# Shows the reloaded matrix's printed representation.
print(X_train_tfidf_loaded)


  (0, 17692)	0.3543563017581065
  (0, 9139)	0.35846817606428555
  (0, 18806)	0.5454667872787481
  (0, 21969)	0.6696253894001206
  (1, 10432)	0.24765951463373811
  (1, 19953)	0.08495912508170472
  (1, 3374)	0.19501181619446237
  (1, 2426)	0.0987614389409748
  (1, 1760)	0.08698629841794865
  (1, 10650)	0.30734241785048166
  (1, 16015)	0.1728371642961198
  (1, 21930)	0.07139307438174394
  (1, 6346)	0.13236460456852184
  (1, 20215)	0.05648942238784103
  (1, 13476)	0.15329309258901375
  (1, 9561)	0.08310082670587154
  (1, 15738)	0.27286144592783873
  (1, 11298)	0.13528120342354485
  (1, 10661)	0.15604079163411583
  (1, 8234)	0.10493113958852576
  (1, 17194)	0.19754287223817263
  (1, 9137)	0.21315178964712672
  (1, 14049)	0.2971475606025037
  (1, 19999)	0.17385376710334266
  (1, 8261)	0.060579354529600574
  :	:
  (28335, 13761)	0.31998057575682876
  (28335, 2035)	0.2564392981387361
  (28335, 17583)	0.2003298512268467
  (28335, 3944)	0.21301679305568283
  (28335, 18294)	0.3004160720017759
  (

In [4]:
import pickle

# Save the fitted classifier under a second file name; the training cell
# already writes the same estimator to 'xgboost_model.pkl'.
# NOTE(review): XGBoost also provides its own save_model/load_model format;
# consider it if the model must load under a different library version.
with open('xgb_classifier.pkl', mode='wb') as model_out:
    pickle.dump(xgb_classifier, model_out)

# Reload it to confirm the pickle round-trips. Only unpickle trusted files:
# this one was written just above.
with open('xgb_classifier.pkl', mode='rb') as model_in:
    xgb_classifier_loaded = pickle.load(model_in)

# Prints the restored estimator's repr (its hyperparameters).
print(xgb_classifier_loaded)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

import joblib

# Persist the fitted TF-IDF vectorizer with joblib so the same vocabulary can
# be reused alongside the saved classifier.
joblib.dump(
    tfidf_vectorizer,
    'tfidf_vectorizer_model.pkl',
)

# Reload the vectorizer from disk to confirm the file round-trips.
tfidf_vectorizer_loaded = joblib.load(
    'tfidf_vectorizer_model.pkl',
)

# Re-vectorize the held-out reviews with the reloaded vectorizer.
# Note: this rebinds X_test_tfidf, replacing the matrix built in the training cell.
X_test_tfidf = tfidf_vectorizer_loaded.transform(
    X_test,
)
