In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings  
warnings.filterwarnings('ignore')

In [5]:
# Load the review dataset with its sentiment labels (path is relative to the working directory).
df = pd.read_csv("sentiment_analysis_results.csv")

In [7]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,helpful,rating,review_date,site,text,verified_buyer,verified_reviewer,city,state,cleaned_text,sentiment,aspects
0,0,2.0,5.0,2019-05-26,BookIt.com,Just saved $600. on BookIt vs. another web sit...,VerifiedBuyer,VerifiedReviewer,Stuart of Springfield,IL,just saved 600 on bookit vs another web site h...,positive,general
1,1,8.0,5.0,2019-03-28,BookIt.com,I had a great experience with BookIt. I needed...,VerifiedBuyer,VerifiedReviewer,Nichole of Roseville,CA,i had a great experience with bookit i needed ...,positive,service
2,2,7.0,5.0,2019-02-10,BookIt.com,I have had nothing but courteous patient and w...,VerifiedBuyer,VerifiedReviewer,Cheryl of Corfu,NY,i have had nothing but courteous patient and w...,positive,service
3,3,4.0,5.0,2019-05-22,BookIt.com,I just want to let you know that I had talked ...,VerifiedBuyer,VerifiedReviewer,Lisa of Flushing,MI,i just want to let you know that i had talked ...,positive,general
4,4,4.0,4.0,2019-04-13,BookIt.com,I called to verify one of the All Inclusive Fe...,VerifiedBuyer,VerifiedReviewer,Brandon of Honolulu,HI,i called to verify one of the all inclusive fe...,negative,"service, amenities"


In [27]:
# Missing cleaned reviews become empty strings so the column holds no NaN values.
df["cleaned_text"] = df["cleaned_text"].fillna(value="")
# Sanity check: count of NaNs still left in the column.
df["cleaned_text"].isna().sum()

0

In [9]:
# Drop the raw review text; the cells below work from 'cleaned_text' only.
# errors='ignore' keeps this cell idempotent: re-running it after 'text' has
# already been removed is a no-op instead of raising KeyError.
df = df.drop(columns='text', errors='ignore')

In [11]:
# Encoder that turns the sentiment strings into integer class ids; its
# classes_ attribute is reused later to label the classification reports.
label_encoder = LabelEncoder()

In [13]:
# Fit the encoder on the sentiment column and store the integer ids next to it.
df["sentiment_label"] = label_encoder.fit_transform(df["sentiment"])


In [15]:
# Model input is the cleaned review text; the target is the encoded sentiment.
x, y = df['cleaned_text'], df['sentiment_label']

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified - consider stratify=y if the
# class proportions must match exactly between train and test.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [35]:
# Replace any missing training documents with empty strings before vectorization.
x_train = x_train.fillna("")
# The check goes last so the cell actually displays the remaining NaN count
# (expected 0); as a non-final expression its result was silently discarded.
x_train.isna().sum()

In [37]:
# Replace any missing test documents with empty strings before vectorization.
x_test = x_test.fillna("")
# The check goes last so the cell actually displays the remaining NaN count
# (expected 0); as a non-final expression its result was silently discarded.
x_test.isna().sum()

In [39]:
# TF-IDF featurizer with the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # Use unigrams and bigrams
    max_features=5000,
)


In [41]:
# Learn the vocabulary and IDF weights from the training documents only...
x_train_tfidf = tfidf.fit_transform(x_train)
# ...then reuse that fitted vocabulary for the test documents, so nothing is
# learned from the held-out split.
x_test_tfidf = tfidf.transform(x_test)

## Naive Bayes Model

In [44]:
# Baseline model: multinomial Naive Bayes with default settings, trained on
# the TF-IDF training matrix.
nb = MultinomialNB()
nb.fit(x_train_tfidf, y_train)

In [56]:
# Predicted encoded sentiment labels for the held-out test documents.
y_pred = nb.predict(x_test_tfidf)

In [58]:
# Evaluate the Naive Bayes predictions on the held-out split.
# zero_division=0 is passed explicitly: when the model never predicts a class,
# that class's precision is undefined. The default already scores it as 0 but
# emits an UndefinedMetricWarning per call; stating the choice keeps the same
# numbers without the warning noise.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted', zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='weighted', zero_division=0))
print("F1-Score:", f1_score(y_test, y_pred, average='weighted', zero_division=0))
print("\nConfusion Matrix:")
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))

Accuracy: 0.7229920814479638
Precision: 0.706747752188032
Recall: 0.7229920814479638
F1-Score: 0.6986720353679579

Confusion Matrix:
[[3005    0  278]
 [ 197    0  176]
 [1308    0 2108]]

Classification Report:

              precision    recall  f1-score   support

    negative       0.67      0.92      0.77      3283
     neutral       0.00      0.00      0.00       373
    positive       0.82      0.62      0.71      3416

    accuracy                           0.72      7072
   macro avg       0.50      0.51      0.49      7072
weighted avg       0.71      0.72      0.70      7072



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## XGBoost Model

In [67]:
# Gradient-boosted classifier trained on the same TF-IDF features as the
# Naive Bayes model; the fixed seed keeps training reproducible.
# The deprecated use_label_encoder argument is dropped: y_train is already
# integer-encoded by LabelEncoder, and recent xgboost releases no longer use
# the flag.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(x_train_tfidf, y_train)

In [68]:
# Predicted encoded sentiment labels from XGBoost for the held-out test documents.
y_pred_xgb = xgb.predict(x_test_tfidf)

In [69]:
# Held-out metrics for the XGBoost predictions; precision, recall and F1 are
# support-weighted averages over the sentiment classes.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_xgb))
print("Precision:", precision_score(y_true=y_test, y_pred=y_pred_xgb, average='weighted'))
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred_xgb, average='weighted'))
print("F1-Score:", f1_score(y_true=y_test, y_pred=y_pred_xgb, average='weighted'))
# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:", confusion_matrix(y_true=y_test, y_pred=y_pred_xgb), sep="\n")
print(
    "\nClassification Report:\n",
    classification_report(
        y_true=y_test,
        y_pred=y_pred_xgb,
        target_names=label_encoder.classes_,
    ),
    sep="\n",
)

Accuracy: 0.806843891402715
Precision: 0.8116975447825083
Recall: 0.806843891402715
F1-Score: 0.8014022942097163

Confusion Matrix:
[[2874   12  397]
 [ 154  125   94]
 [ 699   10 2707]]

Classification Report:

              precision    recall  f1-score   support

    negative       0.77      0.88      0.82      3283
     neutral       0.85      0.34      0.48       373
    positive       0.85      0.79      0.82      3416

    accuracy                           0.81      7072
   macro avg       0.82      0.67      0.71      7072
weighted avg       0.81      0.81      0.80      7072

