In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [102]:
# Load the review dataset (the path is relative to the notebook's working directory)
filePath = './review_sentiments_dataset.csv'
df = pd.read_csv(filePath)

# Drop the 'category' and 'rating' columns
df = df.drop(columns=['category', 'rating'])

# Remove rows that are exact duplicates across the remaining columns
df = df.drop_duplicates()

# Lower-case the review text with the vectorized string accessor.
# Missing reviews are treated as empty documents; the previous
# `apply(lambda x: x.lower())` raised AttributeError on NaN values.
df['processed_text'] = df['text_'].fillna('').str.lower()

In [103]:
# Step 1: TF-IDF vectorization of the lower-cased review text.
# The vocabulary is capped at 5000 terms (max_features) and the sparse result
# is converted to a dense array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so held-out documents influence the vocabulary and IDF weights —
# consider fitting on the training rows only.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_sparse = tfidf.fit_transform(df['processed_text'])
X_tfidf = tfidf_sparse.toarray()



In [104]:
# NOTE: disabled experiment, kept for reference. In a top-to-bottom run it
# could not execute here anyway, because `y` is only assigned in a later cell.
# chi2_selector = SelectKBest(chi2, k=2000)
# X_tfidf = chi2_selector.fit_transform(X_tfidf, y)
# print(X_tfidf)

In [105]:
# Step 2: standardize the numeric columns 'neg', 'neu', 'pos' and 'compound'
# (StandardScaler removes each column's mean and scales it to unit variance).
numeric_columns = ['neg', 'neu', 'pos', 'compound']
numeric_features = df[numeric_columns]
scaler = StandardScaler()
numeric_features_scaled = scaler.fit(numeric_features).transform(numeric_features)

# Step 3: append the scaled numeric columns to the right of the TF-IDF columns
X_combined = np.concatenate((X_tfidf, numeric_features_scaled), axis=1)

In [106]:
# Encode the string labels as integers: 'OR' -> 0, 'CG' -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell idempotent: with the
# original two-entry mapping, executing the cell a second time turned every
# label into NaN because the column had already been converted to integers.
label_codes = {'OR': 0, 'CG': 1, 0: 0, 1: 1}
encoded_labels = df['label'].map(label_codes)

# Fail early with a clear message if any label is missing or unrecognised,
# instead of silently handing NaN targets to the classifier.
if encoded_labels.isna().any():
    raise ValueError("Column 'label' contains values other than 'OR'/'CG'")

df['label'] = encoded_labels
y = df['label']

In [107]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
split_kwargs = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, **split_kwargs)

In [108]:
from xgboost import XGBClassifier

In [109]:
# Fit an XGBoost classifier (default hyperparameters) on the training data
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

In [110]:
# Save the fitted TF-IDF vectorizer to a file
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

# Save the fitted StandardScaler as well. The model is trained on the scaled
# numeric columns, so new data must be scaled with the same fitted statistics;
# the scaler was not written to disk in any of the visible cells.
with open('numeric_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [111]:
# Save the trained model to a file.
# A with-block replaces the bare open() call, whose handle was never closed
# explicitly, so the file is flushed and closed as soon as the dump finishes.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
print("TF-IDF vectorizer saved as tfidf_vectorizer.pkl")
print(f"Model saved as {filename}")

TF-IDF vectorizer saved as tfidf_vectorizer.pkl
Model saved as finalized_model.sav


In [112]:
# Predict a class label for every row of the held-out test set
predictions = model.predict(X=X_test)

In [113]:
# Report the per-class precision/recall/F1 table and the overall accuracy
# of the predictions on the test set
test_report = classification_report(y_test, predictions)
test_accuracy = accuracy_score(y_test, predictions)
print("Evaluation on Test Set:", test_report, f"Accuracy: {test_accuracy}", sep="\n")

Evaluation on Test Set:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      4071
           1       0.90      0.88      0.89      4016

    accuracy                           0.89      8087
   macro avg       0.89      0.89      0.89      8087
weighted avg       0.89      0.89      0.89      8087

Accuracy: 0.8908124149870162


In [114]:
# NOTE: disabled experiment, kept for reference — cross-validation of a
# linear-kernel SVM on the combined feature matrix, scored with
# macro-averaged precision and recall.
# from sklearn.model_selection import cross_validate
# from sklearn.metrics import recall_score
# from sklearn import svm

# scoring = ['precision_macro', 'recall_macro']
# clf = svm.SVC(kernel='linear', C=1, random_state=0)
# scores = cross_validate(clf, X_combined, y, scoring=scoring)
# sorted(scores.keys())
# scores['test_recall_macro']

In [115]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [116]:
# Keep the 1000 TF-IDF columns with the highest chi-squared score against the label.
# The selector is fitted on the training rows only: the previous fit on the
# full matrix scored features against the labels of the held-out rows too,
# which leaks test information into the feature selection.
# X_combined was built as [TF-IDF columns | numeric columns], so the first
# X_tfidf.shape[1] columns of X_train are the training rows' TF-IDF features.
n_tfidf_columns = X_tfidf.shape[1]
chi2_selector = SelectKBest(chi2, k=1000)
chi2_selector.fit(X_train[:, :n_tfidf_columns], y_train)

# Apply the fitted selection to every row so X_kbest stays row-aligned with X_tfidf
X_kbest = chi2_selector.transform(X_tfidf)
print(X_kbest)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [117]:
# Show the matrix shapes before and after the chi-squared feature selection
for caption, matrix in (
    ('Original number of features:', X_tfidf),
    ('Reduced number of features:', X_kbest),
):
    print(caption, matrix.shape)

Original number of features: (40432, 5000)
Reduced number of features: (40432, 1000)
