# Problem Statement 

Sentiment analysis can help improve the performance of the recommendation system. Recommendation algorithm alone predicts the items based on user's past behaviour. However the recommend items might not be liked by the other users. By using sentiment analysis we can recommend the product based on how it's been percieved by other users. 

This notebook focuses on building a sentiment prediction model using various Machine Learning Algorithms.

In [14]:
import importlib.util
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
import pickle

In [5]:
def module_from_file(module_name, file_path):
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

In [11]:
models = module_from_file("models","models.py")

## Training the model using BOW Representation


In [5]:
#X=joblib.load('bow_features')
#y= joblib.load('target')
#X_transformed = X.toarray()
#y= y.astype(int)

In [12]:
with open('bow_model_.pkl','rb') as f:
    bow_model = pickle.load(f)

In [38]:
df = pd.read_csv("data/pre_process_data.csv")
df.head()

Unnamed: 0,lemmatized_review,user_sentiment
0,love album good hip hop current pop sound hype...,1
1,good flavor review collect promotion,1
2,good flavor,1
3,read review look buy couple lubricant ultimate...,0
4,husband buy gel gel cause irritation feel like...,0


In [40]:
df.dropna(inplace=True)

In [41]:
X=df['lemmatized_review']
y=df['user_sentiment']

In [50]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)

In [51]:
# transforming X_train to bow representation
X_train=bow_model.fit_transform(X_train).toarray()
X_test=bow_model.transform(X_test).toarray()

### Naive-Bayes without hyper parameter tuning

In [52]:
#training using naive bayes
nb = models.NaiveBayes()
naive_bayes,metrics=nb.train_model_without_hp(X_train,y_train,X_test,y_test)

2023-02-08 23:12:00,834 - root - INFO - Training the model without hyperparameter tuning
2023-02-08 23:12:18,979 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=2, tm_mday=8, tm_hour=17, tm_min=42, tm_sec=18, tm_wday=2, tm_yday=39, tm_isdst=0)


In [53]:
model_performance={}
model_performance['naive_bayes_bow_without_hp']=metrics

### Naive-bayes using hyperparameter tuning

In [54]:
naive_bayes_hp,metrics=nb.train_model_with_hp(X_train,y_train,X_test,y_test)

2023-02-08 23:17:14,513 - root - INFO - Started training naive bayes with hyperparameter tuning


ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\naive_bayes.py", line 749, in fit
    X, y = self._check_X_y(X, y)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\naive_bayes.py", line 1203, in _check_X_y
    X = binarize(X, threshold=self.binarize)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\preprocessing\_data.py", line 2035, in binarize
    cond = X > threshold
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 211. MiB for an array with shape (16796, 13171) and data type bool

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\naive_bayes.py", line 749, in fit
    X, y = self._check_X_y(X, y)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\naive_bayes.py", line 1203, in _check_X_y
    X = binarize(X, threshold=self.binarize)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\preprocessing\_data.py", line 2035, in binarize
    cond = X > threshold
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 211. MiB for an array with shape (16797, 13171) and data type bool

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\naive_bayes.py", line 749, in fit
    X, y = self._check_X_y(X, y)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\naive_bayes.py", line 1203, in _check_X_y
    X = binarize(X, threshold=self.binarize)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\preprocessing\_data.py", line 2025, in binarize
    X = check_array(X, accept_sparse=["csr", "csc"], copy=copy)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\utils\validation.py", line 950, in check_array
    array = _asarray_with_order(
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\utils\_array_api.py", line 186, in _asarray_with_order
    return xp.asarray(array, copy=copy)
  File "C:\Users\Rakshu\flask\lib\site-packages\sklearn\utils\_array_api.py", line 73, in asarray
    return numpy.array(x, copy=True, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 1.65 GiB for an array with shape (16797, 13171) and data type float64


In [11]:
model_performance['naive_bayes_bow_with_hp']=metrics

### Logistic Regression without hyper-parameter tuning

In [10]:
# training the model using logistic regression
lr = LRClassification()
lr_model,metrics = lr.train_model_without_hp(X_train,y_train,X_test,y_test)

2023-01-24 19:07:25,048 - root - INFO - Training the model without hyperparameter tuning
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
2023-01-24 19:08:01,012 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=8, tm_sec=1, tm_wday=1, tm_yday=24, tm_isdst=0)


In [12]:
lr_model.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)>

In [13]:
model_performance['lr_bow_without_hp']=metrics

### Logistic regression with hyper parameter tuning 

Training logistic regression with hyper-parameter tuning using randomSearchCV on a smaller dataset

In [None]:
# tuning the model with smaller set

In [13]:
lr_model_hp,metrics = lr.train_model_with_hp(X_train[0:10000],y_train[0:10000],X_test[0:100],y_test[0:100])

2023-01-24 19:13:42,820 - root - INFO - Started training logistic regression with hyperparameter tuning
2023-01-24 19:21:25,115 - root - INFO - Best params {'tol': 0.01, 'C': 1} 
2023-01-24 19:21:42,178 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=21, tm_sec=42, tm_wday=1, tm_yday=24, tm_isdst=0)


In [14]:
model_performance['lr_bow_with_hp']=metrics

In [15]:
model_performance

{'naive_bayes_bow_without_hp': {'training_accuracy': 0.8917567503214439,
  'training_precision': 0.911524500907441,
  'training_recall': 0.9721490402709823,
  'test_accuracy': 0.8728888888888889,
  'test_precision': 0.9076813824121198,
  'test_recall': 0.954686916469563},
 'naive_bayes_bow_with_hp': {'training_accuracy': 0.963141101957236,
  'training_precision': 0.9991598991879026,
  'training_recall': 0.959191354373891,
  'test_accuracy': 0.9263333333333333,
  'test_precision': 0.9846133613887428,
  'test_recall': 0.9320303747043446},
 'lr_bow_with_hp': {'training_accuracy': 0.9675,
  'training_precision': 0.9994130766521893,
  'training_recall': 0.9637763187683949,
  'test_accuracy': 0.9,
  'test_precision': 0.9655172413793104,
  'test_recall': 0.9230769230769231}}

### XGboost 

In [16]:
xgb = XGBoost()
xgb,metrics=xgb.train_model_without_hp(X_train,y_train,X_test,y_test)

2023-01-24 19:23:59,636 - root - INFO - Training the model without hyperparameter tuning
2023-01-24 19:32:00,792 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=32, tm_sec=0, tm_wday=1, tm_yday=24, tm_isdst=0)


In [17]:
model_performance['xgb_without_hp'] = metrics

### Evaluating performance of different ML algorithms trained on BOW model

In [20]:
import pandas as pd
bow_performance=pd.DataFrame(model_performance)
bow_performance

Unnamed: 0,naive_bayes_bow_without_hp,naive_bayes_bow_with_hp,lr_bow_with_hp,xgb_without_hp
training_accuracy,0.891757,0.963141,0.9675,0.95495
training_precision,0.911525,0.99916,0.999413,0.971023
training_recall,0.972149,0.959191,0.963776,0.978332
test_accuracy,0.872889,0.926333,0.9,0.937556
test_precision,0.907681,0.984613,0.965517,0.959528
test_recall,0.954687,0.93203,0.923077,0.970995


## Training the model using TF-IDF

In [21]:
X=joblib.load('tf_idf_features')
#y= joblib.load('target')

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)

### Naive Bayes 

In [23]:
#training using naive bayes
nb = NaiveBayes()
naive_bayes,metrics=nb.train_model_without_hp(X_train,y_train,X_test,y_test)

2023-01-24 19:35:30,070 - root - INFO - Training the model without hyperparameter tuning
2023-01-24 19:35:37,090 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=35, tm_sec=37, tm_wday=1, tm_yday=24, tm_isdst=0)


In [29]:
model_performance={}
model_performance['naive_bayes_without_hp'] = metrics

In [30]:
naive_bayes_hp,metrics=nb.train_model_with_hp(X_train,y_train,X_test,y_test)

2023-01-24 19:44:12,575 - root - INFO - Started training naive bayes with hyperparameter tuning
2023-01-24 19:45:33,938 - root - INFO - Best params {'alpha': 1e-07} 
2023-01-24 19:45:41,024 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=45, tm_sec=41, tm_wday=1, tm_yday=24, tm_isdst=0)


In [31]:
model_performance['naive_bayes_bow_with_hp']=metrics

### Logistic Regression

In [32]:
# training the model using logistic regression
lr = LRClassification()
lr_model,metrics = lr.train_model_without_hp(X_train,y_train,X_test,y_test)

2023-01-24 19:48:17,707 - root - INFO - Training the model without hyperparameter tuning
2023-01-24 19:48:50,442 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=48, tm_sec=50, tm_wday=1, tm_yday=24, tm_isdst=0)


In [33]:
model_performance['lr_bow_without_hp']=metrics

In [34]:
lr_model_hp,metrics = lr.train_model_with_hp(X_train[0:10000],y_train[0:10000],X_test[0:100],y_test[0:100])

2023-01-24 19:50:39,685 - root - INFO - Started training logistic regression with hyperparameter tuning
2023-01-24 19:56:22,838 - root - INFO - Best params {'tol': 0.01, 'C': 1} 
2023-01-24 19:56:37,276 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=19, tm_min=56, tm_sec=37, tm_wday=1, tm_yday=24, tm_isdst=0)


In [35]:
model_performance['lr_bow_with_hp']=metrics

### XGBoost

In [36]:
xgbc = XGBoost()
xgb,metrics=xgbc.train_model_without_hp(X_train,y_train,X_test,y_test)

2023-01-24 19:56:50,450 - root - INFO - Training the model without hyperparameter tuning
2023-01-24 20:05:06,424 - root - INFO - Finished training at time.struct_time(tm_year=2023, tm_mon=1, tm_mday=24, tm_hour=20, tm_min=5, tm_sec=6, tm_wday=1, tm_yday=24, tm_isdst=0)


In [37]:
model_performance['xgb_without_hp'] = metrics

In [38]:
model_performance

{'naive_bayes_without_hp': {'training_accuracy': 0.922805847897519,
  'training_precision': 0.998122286116653,
  'training_recall': 0.9145652992096349,
  'test_accuracy': 0.9026666666666666,
  'test_precision': 0.9887993443518645,
  'test_recall': 0.901157724386904},
 'naive_bayes_bow_with_hp': {'training_accuracy': 0.9521405781227678,
  'training_precision': 0.9705787953354017,
  'training_recall': 0.9755363191569439,
  'test_accuracy': 0.8918888888888888,
  'test_precision': 0.9279825412221144,
  'test_recall': 0.9528196190713307},
 'lr_bow_without_hp': {'training_accuracy': 0.922805847897519,
  'training_precision': 0.998122286116653,
  'training_recall': 0.9145652992096349,
  'test_accuracy': 0.9026666666666666,
  'test_precision': 0.9887993443518645,
  'test_recall': 0.901157724386904},
 'lr_bow_with_hp': {'training_accuracy': 0.9234,
  'training_precision': 0.9985170538803757,
  'training_recall': 0.9146479510980303,
  'test_accuracy': 0.93,
  'test_precision': 1.0,
  'test_recal

In [42]:
# xbgoost without hyperparameter
import pickle
with open('model.pkl','wb') as f:
    pickle.dump(xgb,f)

In [39]:
tf_idf_performance=pd.DataFrame(model_performance)

## Evaluating the performance

In [40]:
tf_idf_performance

Unnamed: 0,naive_bayes_without_hp,naive_bayes_bow_with_hp,lr_bow_without_hp,lr_bow_with_hp,xgb_without_hp
training_accuracy,0.922806,0.952141,0.922806,0.9234,0.978475
training_precision,0.998122,0.970579,0.998122,0.998517,0.986228
training_recall,0.914565,0.975536,0.914565,0.914648,0.989516
test_accuracy,0.902667,0.891889,0.902667,0.93,0.953444
test_precision,0.988799,0.927983,0.988799,1.0,0.96643
test_recall,0.901158,0.95282,0.901158,0.923077,0.981949
