In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from pathlib import Path

In [5]:
# NOTE IF YOU WANT TO RUN YOU NEED TO GET THE FILE FROM GOOGLE DRIVE
data = Path('../data/large_files/vader_emolex.csv')
df = pd.read_csv(data)

In [6]:
reviews = df[['text', 'stars']]

In [7]:
reviews['sentiment'] = reviews['stars'].apply(lambda x: 0 if x <= 1 else 1 if x == 2 or x == 3 else 2)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(reviews['text'], reviews['sentiment'], test_size=0.2)

# Create a Count Vectorizer to transform the text data into numerical features
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a logistic regression model on the training data
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Evaluate the model on the test data
score = model.score(X_test_vect, y_test)
print(f'Test accuracy: {score:.2f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['sentiment'] = reviews['stars'].apply(lambda x: 0 if x <= 1 else 1 if x == 2 or x == 3 else 2)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test accuracy: 0.84


In [8]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_vect)
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

              precision    recall  f1-score   support

    negative       0.77      0.74      0.75     25702
     neutral       0.71      0.61      0.66     41501
    positive       0.90      0.95      0.92    110520

    accuracy                           0.84    177723
   macro avg       0.79      0.77      0.78    177723
weighted avg       0.83      0.84      0.84    177723



In [10]:
from joblib import dump
model = Path('../review_program/static/joblib/model_LR.joblib')
vectorizer = Path('../review_program/static/joblib/vectorizer_LR.joblib')
dump(model, model)
dump(vectorizer, vectorizer)

['../review_program/static/joblib/vectorizer_LR2.joblib']

THE BELOW WAS USED FOR SOME ANALYTICS OF REVIEWS RUN THROUGH THE LOGISTIC REGRESSION MODEL

In [10]:
test = Path('../data/subset_2000.csv')
df_test = pd.read_csv(test)
dftemp = df_test[df_test['stars'] == 3]
for index, row in dftemp.iterrows():
    review_id = row['review_id']
    sample_text = row['text']
    sample_text_vect = vectorizer.transform([sample_text])
    sentiment_pred = model.predict(sample_text_vect)[0]
    sentiment_label = ['negative', 'neutral', 'positive'][sentiment_pred]
    print(f'Sentiment prediction for sample text: {sentiment_label} ....... {sample_text}')

Sentiment prediction for sample text: neutral ....... The food is decent but the dishes have been dirty every time I've gone. The service isn't great and the experience seems very rushed, despite the food taking a long time and the waiter rarely checking up on us
Sentiment prediction for sample text: neutral ....... I thought the food was tasty but they used a lot of food coloring which I never like or want and the poor I bread was not completely cooked and was raw and full of oil which I feel is a sloppy mistake. There are a lot of better Indian options for better quality so I'm not sure I will return.
Sentiment prediction for sample text: neutral ....... I took my wife for dinner on Valentine's Day and I was thinking that it would be packed. The place was completely empty. Only 1 other couple in the restaurant.

I did not care too much for the server. She was ok, but not impressed. 

The food was good, but I don't want to eat in an empty restaurant.
Sentiment prediction for sample te