In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from pathlib import Path

In [3]:
# NOTE IF YOU WANT TO RUN YOU NEED TO GET THE FILE FROM GOOGLE DRIVE
data = Path('../data/vader_emolex.csv')
df = pd.read_csv(data)

In [8]:
reviews = df[['text', 'stars']]

In [9]:
# reviews['sentiment'] = reviews['stars'].apply(lambda x: 0 if x <= 2 else 1 if x == 3 else 2)
reviews['sentiment'] = reviews['stars'].apply(lambda x: 0 if x <= 1 else 1 if x == 2 or x == 3 else 2)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(reviews['text'], reviews['sentiment'], test_size=0.2)

# Create a Count Vectorizer to transform the text data into numerical features
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a logistic regression model on the training data
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Evaluate the model on the test data
score = model.score(X_test_vect, y_test)
print(f'Test accuracy: {score:.2f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['sentiment'] = reviews['stars'].apply(lambda x: 0 if x <= 1 else 1 if x == 2 or x == 3 else 2)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test accuracy: 0.84


In [10]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_vect)
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

              precision    recall  f1-score   support

    negative       0.76      0.75      0.76     25876
     neutral       0.71      0.61      0.66     41405
    positive       0.90      0.95      0.92    110442

    accuracy                           0.84    177723
   macro avg       0.79      0.77      0.78    177723
weighted avg       0.83      0.84      0.84    177723



In [10]:
from joblib import dump
dump(model, 'model_LR.joblib')
dump(vectorizer, 'vectorizer_LR.joblib')

['vectorizer_LR.joblib']

In [12]:
test = Path('../data/subset_20000.csv')
df_test = pd.read_csv(test)
dftemp = df_test[df_test['stars'] == 3]
for index, row in dftemp.iterrows():
    review_id = row['review_id']
    sample_text = row['text']
    sample_text_vect = vectorizer.transform([sample_text])
    sentiment_pred = model.predict(sample_text_vect)[0]
    sentiment_label = ['negative', 'neutral', 'positive'][sentiment_pred]
    print(f'Sentiment prediction for sample text: {sentiment_label} and {review_id}')

Sentiment prediction for sample text: negative and ptpHomDWWvwu_mMEYKu5Ig
Sentiment prediction for sample text: negative and 4HPoV8Ozg77zjdZm6Mf7fg
Sentiment prediction for sample text: negative and 5Ms2MDGy496km1ZUtrqpKA
Sentiment prediction for sample text: neutral and wEKJ683lQ-lTZMdMGzOe6A
Sentiment prediction for sample text: negative and YIXh5GB297HGItFLIpwT-A
Sentiment prediction for sample text: neutral and 3U8eD-0vFHsbTW2KI0suQw
Sentiment prediction for sample text: negative and BnysID2CjgGqvNLgETlowQ
Sentiment prediction for sample text: negative and ODRqiwaNoCMtFFPfkwnk-w
Sentiment prediction for sample text: negative and SOtgU3WuMfT_GOogoahbtA
Sentiment prediction for sample text: negative and IeA0s96STwbx_aXECFHGCQ
Sentiment prediction for sample text: neutral and 9_SxgAwzh7OuBnzrwRtteg
Sentiment prediction for sample text: neutral and a6t0gmyIi5PTDptyCtW8Gg
Sentiment prediction for sample text: negative and anM9ImNnq9uSbnrn9SBDfQ
Sentiment prediction for sample text: nega