In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from google.cloud import aiplatform
from sklearn.pipeline import Pipeline

In [2]:
# Read the reviews CSV (path is relative to the notebook's working directory)
# and print the first rows as a quick check that the columns parsed as expected.
file_path = 'reviews.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
print("✅ Successfully loaded the dataset.", "Dataset preview:", sep="\n")
print(df.head(n=5))

✅ Successfully loaded the dataset.
Dataset preview:
                                         Review Text  sentiment
0  Absolutely wonderful - silky and sexy and comf...          1
1  Love this dress!  it's sooo pretty.  i happene...          1
2  I love, love, love this jumpsuit. it's fun, fl...          1
3  This shirt is very flattering to all due to th...          1
4  I love tracy reese dresses, but this one is no...         -1


In [3]:
# --- Step 2: Split Data into Training and Testing Sets ---
# The model must be evaluated on reviews it never saw during training, so
# 20% of the rows are held out for testing and 80% are used for training.

# Drop rows with a missing review or label before splitting: scikit-learn's
# text vectorizer raises on NaN documents and its classifiers reject NaN
# targets. When nothing is missing this is a no-op and the split is unchanged.
reviews_complete = df.dropna(subset=['Review Text', 'sentiment'])
n_dropped = len(df) - len(reviews_complete)
if n_dropped:
    print(f"Dropped {n_dropped} rows with a missing review text or sentiment label.")

X = reviews_complete['Review Text']
y = reviews_complete['sentiment']

# stratify=y keeps the proportion of each sentiment label the same in the
# training and testing sets; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nData split into {len(X_train)} training samples and {len(X_test)} testing samples.")


Data split into 15854 training samples and 3964 testing samples.


In [4]:
# Chain the text vectorizer and the classifier into a single estimator so the
# raw review strings can be passed straight to fit() and predict().
pipeline_steps = [
    # Bag-of-words token counts, with scikit-learn's built-in English stop words removed.
    ('vectorizer', CountVectorizer(stop_words="english")),
    # max_iter is set to 2000 iterations for the solver.
    ('classifier', LogisticRegression(max_iter=2000)),
]
pipeline = Pipeline(steps=pipeline_steps)

print("Training the entire pipeline...")
# Fits the vectorizer on the training reviews, then the classifier on the
# resulting count matrix.
pipeline.fit(X=X_train, y=y_train)
print("Pipeline Training Complete")

Training the entire pipeline...
Pipeline Training Complete


In [6]:
# Score the fitted pipeline on the held-out test split. accuracy_score is the
# fraction of test reviews whose predicted label equals the true label.
print("\nEvaluating the pipeline on the test data")
prediction = pipeline.predict(X=X_test)
score = accuracy_score(y_true=y_test, y_pred=prediction)

print(f"Model Accuracy on the test set: {score:.4f}")


Evaluating the pipeline on the test data
Model Accuracy on the test set: 0.9294


In [7]:
# Persist the whole fitted pipeline (vectorizer + classifier) as one artifact,
# so it can be reloaded and given raw review text directly.
print("Saving the pipeline to model.joblib...")
joblib.dump(value=pipeline, filename='model.joblib')

Saving the pipeline to model.joblib...


['model.joblib']