# Sentiment Analysis

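This notebook walks through building a sentiment analysis model for text reviews: loading the raw data, cleaning the review text, splitting it into training and test sets, vectorizing it with TF-IDF, training a logistic regression classifier, and evaluating it on the held-out test set.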

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

## Load Data

In [2]:
# Load the raw reviews; later cells read the 'review' and 'sentiment' columns.
df = pd.read_csv('../data/raw/reviews.csv')

# Fail fast with a clear message if the expected columns are absent, rather
# than surfacing a bare KeyError several cells further down.
missing_columns = {'review', 'sentiment'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f'reviews.csv is missing required columns: {sorted(missing_columns)}'
    )

df.head()

## Preprocess Data

In [3]:
import re
import string

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    then collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: The raw review. ``None`` and float NaN (how pandas represents a
            missing value in an object column) are treated as an empty review;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text)
    # Deleting punctuation and digits leaves gaps such as "great  movie";
    # split/join collapses internal whitespace runs and strips both ends.
    return ' '.join(text.split())

# Clean every review element-wise. The column keeps its name because the
# split step below reads df['review'].
df['review'] = df['review'].map(preprocess_text)
df.head()

## Split Data

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

## Vectorize Text Data

In [5]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the test reviews, so the test set does not
# influence the learned features.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
(X_train_tfidf.shape, X_test_tfidf.shape)

## Train Model

In [6]:
# Fit a logistic regression classifier (default hyperparameters) on the
# TF-IDF training features. fit() returns the estimator itself, so the
# construction and fit can be chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)
model

## Evaluate Model

In [7]:
# Predict on the held-out test features, then report overall accuracy
# followed by the classification report.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

report = classification_report(y_test, y_pred)
print(report)

# Leave the raw accuracy value as the cell's displayed result.
accuracy

'0.90'