# 🧪 NLP Preprocessing + Classification (Logistic Regression)

This notebook covers:
- Text preprocessing (lowercasing, digit and punctuation removal, tokenizing, stop-word removal, lemmatizing)
- TF-IDF vectorization
- Sentiment classification using Logistic Regression

In [None]:
# ✅ Step 1: Install & Import Libraries
!pip install nltk scikit-learn textblob pandas --quiet

import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# NLTK data needed by the preprocessing step:
#   punkt / punkt_tab -> tokenizer models used by nltk.word_tokenize
#   stopwords         -> English stop-word list
#   wordnet           -> lexical database behind WordNetLemmatizer
# Newer NLTK releases (3.9 and later) load 'punkt_tab' instead of the pickled
# 'punkt' models; both are fetched so the notebook runs on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# ✅ Step 2: Sample Dataset
# Each entry pairs a review with its sentiment label (1 = positive, 0 = negative).
labelled_reviews = [
    ("I loved the movie, it was amazing!", 1),
    ("Terrible movie, I hated it.", 0),
    ("It was an okay movie, not bad.", 1),
    ("Absolutely fantastic acting and story.", 1),
    ("Waste of time, very boring.", 0),
    ("Best movie I've seen in years!", 1),
    ("Not worth watching, very poor.", 0),
]

# Column-oriented view of the same pairs, in the shape pandas expects.
data = {
    "review": [review_text for review_text, _sentiment in labelled_reviews],
    "label": [sentiment for _review_text, sentiment in labelled_reviews],
}

df = pd.DataFrame(data)
df

In [None]:
# ✅ Step 3: Preprocessing Function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Negation words are on NLTK's English stop-word list, but removing them flips
# the meaning of a review ("not bad" -> "bad"), which hurts sentiment labels.
NEGATION_WORDS = frozenset({'no', 'nor', 'not'})

# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text, keep_negations=True):
    """Normalise one review into a space-separated string of lemmas.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    tokenize with ``nltk.word_tokenize``, drop English stop words, and
    lemmatize the remaining tokens with ``WordNetLemmatizer`` (default
    part of speech).

    Parameters
    ----------
    text : str
        Raw review text.
    keep_negations : bool, default True
        When True, the words in ``NEGATION_WORDS`` survive stop-word removal
        so that phrases such as "not bad" keep their polarity. Pass False to
        filter against the full stop-word list.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(text)
    kept = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words or (keep_negations and word in NEGATION_WORDS)
    ]
    return ' '.join(kept)

In [None]:
# ✅ Step 4: Apply Preprocessing
# Clean every raw review and store the result next to the original text.
df['clean_review'] = df['review'].apply(preprocess)

# Preview raw text, cleaned text and label side by side.
preview_columns = ['review', 'clean_review', 'label']
df[preview_columns]

In [None]:
# ✅ Step 5: Train/Test Split
# Stratify on the label: with only a handful of reviews an unstratified shuffle
# can leave one class out of the training set, which LogisticRegression cannot
# fit, or out of the test set, which makes the evaluation meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# ✅ Step 6: Vectorization
# Learn the vocabulary and IDF weights from the training reviews only, so that
# nothing about the test reviews leaks into the features.
vectorizer = TfidfVectorizer().fit(X_train)

# Project both splits into the feature space learned above.
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# ✅ Step 7: Train Classifier
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression().fit(X_train_vec, y_train)
model

In [None]:
# ✅ Step 8: Evaluate
y_pred = model.predict(X_test_vec)

# The test split is tiny, so a class may receive no predictions at all.
# labels/target_names keep both rows in the report under readable names
# (0 = negative, 1 = positive), and zero_division=0 reports undefined
# precision/recall as 0 instead of emitting UndefinedMetricWarning.
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['negative', 'positive'],
        zero_division=0,
    )
)