# **Install Required Libraries**

In [8]:
!pip install nltk scikit-learn pandas



In [9]:
from sklearn.datasets import load_files
import os

# Download the dataset if it doesn't exist
if not os.path.exists('aclImdb'):
    !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    !tar -xzf aclImdb_v1.tar.gz

# Read the 'pos' and 'neg' folders of the training split as UTF-8 text
reviews = load_files(
    "aclImdb/train/",
    categories=['pos', 'neg'],
    encoding='utf-8',
)
X = reviews.data    # documents
y = reviews.target  # label for each document

--2025-04-04 17:20:09--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-04-04 17:20:25 (5.14 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



# **1. Text Preprocessing**

In [12]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the preprocessing step relies on.
# 'punkt_tab' is requested as well: it is the data package that was otherwise missing.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Load dataset
from sklearn.datasets import load_files
# `reviews` was already read from disk by the data-loading cell above; re-reading
# every review file here only repeats that I/O and yields the same object.
# Rebind X and y from it so this cell's inputs stay explicit.
X, y = reviews.data, reviews.target

# Preprocessing function
# Shared stop-word set and lemmatizer, filled on first use so they are built once
# instead of once per document.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first call."""
    if not _PREPROCESS_CACHE:
        stop_words = frozenset(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE["stop_words"], _PREPROCESS_CACHE["lemmatizer"]


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens that are
    not purely alphabetic, or that appear in NLTK's English stop-word list, are dropped.
    Each remaining token is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The raw document as a string.

    Returns:
        The kept, lemmatized tokens joined by single spaces ("" if none remain).
    """
    stop_words, lemmatizer = _preprocess_resources()
    tokens = nltk.word_tokenize(text.lower())

    # The stop-word check runs on the token as written, before lemmatization.
    cleaned = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

    return " ".join(cleaned)

# Clean every document in X, keeping the original order
X_cleaned = list(map(preprocess_text, X))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# **2. Feature Engineering**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned documents into a TF-IDF matrix, limiting the vocabulary
# to at most 5000 features
vectorizer = TfidfVectorizer(
    max_features=5000,
)
X_vectorized = vectorizer.fit_transform(raw_documents=X_cleaned)


# **3. Model Training**

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split dataset
# Split the cleaned *text* rather than the already-built TF-IDF matrix: that matrix
# was fitted on every review, so its vocabulary and IDF weights include the held-out
# documents and the evaluation would be optimistic (train/test leakage).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_cleaned, y, test_size=0.2, random_state=42
)

# Refit the existing vectorizer (same settings) on the training reviews only, then
# apply that fitted vocabulary to the held-out reviews. `vectorizer` is the object
# later used for custom predictions, so it stays consistent with `model`.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)


# **4. Model Evaluation**

In [15]:
from sklearn.metrics import classification_report

# Score the held-out split and print the per-class report
y_pred = model.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)


              precision    recall  f1-score   support

    Negative       0.88      0.86      0.87      2482
    Positive       0.86      0.89      0.88      2518

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



# **Optional: Predict Custom Input**

In [16]:
def predict_sentiment(text):
    """Classify a raw text string as "Positive" or "Negative".

    The text is cleaned with ``preprocess_text``, transformed with the fitted
    ``vectorizer``, and scored by ``model``. A predicted label of 1 maps to
    "Positive"; any other label maps to "Negative".
    """
    features = vectorizer.transform([preprocess_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# Quick check on a hand-written review
print(predict_sentiment(text="This movie was absolutely amazing!"))


Positive
