In [1]:
import nltk

print("NLTK version:", nltk.__version__)

# Fetch the NLTK resources requested by this notebook. The final download is
# left as a bare expression so the cell still displays its return value.
for resource in ("punkt", "movie_reviews"):
    nltk.download(resource)
nltk.download("stopwords")

NLTK version: 3.9.2


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset: raw review text, positives (label 1) first, then negatives
# (label 0). The order matters because the seeded split below depends on it.
positive_ids = movie_reviews.fileids("pos")
negative_ids = movie_reviews.fileids("neg")
docs = [movie_reviews.raw(fid) for fid in positive_ids + negative_ids]
labels = [1] * len(positive_ids) + [0] * len(negative_ids)

# Hold out 20% of the reviews for evaluation, keeping the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# TF-IDF features feeding a logistic-regression classifier.
vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
classifier = LogisticRegression(max_iter=2000)
model = Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

# Fit on the training portion only.
model.fit(X_train, y_train)

# Score the held-out reviews.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))

# Sanity-check the model on a couple of hand-written reviews.
samples = [
    "This movie was amazing and very inspiring.",
    "Worst movie ever, completely boring."
]
for sample in samples:
    if model.predict([sample])[0] == 1:
        verdict = "positive"
    else:
        verdict = "negative"
    print(sample, "->", verdict)

Accuracy: 0.8225
This movie was amazing and very inspiring. -> positive
Worst movie ever, completely boring. -> negative


In [4]:
# Download the "punkt_tab" tokenizer resource in addition to the "punkt"
# package requested in the first cell.
# NOTE(review): presumably this is what sent_tokenize/word_tokenize look up in
# the NLTK version printed above (3.9.x) — confirm, and consider moving this
# download into the first cell next to the other ones.
import nltk
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter

def summarize_simple(text: str, n_sentences: int = 2) -> str:
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Every sentence is scored by summing, over its alphabetic tokens, how often
    that lower-cased token occurs in the whole text. English stopwords are
    excluded from the frequency table and therefore contribute zero. The sum
    is not normalized by sentence length, so longer sentences tend to score
    higher.

    Args:
        text: The document to summarize.
        n_sentences: Maximum number of sentences to keep. Zero or a negative
            value yields an empty summary; ``None`` keeps every sentence.

    Returns:
        The selected sentences in their original document order, joined by a
        single space, or ``""`` when no sentence is selected.
    """
    # A negative count would otherwise reach the slice below and drop the
    # lowest-scoring sentences instead of selecting none.
    if n_sentences is not None and n_sentences <= 0:
        return ""

    stop_words = set(stopwords.words("english"))

    def alpha_tokens(fragment):
        # Lower-cased tokens made up of letters only (no digits/punctuation).
        return [tok.lower() for tok in word_tokenize(fragment) if tok.isalpha()]

    sentences = sent_tokenize(text)
    freq = Counter(tok for tok in alpha_tokens(text) if tok not in stop_words)

    # (score, position, sentence): position is kept so the winners can be put
    # back into document order afterwards.
    scored = [
        (sum(freq[tok] for tok in alpha_tokens(sentence)), position, sentence)
        for position, sentence in enumerate(sentences)
    ]

    # Tuples compare by score first, then position; with reverse=True a tie on
    # score is therefore won by the sentence that appears later in the text.
    top = sorted(scored, reverse=True)[:n_sentences]
    return " ".join(sentence for _, _, sentence in sorted(top, key=lambda item: item[1]))

# Four-sentence sample paragraph used to demonstrate the summarizer.
text = "".join([
    "Natural language processing helps computers understand human language. ",
    "It is used in chatbots, sentiment analysis, translation and spam detection. ",
    "Preprocessing like tokenization and cleaning improves accuracy. ",
    "Machine learning models can classify and summarize text data.",
])

# Show the input first, then its two-sentence summary.
print("Original:\n", text)
summary = summarize_simple(text, n_sentences=2)
print("\nSummary:\n", summary)

Original:
 Natural language processing helps computers understand human language. It is used in chatbots, sentiment analysis, translation and spam detection. Preprocessing like tokenization and cleaning improves accuracy. Machine learning models can classify and summarize text data.

Summary:
 Natural language processing helps computers understand human language. Machine learning models can classify and summarize text data.


In [6]:
import os
import sys

# Report the kernel's current working directory and the interpreter running it.
print("Notebook working dir:", os.getcwd())
print("Python exe:", sys.executable)

Notebook working dir: C:\Users\Lenovo
Python exe: C:\Users\Lenovo\AppData\Local\Python\pythoncore-3.14-64\python.exe
