<a href="https://colab.research.google.com/github/ShraddhaSharma24/Natural-Language-Processing/blob/main/Basic_NLP_(Classical_Approach).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Basic NLP (Classical Approach)

Key Concepts:

1.   Text preprocessing: Tokenization, stopword removal, stemming/lemmatization

2.   Feature extraction: Bag of Words, TF-IDF


3.   Traditional models: Naive Bayes, Logistic Regression, SVM

Tools:

NLTK, spaCy, Scikit-learn

In [1]:
pip install nltk scikit-learn pandas matplotlib




In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load dataset (positive vs negative reviews from newsgroups)
categories = ['rec.autos', 'rec.sport.baseball']
data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

df = pd.DataFrame({'text': data.data, 'label': data.target})
df.head()


Unnamed: 0,text,label
0,\n\n\n\n\nSo after I've flashed my lights at t...,0
1,Giant's have a five man rotation of John Burk...,1
2,"\n\n\n\n\n\nWell, my question still hasn't bee...",1
3,"\nAlso, Alomar got a FAR greater boost from hi...",1
4,\nI thought Bill James' latest book completely...,1


In [3]:
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = re.sub(r'\W', ' ', text)  # remove non-words
    text = text.lower()
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9456066945606695

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.99      0.95       119
           1       0.99      0.90      0.94       120

    accuracy                           0.95       239
   macro avg       0.95      0.95      0.95       239
weighted avg       0.95      0.95      0.95       239



In [5]:
pip install tensorflow_datasets pandas scikit-learn nltk




In [6]:
import tensorflow_datasets as tfds
import pandas as pd

# Load dataset (as supervised = gives (text, label) pairs)
ds_train, ds_test = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True)

# Convert to Pandas for easier handling
train_texts, train_labels = [], []
for text, label in tfds.as_numpy(ds_train):
    train_texts.append(text.decode('utf-8'))
    train_labels.append(int(label))

test_texts, test_labels = [], []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(int(label))

df_train = pd.DataFrame({'text': train_texts, 'label': train_labels})
df_test = pd.DataFrame({'text': test_texts, 'label': test_labels})




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.9XG75P_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.9XG75P_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.9XG75P_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [7]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove punctuation and numbers
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df_train['clean_text'] = df_train['text'].apply(clean_text)
df_test['clean_text'] = df_test['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(df_train['clean_text'])
X_test = vectorizer.transform(df_test['clean_text'])

clf = LogisticRegression(max_iter=200)
clf.fit(X_train, df_train['label'])

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(df_test['label'], y_pred))
print(classification_report(df_test['label'], y_pred))


Accuracy: 0.88144
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

