<a href="https://colab.research.google.com/github/SasukeUchiha-05/NLP/blob/main/NLP_P8_22BD1A6612_MVS_Karthik_15_4_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Problem Statement:

#### 8.	Build a text classifier: Use a dataset with labeled text (e.g., sentiment analysis).
- i.	Implement a Bag-of-Words (BoW) model.
- ii.	Train and evaluate a classifier.

### Step 1: Import all necessary libraries and datasets.

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import movie_reviews

In [None]:
# Download the NLTK resources this notebook relies on:
#   movie_reviews - the labelled review corpus loaded below
#   stopwords     - the English stop-word list used during preprocessing
#   wordnet       - the lexical database behind WordNetLemmatizer
#   omw-1.4       - some NLTK releases need this before WordNetLemmatizer can
#                   look words up and raise LookupError without it
for resource in ('movie_reviews', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\tanay\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tanay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Step 2: Load dataset.

In [None]:
# Load IMDB movie reviews dataset from nltk
def load_imdb_dataset(seed=42):
    """Load the NLTK ``movie_reviews`` corpus as shuffled ``(text, label)`` pairs.

    Args:
        seed: Seed for the shuffle. The default makes the row order, and
            therefore the later train/test split and reported metrics,
            reproducible across kernel restarts. Pass ``None`` to get a
            fresh, non-deterministic order on every call.

    Returns:
        list[tuple[str, int]]: One ``(raw_review_text, label)`` tuple per
        corpus file, where the label is 1 for files in the 'pos' category
        and 0 for files in the 'neg' category.
    """
    labelled_reviews = [
        (movie_reviews.raw(fileid), label)
        for category, label in (('pos', 1), ('neg', 0))
        for fileid in movie_reviews.fileids(category)
    ]
    # A local Generator keeps the shuffle reproducible without reading or
    # modifying NumPy's global random state.
    rng = np.random.default_rng(seed)
    return [labelled_reviews[i] for i in rng.permutation(len(labelled_reviews))]

In [None]:
# Fetch the shuffled (text, label) pairs, then transpose them into two
# parallel tuples so each becomes one DataFrame column.
dataset = load_imdb_dataset()
texts, labels = (tuple(column) for column in zip(*dataset))
df = pd.DataFrame(
    {
        'text': texts,
        'label': labels,
    }
)

### Step 3: Preprocess Data.

#### This includes:
- Lowercasing
- Removing numbers from the review text
- Punctuation Removal
- Lemmatization (Getting root word)
- Stop Word Removal (Removing unnecessary words).

In [None]:
# Shared preprocessing helpers, built once and reused for every review:
# a WordNet-backed lemmatizer and the English stop-word list as a set
# (set membership checks are cheap inside the per-token loop).
lemmatizer = WordNetLemmatizer()
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)


In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete runs of digits, delete the
    characters in ``string.punctuation``, split on whitespace, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the
    remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw review text.

    Returns:
        str: The surviving lemmas joined by single spaces (empty string if
        no token survives).
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = without_digits.translate(punctuation_table)

    lemmas = []
    for token in without_punctuation.split():
        # Stop words are filtered on the surface form, before lemmatization.
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


In [None]:
# Clean every review and keep the result in a new column, leaving the raw
# 'text' column untouched.
raw_reviews = df['text']
df['clean_text'] = raw_reviews.map(preprocess_text)

### Step 4: Convert the text to numerical features using TF-IDF (Term Frequency–Inverse Document Frequency), a weighted Bag-of-Words representation

In [None]:
# Turn the cleaned reviews into a sparse TF-IDF matrix.
# max_features=5000 caps the vocabulary size for efficiency.
# The vectorizer is fitted on every row of df['clean_text'].
cleaned_reviews = df['clean_text']
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(cleaned_reviews)

### Step 5: Split the data and train a Naive Bayes classifier.

In [None]:
# Split the cleaned text rather than an already-vectorized matrix, so the
# held-out reviews stay unseen while the TF-IDF vocabulary and IDF weights
# are learned. Fitting the vectorizer on all rows before splitting leaks
# test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.3, random_state=42
)

# Fit TF-IDF on the training reviews only, then encode the test reviews with
# that same fitted vocabulary/IDF. Refitting `vectorizer` itself keeps it
# consistent with `classifier` for any later `vectorizer.transform(...)` use.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Make predictions on the held-out reviews
y_pred = classifier.predict(X_test)

### Step 6: Evaluating Model Performance.

In [None]:
# Score the held-out predictions: overall accuracy first, then the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8116666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.86      0.82       295
           1       0.85      0.77      0.81       305

    accuracy                           0.81       600
   macro avg       0.81      0.81      0.81       600
weighted avg       0.81      0.81      0.81       600

