<a href="https://colab.research.google.com/github/SadiyaMayat/NLP/blob/main/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Download the NLTK data files this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Newer NLTK
#     releases look up 'punkt_tab' instead of the older pickled 'punkt', so
#     both are fetched to keep the notebook runnable on either.
#   - 'stopwords': the English stopword list.
#   - 'wordnet': the lexical database behind WordNetLemmatizer.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Path to your dataset (relative to the notebook's working directory)
file_path = 'IMDB Dataset.csv'

# Load the dataset
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Information:")
df.info()

# Display summary statistics of the dataset
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(df.isnull().sum())

First few rows of the dataset:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None

Summary Statistics:
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It 

In [None]:
# English stopwords held in a set so membership checks in preprocess_text are cheap
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused for every token
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: strip HTML tags, replace punctuation with spaces,
    lowercase, tokenize, drop English stopwords, lemmatize, re-join.

    Parameters
    ----------
    text : str
        Raw review text; may contain HTML markup such as ``<br />``.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # Replace HTML tags with a space (not an empty string) so the words on
    # either side of a tag such as "great.<br /><br />The" are not glued together.
    text = re.sub(r'<.*?>', ' ', text)
    # Replace punctuation and special characters with a space as well. This keeps
    # hyphenated words separable and splits contractions at the apostrophe
    # ("you're" -> "you re"), so both pieces can be matched by the stopword
    # filter below instead of surviving as a fused token like "youre".
    text = re.sub(r'[^\w\s]', ' ', text)
    # Convert to lowercase so stopword matching is case-insensitive
    text = text.lower()
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    return ' '.join(words)

# Run every raw review through preprocess_text and keep the result alongside the original
df['cleaned_review'] = df['review'].map(preprocess_text)

# Encode the sentiment strings as integers: positive -> 1, negative -> 0
sentiment_to_label = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(sentiment_to_label)

# Preview the frame now that it carries the cleaned text and numeric label
print("First few rows of the cleaned dataset with numerical labels:", df.head(), sep="\n")

First few rows of the cleaned dataset with numerical labels:
                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  label  
0  one reviewer mentioned watching 1 oz episode y...      1  
1  wonderful little production filming technique ...      1  
2  thought wonderful way spend time hot summer we...      1  
3  basically there family little boy jake think t...      0  
4  petter matteis love time money visually stunni...      1  


In [None]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features

# Fit and transform the cleaned review texts (result is a SciPy sparse matrix)
tfidf_features = tfidf_vectorizer.fit_transform(df['cleaned_review'])

# Display the shape of the TF-IDF features
print("Shape of TF-IDF features:", tfidf_features.shape)

# Wrap the TF-IDF features in a DataFrame for inspection.
# The matrix is kept sparse: calling .toarray() on the full matrix would
# materialize every zero as a dense float64 cell (rows x 5000 columns) just to
# look at a handful of rows.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_features,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Display the first few rows of the TF-IDF features
print("First few rows of TF-IDF features:")
print(tfidf_df.head())

Shape of TF-IDF features: (50000, 5000)
First few rows of TF-IDF features:
         10  100  1000  1010   11  110   12   13  13th   14  ...  youd  \
0  0.000000  0.0   0.0   0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...   0.0   
1  0.000000  0.0   0.0   0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...   0.0   
2  0.000000  0.0   0.0   0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...   0.0   
3  0.079199  0.0   0.0   0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...   0.0   
4  0.000000  0.0   0.0   0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...   0.0   

     youll     young  younger     youre  youth  youve  zero   zombie  zone  
0  0.05819  0.000000      0.0  0.000000    0.0    0.0   0.0  0.00000   0.0  
1  0.00000  0.000000      0.0  0.000000    0.0    0.0   0.0  0.00000   0.0  
2  0.00000  0.080393      0.0  0.000000    0.0    0.0   0.0  0.00000   0.0  
3  0.00000  0.000000      0.0  0.081945    0.0    0.0   0.0  0.11441   0.0  
4  0.00000  0.000000      0.0  0.000000    0.0    0.0   0.0  0.00000   0.0  

[5 rows x 5000 co

In [None]:
# Split the dataset into training (70%) and testing (30%) sets.
# The split is made on the cleaned *text*, and the TF-IDF vectorizer is then
# fitted on the training portion only. Fitting the vectorizer on the whole
# corpus before splitting would let test-set vocabulary and document
# frequencies leak into the features the model is evaluated on.
train_text, test_text, y_train, y_test = train_test_split(
    df['cleaned_review'], df['label'], test_size=0.3, random_state=42
)

# NOTE: this re-fits tfidf_vectorizer, so from here on its vocabulary/IDF
# reflect the training reviews only.
X_train = tfidf_vectorizer.fit_transform(train_text)
# The test reviews are only transformed with the train-fitted vocabulary.
X_test = tfidf_vectorizer.transform(test_text)

# Display the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (35000, 5000)
Shape of X_test: (15000, 5000)
Shape of y_train: (35000,)
Shape of y_test: (15000,)


In [None]:
# Build a Multinomial Naive Bayes model with default settings and fit it on
# the training split in one step (fit() returns the estimator itself)
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Echo the fitted estimator as the cell's output
nb_classifier

In [None]:
# Score the held-out reviews with the trained classifier
y_pred = nb_classifier.predict(X_test)

# Overall fraction of correct predictions, shown to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision/recall/F1 followed by the raw confusion matrix,
# each printed under its own heading
evaluation_sections = (
    ("Classification Report:", classification_report(y_test, y_pred)),
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
)
for heading, result in evaluation_sections:
    print(heading)
    print(result)

Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      7411
           1       0.85      0.86      0.85      7589

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000

Confusion Matrix:
[[6276 1135]
 [1085 6504]]
