# 1. Import libraries

In [10]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [11]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True



*  **pandas**: A library for data manipulation and analysis.
*   **nltk**: Netural Language Toolkit, used for text processing
*  **sklearn.model_selection.train_test_split**: Splits the data into training and testing sets.
* **sklearn.feature_extraction.text.CountVectorizer**: Converts text data into a matrix of token counts.
* **sklearn.naive_bayes.MultinomialINB**: Naive bayes classifier for multinomial model.
* **sklearn.metrics**: Function to measure the accuracy and performance of the model.
* **nltk.download**: Download necessary dataset for NLTK, such as tokenizers and stop words.


# 2. Load and Preprocess Data

In [12]:
# from google.colab import files
# files.upload()

In [13]:
from google.colab import drive
drive.mount("/content/drive")
#df = pd.read_csv('/content/drive/MyDrive/DataSets/Movie.xlsx')
#df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/DataSets/Movie.csv')
df.head()

Unnamed: 0,review,sentiment
0,I like movie,Positive
1,I hate movie,Negative
2,A great movie. I like movie,Positive
3,Poor Acting,Negative
4,Great Acting,Positive


In [15]:
# Text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum()]
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['review'] = df['review'].apply(preprocess_text)

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


* **Sample data**: Creates a simple dataset with text and corresponding sentimental labels.
* **DataFrame**: Converts the sample data into a pandas Dataframe.
* **Preprocessing**:
    * **stopwords.word('english')**: Load a list common stopwords in English.
    * **word_tokenize(text)**: Tokenizes the text into words.
    * The preprocessing functiion 'preprocess_text':
      * Tokenizes the text
      * Removes non-alphanumeric tokens.
      * Removes stopwords.
      * Joins the remaining words back into single string.

# 3. Split the data

In [17]:
from re import X
x  = df['review']
y = df['sentiment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

* **x**: The feature set(review data).
* **y**: The target set(sentiment labels).
* **train_test_split**: Splits the data into training and testing sets(80% training, 20% testing).

# 4. Feature Extraction

In [18]:
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

* **CountVectorizer**: Converts the text data into a matrix of token counts.
  * **fit_transform(x_train)**: Fits the vectorizer to the training data and transforms it into a matrix of token counts.
  * **transform(x_test)**: Transform the test data into the same matrix of token counts using the already fitted vectorizer.

# 5. Train a model

In [19]:
model = MultinomialNB()
model.fit(x_train_vectorized, y_train)

* **MultinomiaINB**: Initializes the Naive Bayes Classifier.
* **fit**: Trains the model on the vectorized training data and corresponding labels.

# 6. Evaluate the model

In [20]:
predictions = model.predict(x_test_vectorized)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("Classification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.0
Confusion Matrix:
[[0 1]
 [0 0]]
Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       1.0
    Positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


* **predict**: Makes predictions on the vectorized text data.
* **accuracy_score**:
Calculates the accuracy of the predictions.
* **confusion_matrix": Generates a confusion matrix to show the performance of the model.
* **classification_report**: Provides a detailed classifications report, including precision, recall and f1-score.

# 7. Make Predictions

In [21]:
def predict_sentiment(text):
    text = preprocess_text(text)
    text_vectorized = vectorizer.transform([text])
    prediction = model.predict(text_vectorized)
    return prediction[0]



The sentiment of the sample text is: Positive


In [22]:
sample_text = "I enjoyed this movie a lot"
print(f"The sentiment of the sample text is: {predict_sentiment(sample_text)}")

The sentiment of the sample text is: Positive


* **predict_sentiment**: A function that takes a text input, preprocesses it, vectorized it, and predicts its sentiment using the trained model.
* **sample_text**: A sample text to demonstrate the sentiment prediction.
* **print**: Outputs the predicted sentiment for the sample text.






##### This project provides a basic sentiment analysis tool using Naive Bayes classification and text preprocessing with NLTK and scikit-learn. It covers loading data, preprocessing, feature extraction, model training, evaluation, and making predictions.

In [23]:
!git config --global user.email "sunainaupadhyay79@gmail.com"
!git config --global user.name "Sunaina"