In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and non-alphabetic tokens, and lemmatize each token
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalpha()]
    
    # Get the POS tags for each token
    pos_tags = nltk.pos_tag(tokens)
    pos_tags = [tag for _, tag in pos_tags]
    
    # Concatenate the tokens and POS tags into a single string
    return ' '.join(tokens + pos_tags)


In [4]:
data = pd.read_csv('/workspaces/codespaces-jupyter/data/IMDB Dataset.csv')

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
data['review'] = data['review'].str.replace('[^\w\s]','', regex=True).str.lower()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)

In [8]:
# Extract features using bag of words model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [9]:
# Train the Naive Bayes model
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

In [10]:
# Test the Naive Bayes model
y_pred = clf.predict(X_test_vec)

In [11]:
# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='positive')
recall = recall_score(y_test, y_pred, pos_label='positive')

In [12]:
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

Accuracy:  0.8496
Precision:  0.8731264513405109
Recall:  0.8207977773367732


In [13]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8496


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

# assume that X_train and y_train are the training data and labels
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# create and train the Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# evaluate the performance of the classifier on the test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

Accuracy: 0.85, Precision: 0.85, Recall: 0.85


In [15]:
report = classification_report(y_test, y_pred)
print('Classification Report:\n', report)

Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.88      0.85      4961
    positive       0.87      0.82      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [16]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Load data
data = pd.read_csv('/workspaces/codespaces-jupyter/data/IMDB Dataset.csv')
X = data['review']
y = data['sentiment']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Predict on testing data
y_pred = pipeline.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='positive')
recall = recall_score(y_test, y_pred, pos_label='positive')
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Accuracy: 0.8488
Precision: 0.8730696001692405
Recall: 0.8190117086723556


In [18]:
y_pred = pipeline.predict(X_test)

In [19]:
import pickle

# Assuming that you've already trained your model and stored it in a variable called 'pipeline'

# Save the trained pipeline to file using pickle
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [20]:
with open('/workspaces/codespaces-jupyter/sentiment_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [21]:
import gradio as gr
import pickle

# load the trained pipeline from file using pickle
with open('/workspaces/codespaces-jupyter/sentiment_model.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

# define a function to make sentiment predictions on input text
def predict_sentiment(text):
    sentiment = loaded_pipeline.predict([text])[0]
    return sentiment

# define the Gradio interface
input_review = gr.inputs.Textbox(label="Input Review")
output_sentiment = gr.outputs.Label(label="Sentiment")

gr.Interface(fn=predict_sentiment, inputs=input_review, outputs=output_sentiment, title="Sentiment Analysis Model"  ).launch()



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


