In [None]:
from datasets import load_dataset

# Load the "yelp_review_full" dataset; later cells index the result by split
# name (e.g. dataset["train"]).
dataset = load_dataset(path="yelp_review_full")

In [None]:
# Peek at one raw review. Index the row first and the column second so only
# that single record is read, instead of decoding the entire "text" column
# just to pick element 599 out of it.
dataset["train"][599]["text"]

In [33]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [47]:
# Work on a prefix of the training split so preprocessing and training stay quick.
N_SAMPLES = 10000

# Slice the split *before* building the DataFrame:
#  - only the first N_SAMPLES rows are materialised, not the full columns;
#  - `df` is a freshly built frame rather than a slice of a bigger one, so the
#    later in-place `df['text'] = ...` assignment cannot trigger pandas'
#    SettingWithCopyWarning.
# Assumes the split exposes 'text' and 'label' columns.
train_subset = dataset['train'][:N_SAMPLES]
df = pd.DataFrame({'text': train_subset['text'], 'label': train_subset['label']})

print(df.count())

text     10000
label    10000
dtype: int64


In [None]:
# The visible import cell only pulls names out of nltk submodules
# (`from nltk.corpus import ...`), which does not bind the name `nltk` itself,
# so import the package here before calling nltk.download.
import nltk

# Resources backing the stop-word list, tokenizer and lemmatizer used in the
# preprocessing cell.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Shared preprocessing resources, built once and reused for every review
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks inside the per-token loop
stop_words = {word for word in stopwords.words('english')}

# Define the preprocess_text function
def preprocess_text_with_progress(text):
    """Normalise one review into a space-separated string of content lemmas.

    Pipeline: tokenize -> keep purely alphabetic tokens -> lowercase ->
    drop stop words -> lemmatize -> drop lemmas that are stop words.

    Stop words are filtered *before* lemmatization as well as after it.
    Called without a part-of-speech tag, the WordNet lemmatizer treats every
    token as a noun, so function words such as "was" / "has" come back as
    "wa" / "ha". Checking only the lemma against the stop list would let
    those through as noise features.

    The function itself reports no progress; the progress bar comes from the
    ``progress_apply`` call that maps it over the DataFrame column.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when
        nothing survives filtering.
    """
    cleaned = []
    for token in word_tokenize(text):
        # Discard punctuation, numbers and mixed alphanumeric tokens.
        if not token.isalpha():
            continue
        token = token.lower()
        # First pass: drop stop words while they are still in surface form.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Second pass: drop tokens whose lemma collapses onto a stop word.
        if lemma not in stop_words:
            cleaned.append(lemma)
    return ' '.join(cleaned)


# Register tqdm's `progress_apply` on pandas objects, then clean every review.
# The cleaned strings overwrite the raw 'text' column.
tqdm.pandas()
cleaned_reviews = df['text'].progress_apply(preprocess_text_with_progress)
df['text'] = cleaned_reviews

In [49]:
# Feature extraction (TF-IDF)
# Targets come straight from the 'label' column; features are TF-IDF weights
# computed from the cleaned 'text' column.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the next cell, so its vocabulary/IDF statistics also see the
# held-out rows.
y = df['label']
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(raw_documents=df['text'])

In [50]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [52]:
# Predict a label for every held-out review
y_pred = model.predict(X=X_test)

In [None]:
# Score the held-out predictions: overall accuracy plus the text report
# produced by classification_report.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy}')
print(report)

In [None]:
# Classify one hand-written review with the classifier (model) and vectorizer
# (tfidf_vectorizer) trained above; both must still be in memory.

# Sample text to classify
input_text = "Damn this is bad food make my stomach hurt"

# Apply the same preprocessing that was used on the training text
cleaned_text = preprocess_text_with_progress(input_text)

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_vector = tfidf_vectorizer.transform([cleaned_text])

# Make predictions
predicted_label = int(model.predict(input_vector)[0])

# Print the result.
# yelp_review_full labels are star ratings minus one (0 = 1 star ... 4 = 5 stars).
# A binary `== 1` test would therefore report 2-star predictions as positive
# and 4/5-star predictions as negative, so map the rating onto a sentiment.
if predicted_label >= 3:
    print("Positive Sentiment")
elif predicted_label <= 1:
    print("Negative Sentiment")
else:
    print("Neutral Sentiment")

In [None]:
!pip install transformers

In [None]:
!mkdir sentiment_classification

In [60]:
import os
import pickle

# Persist the trained classifier (`model`, the scikit-learn MultinomialNB fitted
# above) with pickle.
# NOTE(review): the fitted tfidf_vectorizer is not saved here, yet it is needed
# to turn text into features at prediction time — consider persisting it too.
model_path = '/sentiment_classification/nltk_sentiment_classification_model.pkl'

# Make sure the target directory exists so the write does not depend on a
# shell cell having created exactly this path beforehand.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
!pip install huggingface_hub

In [None]:
# Authenticate the Hugging Face CLI before the upload cell pushes to the Hub.
# NOTE(review): presumably prompts for an access token interactively — confirm
# this works under a non-interactive "Run All".
!huggingface-cli login

In [84]:
# Read the pickled classifier back from disk
nltk_model_path = '/sentiment_classification/nltk_sentiment_classification_model.pkl'
with open(nltk_model_path, mode='rb') as model_file:
    nltk_model = pickle.loads(model_file.read())


In [74]:
import torch
import pickle

In [85]:
# Write the reloaded estimator object to a ".pt" file via torch.save.
# NOTE(review): this stores the same object under a different extension;
# nothing here converts it into a PyTorch module.
torch.save(obj=nltk_model, f='/sentiment_classification/nltk_sentiment_classification_model.pt')

In [None]:
!huggingface-cli upload faizalnf1800/nltk_sentiment_classification /content/sentiment_classification/nltk_sentiment_classification_model.pkl nltk_sentiment_classification_model.pkl

In [None]:
from huggingface_hub import hf_hub_download

# Fetch the uploaded pickle back from the Hub repository into /content
hf_hub_download(
    repo_id="faizalnf1800/nltk_sentiment_classification",
    filename="nltk_sentiment_classification_model.pkl",
    local_dir="/content",
)

In [None]:
# Load the classifier pickle (relative path, so it is resolved against the
# current working directory).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts from a repository you control.
# The `with` block closes the file handle, which a bare open(...) passed
# straight to pickle.load leaves open.
with open('nltk_sentiment_classification_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

# Only the classifier is restored from the pickle: the fitted tfidf_vectorizer
# and preprocess_text_with_progress must still be available from the cells above.

# Sample text to classify
input_text = "Damn this is bad food make my stomach hurt"

# Apply the same preprocessing that was used on the training text
cleaned_text = preprocess_text_with_progress(input_text)

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_vector = tfidf_vectorizer.transform([cleaned_text])

# Make predictions
predicted_label = int(pickled_model.predict(input_vector)[0])

# Print the result.
# yelp_review_full labels are star ratings minus one (0 = 1 star ... 4 = 5 stars).
# A binary `== 1` test would therefore report 2-star predictions as positive
# and 4/5-star predictions as negative, so map the rating onto a sentiment.
if predicted_label >= 3:
    print("Positive Sentiment")
elif predicted_label <= 1:
    print("Negative Sentiment")
else:
    print("Neutral Sentiment")