In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

In [20]:
# Fetch the NLTK resources this notebook relies on. nltk.download() leaves a
# package alone when the local copy is already up to date.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer models from 'punkt_tab' rather
# than the pickled 'punkt' package; without it word_tokenize() raises
# LookupError. Downloading it is harmless on older NLTK versions.
nltk.download('punkt_tab')

# Set of English stopwords, used for membership tests in preprocess_text().
stop_words = set(stopwords.words('english'))
# Shared Porter stemmer instance, reused for every token.
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tuntu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tuntu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens found
    in the module-level ``stop_words`` set are dropped, and every remaining
    token is stemmed with the module-level Porter stemmer ``ps``.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving stems joined by single spaces, in their original order.
    """
    stems = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue  # stopword: contributes nothing to the output
        stems.append(ps.stem(token))
    return ' '.join(stems)

In [22]:
# Parallel lists: music_reviews[i] holds the preprocessed text of the i-th
# review and labels[i] its class (1 for 'positive', 0 for any other label).
music_reviews = []
labels = []

# Each non-blank line is expected to look like "<label>\t<review text>".
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used -- confirm it matches how music_reviews.txt was written.
with open('music_reviews.txt', 'r') as file:
    for line_number, line in enumerate(file, start=1):
        record = line.strip()
        if not record:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # data; skip them instead of failing on the unpacking below.
            continue
        # Split on the FIRST tab only, so a review that itself contains tabs
        # is kept whole rather than breaking the two-field unpacking.
        label, tab, review = record.partition('\t')
        if not tab:
            # Malformed line: fail loudly (same exception type as before),
            # but say where the problem is.
            raise ValueError(
                "music_reviews.txt line {}: expected '<label><TAB><review>'".format(line_number)
            )
        music_reviews.append(preprocess_text(review))
        labels.append(int(label == 'positive'))  # Convert 'positive' to 1 and 'negative' to 0

In [23]:
# Turn the preprocessed reviews into a TF-IDF feature matrix; max_features=5000
# caps the size of the vocabulary the vectorizer keeps.
# NOTE(review): the vectorizer is fitted on ALL reviews, including the ones
# that train_test_split later holds out as test data, so the reported accuracy
# may be slightly optimistic. Consider fitting on the training split only.
vectorizer = TfidfVectorizer(max_features=5000)
# fit_transform learns the vocabulary/IDF weights and returns the matrix X
# (one row per entry of music_reviews).
X = vectorizer.fit_transform(music_reviews)

In [24]:
# Hold out 20% of the samples for evaluation; random_state=42 makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
# Logistic regression with scikit-learn's default hyperparameters.
model = LogisticRegression()
# Train on the training split only; as the last expression of the cell, the
# fitted estimator is also what the notebook displays.
model.fit(X_train, y_train)

In [33]:
# Score the held-out split: predict a label for every test row, then compare
# against the true labels.
y_pred = model.predict(X_test)
# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
# Report the metric so the cell shows a result instead of computing it silently.
print("Accuracy:", accuracy)

In [34]:
# Persist the classifier together with its fitted vectorizer as a single
# (model, vectorizer) tuple, so both can be restored with one pickle.load().
# Only unpickle this file from a trusted source: pickle can run arbitrary code.
trained_artifacts = (model, vectorizer)
with open('music_review_model.pkl', 'wb') as model_file:
    pickle.dump(trained_artifacts, model_file)