In [2]:
import nltk
import pandas as pd
import random

In [3]:
from nltk.corpus import movie_reviews

#nltk.download('movie_reviews')

In [4]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

documents = [(movie_reviews.raw(fileid), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [5]:
df = pd.DataFrame(documents, columns=['review_text', 'label'])
random.shuffle(documents)


In [6]:
from nltk.corpus import stopwords
#nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

In [7]:
from nltk.tokenize import word_tokenize

def preprocess(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [w for w in words if w.isalnum() and w not in stop_words]
    return ' '.join(words)
df['cleaned_review'] = df['review_text'].apply(preprocess)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tf_idf_vectorizer = TfidfVectorizer(max_features=50)
tfidf_matrix = tf_idf_vectorizer.fit_transform(df['cleaned_review'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

In [10]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import joblib

In [11]:
x = tfidf_df
y = df['label']

In [12]:
kf = KFold(n_splits=5,shuffle=True, random_state=42)

In [13]:
accuracies = []
best_accuracy = 0
best_model = None


In [14]:
fold = 1
for train_idx, test_idx in kf.split(x):
    x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model = LogisticRegression(max_iter=1000)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print(f"Fold {fold} Accuracy: {score:.4f}")
    accuracies.append(score)

    if score > best_accuracy:
        best_accuracy = score
        best_model = model
        joblib.dump(best_model, "best_logistic_model.pkl")  # Save the best model

    fold += 1


Fold 1 Accuracy: 0.6675
Fold 2 Accuracy: 0.6875
Fold 3 Accuracy: 0.6900
Fold 4 Accuracy: 0.7300
Fold 5 Accuracy: 0.7050


In [15]:
print("\nK-Fold Mean Accuracy:", np.mean(accuracies))
print("K-Fold Best Accuracy:", best_accuracy)


K-Fold Mean Accuracy: 0.696
K-Fold Best Accuracy: 0.73


In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print("Holdout Accuracy:", accuracy_score(y_test, y_pred))

Holdout Accuracy: 0.6675


In [19]:
import pickle

# Assuming `X_tfidf` is your TF-IDF features and `y` is your label column
with open('tfidf_features.pkl', 'wb') as f:
    pickle.dump(tfidf_df, f)

with open('labels.pkl', 'wb') as f:
    pickle.dump(df['label'], f)
