In [1]:
import joblib
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline

# 下載數據集
nltk.download('movie_reviews')
nltk.download('stopwords')

# 加載數據集
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# 將數據轉換為文本和標籤
X = [' '.join(words) for words, category in documents]
y = [category for words, category in documents]

# 將標籤轉換為數字
y = [1 if label == 'pos' else 0 for label in y]

# 分割數據集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print()

# 創建管道並訓練模型
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(kernel='linear', C=1.0))
])
model.fit(X_train, y_train)

# 保存模型
joblib.dump(model, 'model.pkl')


[nltk_data] Downloading package movie_reviews to C:\Users\Eleanor
[nltk_data]     Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Eleanor
[nltk_data]     Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!





['model.pkl']

In [3]:
X_train

['brian de palma \' s snake eyes stars nicolas cage \' s evil twin , who confusingly uses the same stage name as his talented brother . like a foreign tourist who screams his lines in english to ensure that he will be understood , cage yells with the ferocity of a man with a bad case of caffeine overload . de palma , whose last great film , the untouchables , was crafted over a decade ago seems to have lost his magic . in snake eyes , he manages to elicit some of the worst performances possible out of a skilled cast . only gary sinise rises slightly above the hackneyed material . the rest of the actors become caricatures in this by - the - numbers thriller . ryuichi sakamoto \' s atmospheric and melodramatic music dominates almost every scene . heavy on the long violin notes , its rhythm is punctuated by thunder . ( the script by de palma and lost world \' s david koepp sets the action during a hurricane in an attempt to pump up the adrenaline level and the noise . only the latter is a

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import joblib  # Corrected import statement

# Load the movie review dataset from NLTK
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

# Create a DataFrame from the NLTK movie review dataset
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
df = pd.DataFrame(documents, columns=['text', 'label'])

# Preprocess the text data
df['text'] = df['text'].apply(lambda x: ' '.join(x))
df['text'] = df['text'].str.lower()

# Split the dataset into train and test sets
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Create a pipeline with TfidfVectorizer and SVC
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', SVC(kernel='linear', C=1.0))
])

# Fit the pipeline on the training data
pipeline.fit(train_df['text'], train_df['label'])

# Serialize the trained model to a file
joblib.dump(pipeline, 'movie_review_model.pkl')

# Load the model from the file
loaded_model = joblib.load('movie_review_model.pkl')

# Example inference
sample_text = "This movie was amazing! I loved every minute of it."
predicted_label = loaded_model.predict([sample_text])[0]
print(f"Predicted label for the sample text: {predicted_label}")
