In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
try:
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('stopwords', quiet=True)
    # Add download for punkt_tab to ensure word_tokenize has necessary resources
    nltk.download('punkt_tab', quiet=True)
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")
    pass  # Handle case where downloads might fail

In [97]:
df = pd.read_csv('movie_reviews_sentiment.csv')
df.head()

Unnamed: 0,review,sentiment
0,Poor,0
1,Made no sense at all.,0
2,A must-watch for everyone.,1
3,Painful to sit through.,0
4,Painful to sit through.,0


In [98]:
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,1332
0,666


In [99]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [100]:
df['review_length'] = df['review'].apply(len)
df['review_length'].describe()

Unnamed: 0,review_length
count,1998.0
mean,14.802302
std,8.606542
min,3.0
25%,7.0
50%,13.5
75%,23.0
max,34.0


In [101]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Add movie-specific stopwords
    movie_stopwords = {'movie', 'film', 'watch', 'seen', 'see'}
    stop_words.update(movie_stopwords)

    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(filtered_tokens)

In [102]:
df['processed_review'] = df['review'].apply(preprocess_text)
df

Unnamed: 0,review,sentiment,review_length,processed_review
0,Poor,0,4,poor
1,Made no sense at all.,0,21,made sense
2,A must-watch for everyone.,1,26,mustwatch everyone
3,Painful to sit through.,0,23,painful sit
4,Painful to sit through.,0,23,painful sit
...,...,...,...,...
1993,Superb,1,6,superb
1994,A stunning piece of cinema.,1,27,stunning piece cinema
1995,Regret watching it.,0,19,regret watching
1996,Loved every minute of it.,1,25,loved every minute


In [103]:
X_train, X_test, y_train, y_test = train_test_split(df["processed_review"], df["sentiment"], test_size=0.2, random_state=42)

In [104]:
len(X_train)

1598

In [105]:
len(X_test)

400

In [106]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [107]:
vectorizers = {
    'TF-IDF': TfidfVectorizer(),
    'Count Vectorizer': CountVectorizer()
}

In [108]:
results = []

for vectorizer_name, vectorizer in vectorizers.items():
    for model_name, model in models.items():
        print(f"\nTraining {model_name} with {vectorizer_name}...")

        # Create pipeline
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', model)
        ])

        # Train model
        pipeline.fit(X_train, y_train)

        # Evaluate model
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        results.append({
            'Vectorizer': vectorizer_name,
            'Model': model_name,
            'Accuracy': accuracy
        })

        print(f"Accuracy: {accuracy:.4f}")


Training Logistic Regression with TF-IDF...
Accuracy: 1.0000

Training SVM with TF-IDF...
Accuracy: 1.0000

Training Random Forest with TF-IDF...
Accuracy: 1.0000

Training Logistic Regression with Count Vectorizer...
Accuracy: 1.0000

Training SVM with Count Vectorizer...
Accuracy: 1.0000

Training Random Forest with Count Vectorizer...
Accuracy: 1.0000


In [109]:
model_params = {
    'svm': {
        'model': Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('classifier', SVC(gamma='auto'))
        ]),
        'params': {
            'classifier__C': [1, 10, 20],
            'classifier__kernel': ['rbf', 'linear'],
            'tfidf__max_features': [5000, 10000],
            'tfidf__ngram_range': [(1, 1), (1, 2)]
        }
    },
    'random_forest': {
        'model': Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('classifier', RandomForestClassifier())
        ]),
        'params': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20],
            'tfidf__max_features': [5000, 10000],
            'tfidf__ngram_range': [(1, 1), (1, 2)]
        }
    },
    'logistic_regression': {
        'model': Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('classifier', LogisticRegression(solver='liblinear', multi_class='auto', max_iter=1000))
        ]),
        'params': {
            'classifier__C': [0.1, 1, 5, 10],
            'tfidf__max_features': [5000, 10000],
            'tfidf__ngram_range': [(1, 1), (1, 2)]
        }
    }
}

In [110]:
scores = []
best_score = 0
best_model = None

for model_name, mp in model_params.items():
    print(f"\nTraining {model_name}...")


    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        return_train_score=False,
        n_jobs=-1
    )

    try:
        clf.fit(X_train, y_train)

        # Evaluate on test set
        y_pred = clf.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Save results
        scores.append({
            'model': model_name,
            'best_score': clf.best_score_,  # CV score
            'test_accuracy': test_accuracy,  # Test set score
            'best_params': clf.best_params_
        })

        print(f"Model: {model_name}")
        print(f"Best CV Score: {clf.best_score_:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Best Parameters: {clf.best_params_}")

        # Keep track of best model
        if clf.best_score_ > best_score:
            best_score = clf.best_score_
            best_model = clf
            best_model_name = model_name

    except Exception as e:
        print(f"Error training {model_name}: {str(e)}")

# Display all results
print("\n===== Model Comparison =====")
results_df = pd.DataFrame(scores)
print(results_df[['model', 'best_score', 'test_accuracy']])


Training svm...
Model: svm
Best CV Score: 1.0000
Test Accuracy: 1.0000
Best Parameters: {'classifier__C': 1, 'classifier__kernel': 'linear', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}

Training random_forest...
Model: random_forest
Best CV Score: 1.0000
Test Accuracy: 1.0000
Best Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}

Training logistic_regression...
Model: logistic_regression
Best CV Score: 1.0000
Test Accuracy: 1.0000
Best Parameters: {'classifier__C': 1, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}

===== Model Comparison =====
                 model  best_score  test_accuracy
0                  svm         1.0            1.0
1        random_forest         1.0            1.0
2  logistic_regression         1.0            1.0




In [111]:
scores

[{'model': 'svm',
  'best_score': np.float64(1.0),
  'test_accuracy': 1.0,
  'best_params': {'classifier__C': 1,
   'classifier__kernel': 'linear',
   'tfidf__max_features': 5000,
   'tfidf__ngram_range': (1, 1)}},
 {'model': 'random_forest',
  'best_score': np.float64(1.0),
  'test_accuracy': 1.0,
  'best_params': {'classifier__max_depth': None,
   'classifier__n_estimators': 100,
   'tfidf__max_features': 5000,
   'tfidf__ngram_range': (1, 1)}},
 {'model': 'logistic_regression',
  'best_score': np.float64(1.0),
  'test_accuracy': 1.0,
  'best_params': {'classifier__C': 1,
   'tfidf__max_features': 5000,
   'tfidf__ngram_range': (1, 1)}}]

In [112]:
best_model.score(X_test, y_test)

1.0

In [113]:
X_test

Unnamed: 0,processed_review
256,completely boring
352,brilliant
298,disappointment
581,weak
1287,made sense
...,...
650,stunning piece cinema
1436,typical
261,terrible plot worse acting
1304,wonderful


In [114]:
y_pred = best_model.predict(X_test)
y_pred

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,

In [115]:
import joblib
joblib.dump(pipeline, 'sentiment_model.pkl')

['sentiment_model.pkl']

In [118]:
best_model.predict(['boring'])

array([0])