In [1]:
import pandas as pd
import re
import contractions
import demoji
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, ConfusionMatrixDisplay)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources the preprocessing step relies on: tokenizer models
# ('punkt', plus 'punkt_tab', which newer NLTK releases look up when
# word_tokenize is called), the English stop-word list, and the WordNet data
# used by the lemmatizer.
nltk.download(['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'])
# NOTE(review): recent demoji releases appear to bundle the emoji codes and
# deprecate this call; it is kept so older installs still work — confirm
# against the pinned demoji version.
demoji.download_codes()

In [None]:
# Read the review dataset, then report its dimensions and label balance.
df = pd.read_csv('IMDB.csv')

dataset_shape = df.shape
sentiment_counts = df['sentiment'].value_counts()

print("Dataset Shape:", dataset_shape)
print("\nClass Distribution:\n", sentiment_counts)

In [None]:
# Bar chart of the number of rows per sentiment label.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x='sentiment', data=df, ax=ax)
ax.set_title('Sentiment Class Distribution')
plt.show()

In [None]:
# Compiled once so the patterns are not re-parsed for every review.
_URL_OR_TAG_RE = re.compile(r'http\S+|www\S+|<.*?>')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the stop-word list is read and the lemmatizer is
# built once, instead of once per review.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first call."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a raw review and return it as a space-joined string of lemmas.

    Steps, in order: strip URLs and HTML-like tags, expand contractions,
    replace emoji with their text descriptions, lowercase, drop every
    character that is not an ASCII letter or whitespace, tokenize, remove
    English stop words and tokens of length <= 2, and lemmatize what is left.

    Args:
        text: The raw review. ``None`` and NaN are treated as an empty
            review; any other non-string value is converted with ``str()``.

    Returns:
        str: The processed tokens joined by single spaces (``''`` when
        nothing survives the filtering).
    """
    # Missing values (None / float NaN) would otherwise make re.sub raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Substitute a space rather than nothing: removing a tag such as
    # "<br />" outright would glue the neighbouring words into one token
    # ("great.<br /><br />The" -> "greatthe").
    text = _URL_OR_TAG_RE.sub(' ', text)
    text = contractions.fix(text)
    text = demoji.replace_with_desc(text, sep=" ")
    text = text.lower()
    text = _NON_ALPHA_RE.sub('', text)

    stop_words, lemmatizer = _get_nlp_resources()

    processed_tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        # Stop words are checked on the surface form, before lemmatization.
        if word not in stop_words and len(word) > 2
    ]

    return ' '.join(processed_tokens)

In [None]:
# Run the cleaning pipeline over every raw review and store the result in a
# new column next to the original text.
df['cleaned_review'] = df['review'].map(preprocess_text)

In [None]:
# Hold out 20% of the rows (stratified by label), then cut that hold-out in
# half to obtain validation and test sets: 80/10/10 overall.
features = df['cleaned_review']
labels = df['sentiment']

X_train, X_temp, y_train, y_temp = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [None]:
# Unigram + bigram TF-IDF features with the vocabulary capped at 10,000 terms.
tfidf_params = {
    'max_features': 10000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
}
tfidf = TfidfVectorizer(**tfidf_params)

# The vocabulary is fitted on the training split only; the validation and
# test splits are merely transformed with it.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf.transform(split) for split in (X_val, X_test))

In [None]:
# Candidate classifiers keyed by display name. Both use
# class_weight='balanced'.
models = {
    # probability=True makes SVC fit an extra, randomized calibration step;
    # random_state pins it so the fitted (and later pickled) model's
    # predict_proba output is reproducible across runs.
    'SVM': SVC(kernel='linear', C=1.0, class_weight='balanced',
               probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000)
}

In [None]:
# Reset on every run so re-executing this cell does not append duplicates.
results = []

# Fit each candidate on the training split and score it on the validation split.
for model_name, model in models.items():
    print(f"\n{'='*40}\nTraining {model_name}...\n{'='*40}")

    model.fit(X_train_tfidf, y_train)

    y_val_pred = model.predict(X_val_tfidf)

    # Accuracy is computed once and reused for both the stored record and
    # the printed summary.
    val_accuracy = accuracy_score(y_val, y_val_pred)

    results.append({
        'Model': model_name,
        'Validation Accuracy': val_accuracy,
        # Dict form of the report, kept for building comparison tables.
        'Classification Report': classification_report(y_val, y_val_pred, output_dict=True)
    })

    print(f"\n{model_name} Validation Results:")
    print(f"Accuracy: {val_accuracy:.4f}")
    print(classification_report(y_val, y_val_pred))

    # Passing labels=model.classes_ fixes the row/column order of the matrix
    # to the same order used for display_labels, even if a class happens to
    # be absent from the validation labels or predictions.
    cm = confusion_matrix(y_val, y_val_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=model.classes_)
    disp.plot(cmap='Blues')
    plt.title(f'{model_name} Validation Confusion Matrix')
    plt.show()



In [None]:
# One row per trained model: name, validation accuracy, and the report dict.
comparison_df = pd.DataFrame.from_records(results)

In [None]:
# Flatten each model's validation report into a single row of headline
# numbers: overall accuracy plus precision/recall/F1 for each sentiment class.
metrics = []
for entry in results:
    per_class = entry['Classification Report']
    row = {'Model': entry['Model'], 'Accuracy': entry['Validation Accuracy']}
    for label, title in (('positive', 'Positive'), ('negative', 'Negative')):
        scores = per_class[label]
        row[f'Precision ({title})'] = scores['precision']
        row[f'Recall ({title})'] = scores['recall']
        row[f'F1 ({title})'] = scores['f1-score']
    metrics.append(row)

metric_df = pd.DataFrame(metrics).set_index('Model')

# Transposed so that models are columns and metrics are rows.
print("\nModel Comparison Metrics:")
print(metric_df.transpose())

In [None]:
# Create the axes explicitly and hand them to pandas. DataFrame.plot opens
# its own figure when no `ax` is given, which would leave a blank 10x6
# figure behind and draw the bars at the default size instead.
fig, ax = plt.subplots(figsize=(10, 6))
metric_df[['Accuracy']].plot(kind='bar', rot=0, ax=ax)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Score')
# Zoom the y-axis to the 0.7-1.0 range so differences between models are visible.
ax.set_ylim(0.7, 1.0)
plt.show()


In [None]:
# Choose the candidate with the highest validation accuracy; on a tie, max()
# keeps the first one encountered.
best_result = max(results, key=lambda entry: entry['Validation Accuracy'])
best_model_name = best_result['Model']
best_model = models[best_model_name]

banner = '=' * 40
print(f"\n{banner}\nEvaluating Best Model ({best_model_name}) on Test Set\n{banner}")


In [None]:
# Score the selected model on the held-out test split.
y_test_pred = best_model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(test_report)

In [None]:
# Persist the fitted vectorizer and the selected classifier; both are needed
# to score new text later.
vectorizer_path = 'tfidf_vectorizer.pkl'
model_path = 'best_model.pkl'

joblib.dump(tfidf, vectorizer_path)
joblib.dump(best_model, model_path)

In [None]:
def predict_sentiment(text, model=None):
    """Predict the sentiment label for a single piece of raw text.

    The text is cleaned with ``preprocess_text``, vectorized with the fitted
    ``tfidf`` vectorizer, and passed to the classifier.

    Args:
        text: Raw, uncleaned review text.
        model: Fitted classifier exposing ``predict``. When omitted, the
            current value of the global ``best_model`` is used.

    Returns:
        The predicted label for ``text`` (the first element of the
        classifier's ``predict`` output).
    """
    # Resolve the default at call time. A `model=best_model` default is
    # evaluated once, when the function is defined, so it would keep pointing
    # at a stale object after `best_model` is reassigned in another cell.
    if model is None:
        model = best_model

    cleaned_text = preprocess_text(text)
    # transform expects an iterable of documents, hence the one-element list.
    vectorized_text = tfidf.transform([cleaned_text])
    return model.predict(vectorized_text)[0]

In [None]:
# Hand-written examples used as a quick smoke test of the prediction helper.
test_reviews = [
    "This movie was an absolute masterpiece! The acting was superb.",
    "Terrible experience from start to finish. Waste of money.",
    "The product works okay, but nothing special for the price."
]

separator = "-" * 60

print("\nSample Predictions:")
# map() is lazy, so each review is still scored right before it is printed.
for review, prediction in zip(test_reviews, map(predict_sentiment, test_reviews)):
    print(f"\nReview: {review}\nPredicted Sentiment: {prediction}")
    print(separator)