In [None]:
# Load the cleaned IMDB reviews, hold out a test split, and integer-encode the labels.
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.preprocessing import load_and_clean_data

df = load_and_clean_data("data/raw/IMDB Dataset.csv")

# Features are the review texts; targets are the sentiment labels.
reviews = df['review']
sentiments = df['sentiment']

# 80/20 hold-out split. Stratifying on the label keeps the class balance the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
    stratify=sentiments,
)

# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then reused, so both splits share one label -> integer mapping.
# `le.classes_` is used later to name the rows of the classification reports.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# SVM pipeline: TF-IDF over unigrams and bigrams (vocabulary capped at 10,000
# terms) feeding a linear SVM with default hyperparameters.
# The step names 'tfidf' and 'clf' are looked up again when the saved model is
# inspected, so they must not change.
svm_steps = [
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LinearSVC()),
]
svm_pipeline = Pipeline(steps=svm_steps)

# Fit on the training split only, then score the held-out reviews.
svm_preds = svm_pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1, with rows labelled by the original
# (pre-encoding) class names.
svm_report = classification_report(y_test, svm_preds, target_names=le.classes_)
print("SVM Performance:", svm_report, sep="\n")




SVM Performance:
              precision    recall  f1-score   support

    negative       0.90      0.89      0.90      5000
    positive       0.89      0.90      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [3]:
# Random Forest pipeline: unigram TF-IDF (vocabulary capped at 10,000 terms)
# feeding a 100-tree forest. The seed fixes the forest's randomness so the
# reported scores are repeatable.
rf_steps = [
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
]
rf_pipeline = Pipeline(steps=rf_steps)

# Fit on the training split only, then score the held-out reviews.
rf_preds = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Same report layout as the SVM cell so the two models can be compared directly.
rf_report = classification_report(y_test, rf_preds, target_names=le.classes_)
print("Random Forest Performance:", rf_report, sep="\n")


Random Forest Performance:
              precision    recall  f1-score   support

    negative       0.84      0.86      0.85      5000
    positive       0.85      0.84      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [10]:
import os

import joblib

# Persist both fitted pipelines (vectorizer + classifier together) so they can
# be reloaded later without retraining.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True keeps this cell safe to re-run.
os.makedirs('model/ML', exist_ok=True)

joblib.dump(svm_pipeline, 'model/ML/svm_model.joblib')
joblib.dump(rf_pipeline, 'model/ML/rf_model.joblib')


['model/ML/rf_model.joblib']

In [18]:
import joblib

# Reload the persisted SVM pipeline from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
svm_model = joblib.load('model/ML/svm_model.joblib')

# 1. The loaded object's type (the whole pipeline was saved, not just the classifier).
print(f"Model Type: {type(svm_model)}")

# 2. Every hyperparameter of the pipeline and of its steps.
print(f"Model Parameters: {svm_model.get_params()}")

# 3. Pull the classifier out of the pipeline by its step name and report its
#    learned attributes: the per-feature weights (coef_) and the bias term
#    (intercept_). An attribute that is not present is skipped.
svm_classifier = svm_model.named_steps['clf']

for label, attr_name in (
    ("Model Coefficients", "coef_"),
    ("Model Intercept", "intercept_"),
):
    if hasattr(svm_classifier, attr_name):
        print(f"{label}: {getattr(svm_classifier, attr_name)}")


Model Type: <class 'sklearn.pipeline.Pipeline'>
Model Parameters: {'memory': None, 'steps': [('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))), ('clf', LinearSVC())], 'verbose': False, 'tfidf': TfidfVectorizer(max_features=10000, ngram_range=(1, 2)), 'clf': LinearSVC(), 'tfidf__analyzer': 'word', 'tfidf__binary': False, 'tfidf__decode_error': 'strict', 'tfidf__dtype': <class 'numpy.float64'>, 'tfidf__encoding': 'utf-8', 'tfidf__input': 'content', 'tfidf__lowercase': True, 'tfidf__max_df': 1.0, 'tfidf__max_features': 10000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__preprocessor': None, 'tfidf__smooth_idf': True, 'tfidf__stop_words': None, 'tfidf__strip_accents': None, 'tfidf__sublinear_tf': False, 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__tokenizer': None, 'tfidf__use_idf': True, 'tfidf__vocabulary': None, 'clf__C': 1.0, 'clf__class_weight': None, 'clf__dual': 'warn', 'clf__fit_intercept': True, 'clf__intercept_scaling': 1

In [19]:
import joblib

# Reload the persisted Random Forest pipeline from disk. This rebinds
# `rf_pipeline` to the copy read back from the file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
rf_pipeline = joblib.load('model/ML/rf_model.joblib')

# 1. The loaded object's type (the whole pipeline was saved, not just the classifier).
print(f"Model Type: {type(rf_pipeline)}")

# 2. Every hyperparameter of the pipeline and of its steps.
print(f"Model Parameters: {rf_pipeline.get_params()}")

# 3. Pull the classifier out of the pipeline by its step name ('clf') and
#    report, in order: the per-feature importances, the number of trees, and
#    the out-of-bag score. An attribute that is not present is skipped; the
#    out-of-bag score only exists when the forest was trained with
#    oob_score=True.
rf_classifier = rf_pipeline.named_steps['clf']

for label, attr_name in (
    ("Feature Importances", "feature_importances_"),
    ("Number of Estimators (Trees)", "n_estimators"),
    ("Out-of-Bag Score", "oob_score_"),
):
    if hasattr(rf_classifier, attr_name):
        print(f"{label}: {getattr(rf_classifier, attr_name)}")


Model Type: <class 'sklearn.pipeline.Pipeline'>
Model Parameters: {'memory': None, 'steps': [('tfidf', TfidfVectorizer(max_features=10000)), ('clf', RandomForestClassifier(random_state=42))], 'verbose': False, 'tfidf': TfidfVectorizer(max_features=10000), 'clf': RandomForestClassifier(random_state=42), 'tfidf__analyzer': 'word', 'tfidf__binary': False, 'tfidf__decode_error': 'strict', 'tfidf__dtype': <class 'numpy.float64'>, 'tfidf__encoding': 'utf-8', 'tfidf__input': 'content', 'tfidf__lowercase': True, 'tfidf__max_df': 1.0, 'tfidf__max_features': 10000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2', 'tfidf__preprocessor': None, 'tfidf__smooth_idf': True, 'tfidf__stop_words': None, 'tfidf__strip_accents': None, 'tfidf__sublinear_tf': False, 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__tokenizer': None, 'tfidf__use_idf': True, 'tfidf__vocabulary': None, 'clf__bootstrap': True, 'clf__ccp_alpha': 0.0, 'clf__class_weight': None, 'clf__criterion': 'gini', 'c