In [2]:
import os

import joblib as jb
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from tools.data_preprocess import load_data
from tools.data_preprocess import ClassicPreprocessorSpacy

In [14]:
# Best pipelines: MLflow artifact URIs of the form runs:/<run_id>/<artifact_path>.
# The first five are loaded with mlflow.sklearn.load_model in predict_classical();
# BERT is loaded with mlflow.transformers.load_model in predict_bert().
RANDOM_FOREST = 'runs:/1b78da0367b94391b558810ba17b1660/best_pipeline'
NAIVE_BAYES = 'runs:/271da4ca0b3d4e35841b59cf2b77f015/best_pipeline'
SVM = 'runs:/07c752ee8102464fa82b73571383925c/best_pipeline'
LOG_REG = 'runs:/7e96a95d34094e22af7dad9afe84fc21/best_pipeline'
CATBOOST = 'runs:/32ba64e14eb94c278236aca7642a4e75/best_model'
BERT = 'runs:/b2f1f4fb5fdf4076a7d0c5a1a97d156b/bert_model'

def save_model(name):
    """Write the model-name -> MLflow-URI lookup table to a CSV file and print it.

    Parameters
    ----------
    name : str
        File name (not a full path) of the CSV; the file is written to
        ``data/test_datasets/<name>`` without the index column.
    """
    # Insertion order of this mapping is the row order of the saved table.
    uri_by_model = {
        "Random Forest": RANDOM_FOREST,
        "Naive Bayes": NAIVE_BAYES,
        "SVM": SVM,
        "Logistic Regression": LOG_REG,
        "CatBoost": CATBOOST,
        "BERT": BERT,
    }
    models = pd.DataFrame(
        {"Model": list(uri_by_model.keys()), "Pipeline": list(uri_by_model.values())}
    )
    models.to_csv(f"data/test_datasets/{name}", index=False)
    print(models)


                 Model                                           Pipeline
0        Random Forest  runs:/1b78da0367b94391b558810ba17b1660/best_pi...
1          Naive Bayes  runs:/271da4ca0b3d4e35841b59cf2b77f015/best_pi...
2                  SVM  runs:/07c752ee8102464fa82b73571383925c/best_pi...
3  Logistic Regression  runs:/7e96a95d34094e22af7dad9afe84fc21/best_pi...
4             CatBoost  runs:/32ba64e14eb94c278236aca7642a4e75/best_model
5                 BERT  runs:/b2f1f4fb5fdf4076a7d0c5a1a97d156b/bert_model


In [3]:
# Use new test data to evaluate models with reviews from a different source
# (Rotten Tomatoes critic reviews stored as a local CSV).
DATA_PATH_ROTTEN = "data/rotten_tomatoes_movie_reviews.csv"

# Load data into the module-level `data` frame that the following cells narrow down step by step
data = pd.read_csv(DATA_PATH_ROTTEN)

# Explore data: row count, column names and the first rows
print(len(data))
print(data.columns)
print(data.head())


1444963
Index(['id', 'reviewId', 'creationDate', 'criticName', 'isTopCritic',
       'originalScore', 'reviewState', 'publicatioName', 'reviewText',
       'scoreSentiment', 'reviewUrl'],
      dtype='object')
                                  id  reviewId creationDate       criticName  \
0                            beavers   1145982   2003-05-23  Ivan M. Lincoln   
1                         blood_mask   1636744   2007-06-02    The Foywonder   
2  city_hunter_shinjuku_private_eyes   2590987   2019-05-28     Reuben Baron   
3  city_hunter_shinjuku_private_eyes   2558908   2019-02-14      Matt Schley   
4                 dangerous_men_2015   2504681   2018-08-29        Pat Padua   

   isTopCritic originalScore reviewState                 publicatioName  \
0        False         3.5/4       fresh  Deseret News (Salt Lake City)   
1        False           1/5      rotten                  Dread Central   
2        False           NaN       fresh                            CBR   
3        

In [4]:
# Only keep the id, review and the sentiment.
# NOTE: this rebinds `data`, so the other raw columns are gone until the CSV is read again.
data = data[["id", "reviewText", "scoreSentiment"]]

In [5]:
# Check for empty values (missing-value count per column)
print(data.isnull().sum())
# Remove empty values: drops every row with a missing value in any of the three kept columns
data = data.dropna()

id                    0
reviewText        69225
scoreSentiment        0
dtype: int64


In [6]:
# Check for number of positive and negative reviews
print(data["scoreSentiment"].value_counts())

# Unbalanced dataset -> need to balance it for better evaluation.
# As we have enough data, we can downsample: sampler() (defined in a later cell)
# draws the same number of rows from each class.

scoreSentiment
POSITIVE    922510
NEGATIVE    453228
Name: count, dtype: int64


In [7]:
# Apply preprocessing to the data to create column with tokens and turn sentiment into binary
def preprocess():
    """Add the model-ready columns to the module-level ``data`` frame and return it.

    Works on the global ``data`` in place:

    * ``preprocessed`` - the fitted preprocessor's output for ``reviewText``,
      joined into one space-separated string per review;
    * ``sentiment`` - 1 where ``scoreSentiment`` is ``"POSITIVE"``, otherwise 0
      (``scoreSentiment`` itself is dropped).

    Rows whose review is blank after preprocessing are removed. Prints the
    per-column missing-value counts (before removal) and the head of the result.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) module-level ``data`` object.
    """
    # Load preprocessor
    # NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
    preprocessor = jb.load("vectorizer/preprocessor.pkl")

    # Create preprocessed data as additional column named "preprocessed" for classical models
    data["preprocessed"] = preprocessor.transform(data[["reviewText"]])
    # Turn each token sequence into a single space-separated string
    data["preprocessed"] = data["preprocessed"].apply(lambda x: " ".join(x)).astype(str)
    # A review with no tokens left becomes "" here, which dropna() does not treat
    # as missing. Such rows would only turn into NaN after a CSV round trip
    # (presumably why predict_classical() drops NaNs again after reading the file).
    # Mark blank strings as missing now so they are removed below.
    data["preprocessed"] = data["preprocessed"].where(data["preprocessed"].str.strip() != "")

    # Turn sentiment into binary
    data["sentiment"] = data["scoreSentiment"].apply(lambda x: 1 if x == "POSITIVE" else 0)
    data.drop(columns=["scoreSentiment"], inplace=True)

    # Show how many rows are missing a value per column, then remove those rows
    print(data.isnull().sum())
    data.dropna(inplace=True)

    print(data.head())
    return data

# Apply preprocessing (only once) to create data for classical models
#data = preprocess()

                                  id  \
0                            beavers   
1                         blood_mask   
2  city_hunter_shinjuku_private_eyes   
3  city_hunter_shinjuku_private_eyes   
4                 dangerous_men_2015   

                                          reviewText  \
0  Timed to be just long enough for most youngste...   
1  It doesn't matter if a movie costs 300 million...   
2  The choreography is so precise and lifelike at...   
3  The film's out-of-touch attempts at humor may ...   
4  Its clumsy determination is endearing and some...   

                                        preprocessed  sentiment  
0  time long youngster brief attention span pack ...          1  
1  matter movie cost million dollar good good bad...          0  
2  choreography precise lifelike point wonder mov...          1  
3  film touch attempt humor find hunt reason fran...          0  
4       clumsy determination endear wildly entertain          1  


In [8]:

# Save preprocessed data: the complete preprocessed frame, written before any class
# balancing (sampler() writes its balanced subset to a separate file without the "_all" suffix)
data.to_csv("data/test_datasets/rotten_tomatoes_movie_reviews_preprocessed_all.csv", index=False)

id              0
reviewText      0
preprocessed    0
sentiment       0
dtype: int64
id              object
reviewText      object
preprocessed    object
sentiment        int64
dtype: object


In [10]:
# Sample data to balance the classes and reduce complexity
def sampler(data, n_samples, random_state=42,
            output_path="data/test_datasets/rotten_tomatoes_movie_reviews_preprocessed.csv"):
    """Build a class-balanced subset with ``n_samples`` rows per sentiment class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``sentiment`` column holding 1 (positive) and 0 (negative).
    n_samples : int
        Number of rows drawn from each class, without replacement.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for both classes.
    output_path : str or None
        Where the balanced subset is written as CSV (index omitted). Missing
        parent directories are created. ``None`` skips writing.

    Returns
    -------
    pandas.DataFrame
        The sampled positives followed by the sampled negatives, with their
        original index labels. The class distribution is also printed.

    Raises
    ------
    ValueError
        If either class has fewer than ``n_samples`` rows.
    """
    # Positives first, then negatives - this is also the row order of the result.
    rows_by_class = {label: data[data["sentiment"] == label] for label in (1, 0)}

    # Fail early with a message that names the class that is too small.
    for label, rows in rows_by_class.items():
        if len(rows) < n_samples:
            raise ValueError(
                f"Cannot draw {n_samples} rows from sentiment class {label}: "
                f"only {len(rows)} available"
            )

    balanced = pd.concat(
        [rows.sample(n=n_samples, random_state=random_state) for rows in rows_by_class.values()]
    )

    if output_path is not None:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        balanced.to_csv(output_path, index=False)

    # Check the new distribution
    print(balanced["sentiment"].value_counts())
    return balanced

# Sample the data: 200000 reviews per class were chosen to reduce the computational cost a little
# data = sampler(data, 200000)


sentiment
1    200000
0    200000
Name: count, dtype: int64


In [12]:
# Predict with models
def predict_classical():
    """Score the balanced review set with every classical model and store the result.

    Loads the five models behind the MLflow URIs, reads
    ``data/test_datasets/rotten_tomatoes_movie_reviews_preprocessed.csv``, drops
    rows with missing values, adds one ``<prefix>_pred`` column per model
    (computed from the ``preprocessed`` column) and writes everything to
    ``data/test_datasets/rotten_predictions.csv``.
    """
    # Column prefix -> MLflow URI; insertion order fixes both load order and column order.
    uri_by_prefix = {
        "rf": RANDOM_FOREST,
        "nb": NAIVE_BAYES,
        "svm": SVM,
        "log_reg": LOG_REG,
        "catboost": CATBOOST,
    }
    # All models are loaded up front, before the data is read.
    loaded_models = {
        prefix: mlflow.sklearn.load_model(uri) for prefix, uri in uri_by_prefix.items()
    }

    reviews = pd.read_csv("data/test_datasets/rotten_tomatoes_movie_reviews_preprocessed.csv")
    reviews.dropna(inplace=True)

    for prefix, model in loaded_models.items():
        reviews[f"{prefix}_pred"] = model.predict(reviews["preprocessed"])

    reviews.to_csv("data/test_datasets/rotten_predictions.csv", index=False)

# Apply prediction (only once).
# predict_classical() stores its predictions in a CSV and never adds "rf_pred" to the
# module-level `data`, so checking data.columns was always true and inference re-ran on
# every execution of this cell. Check for the output file instead.
if not os.path.exists("data/test_datasets/rotten_predictions.csv"):
    predict_classical()

In [7]:
# Add BERT scores separately from the classical models, as BERT inference is more complex
import transformers
from transformers import pipeline


def predict_bert():
    """Append BERT predictions to the classical-model predictions and save the result.

    Reads ``data/test_datasets/rotten_predictions.csv``, classifies the raw
    ``reviewText`` of every row with the BERT model behind the ``BERT`` MLflow URI,
    stores the outcome as ``bert_pred`` (1 for label ``"LABEL_1"``, otherwise 0),
    writes ``data/test_datasets/rotten_predictions_final.csv`` and prints the head.
    """
    reviews = pd.read_csv("data/test_datasets/rotten_predictions.csv")

    # Build a sentiment-analysis pipeline from the loaded object's model and tokenizer
    loaded = mlflow.transformers.load_model(BERT)
    bert_classifier = pipeline('sentiment-analysis', model=loaded.model, tokenizer=loaded.tokenizer)

    # One output dict per review; only its "label" entry is used
    raw_outputs = bert_classifier(reviews["reviewText"].tolist())
    reviews["bert_pred"] = [int(item["label"] == "LABEL_1") for item in raw_outputs]

    reviews.to_csv("data/test_datasets/rotten_predictions_final.csv", index=False)
    print(reviews.head())

# Run BERT inference and write rotten_predictions_final.csv.
# NOTE(review): unlike the classical step, this call has no run-once guard, so BERT
# inference is repeated each time this cell is executed.
predict_bert()



2025/01/30 13:37:21 INFO mlflow.transformers: 'runs:/b2f1f4fb5fdf4076a7d0c5a1a97d156b/bert_model' resolved as 'file:///C:/python/SpamDetection/mlruns/348789547859170955/b2f1f4fb5fdf4076a7d0c5a1a97d156b/artifacts/bert_model'
Device set to use cuda:0
Device set to use cuda:0


                                   id  \
0                       the_salvation   
1                             belfast   
2   standing_in_the_shadows_of_motown   
3  1133712-reno_rebel_without_a_pause   
4                   my_darling_vivian   

                                          reviewText  \
0  A film reminiscent of a song that sounds famil...   
1      Works gorgeously as an idealised memory play.   
2  The brothers missed out on glory back in the d...   
3  Even though the film runs a brief 71 minutes -...   
4  A purely informational bio-doc that nonetheles...   

                                        preprocessed  sentiment  rf_pred  \
0  film reminiscent song sound familiar offer ple...          1        1   
1              work gorgeously idealised memory play          1        1   
2  brother miss glory day help think cinematic la...          1        1   
3  film run brief minute finish footage reno talk...          1        1   
4  purely informational bio doc none

In [1]:

# Evaluate models and create a table with the results including true positive, false positive, true negative, false negative
def evaluate_metrics(path="data/test_datasets/rotten_predictions_final.csv"):
    """Compute F1, accuracy, precision and recall for every model's predictions.

    Parameters
    ----------
    path : str
        CSV containing the true ``sentiment`` column and one ``<prefix>_pred``
        column for each of rf, nb, svm, log_reg, catboost and bert.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model``, ``F1``, ``Accuracy``,
        ``Precision`` and ``Recall``.
    """
    predictions = pd.read_csv(path)
    truth = predictions["sentiment"]

    # Order matches the metric columns of the returned table.
    scorers = (f1_score, accuracy_score, precision_score, recall_score)
    rows = [
        [prefix] + [scorer(truth, predictions[prefix + "_pred"]) for scorer in scorers]
        for prefix in ("rf", "nb", "svm", "log_reg", "catboost", "bert")
    ]

    # The table is only returned; saving it to CSV is left to the caller.
    return pd.DataFrame(rows, columns=["Model", "F1", "Accuracy", "Precision", "Recall"])

def evaluate_labels(path="data/test_datasets/rotten_predictions_final.csv"):
    """Count true/false positives and negatives for every model's predictions.

    Parameters
    ----------
    path : str
        CSV containing the true ``sentiment`` column and one ``<prefix>_pred``
        column for each of rf, nb, svm, log_reg, catboost and bert.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model``, ``TP``, ``FP``, ``TN``, ``FN``.
    """
    frame = pd.read_csv(path)
    actual = frame["sentiment"]

    # (column name, true label, predicted label) for each confusion-matrix cell
    cells = (("TP", 1, 1), ("FP", 0, 1), ("TN", 0, 0), ("FN", 1, 0))

    rows = []
    for prefix in ("rf", "nb", "svm", "log_reg", "catboost", "bert"):
        guessed = frame[prefix + "_pred"]
        counts = [
            np.sum((actual == true_label) & (guessed == pred_label))
            for _, true_label, pred_label in cells
        ]
        rows.append([prefix, *counts])

    # The table is only returned; saving it to CSV is left to the caller.
    return pd.DataFrame(rows, columns=["Model"] + [name for name, _, _ in cells])


# Apply evaluation: both helpers read data/test_datasets/rotten_predictions_final.csv by default.
# NOTE(review): this cell relies on the import cell (pd, np, sklearn metrics) having been
# run first; executed on a fresh kernel it fails with "NameError: name 'pd' is not defined".
results = evaluate_metrics()
print(results)
conf_mat = evaluate_labels()
print(conf_mat)

NameError: name 'pd' is not defined