In [3]:
import numpy as np
import joblib
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('output/translated_cleaned.csv')
print("Dataset loaded.")

# Function to assign sentiment labels based on polarity
def assign_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

# Ensure the sentiment mapping for the classifier
sentiment_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}
df['sentiment'] = df['translated'].map(sentiment_mapping)

# Split the data into features (X) and labels (y)
X = df['translated']
y = df['sentiment'].astype(int)

# Create a pipeline with TfidfVectorizer and a classifier (we'll use MultinomialNB and SVC)
# Use GridSearchCV for hyperparameter tuning

# Define a pipeline with Multinomial Naive Bayes
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)),
    ('clf', MultinomialNB())
])

# Define parameters for hyperparameter tuning for Multinomial Naive Bayes
nb_params = {
    'tfidf__max_features': [5000, 10000, 20000],
    'clf__alpha': [0.1, 0.5, 1.0]
}

# Perform GridSearchCV for Multinomial Naive Bayes
nb_grid_search = GridSearchCV(nb_pipeline, nb_params, cv=5, scoring='accuracy')
nb_grid_search.fit(X, y)

print(f"Best parameters for Multinomial Naive Bayes: {nb_grid_search.best_params_}")

# Define a pipeline with Support Vector Machine
svc_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)),
    ('clf', SVC(probability=True, class_weight='balanced'))
])

# Define parameters for hyperparameter tuning for SVM
svc_params = {
    'tfidf__max_features': [5000, 10000, 20000],
    'clf__kernel': ['linear', 'rbf'],
    'clf__C': [0.1, 1, 10],
    'clf__gamma': ['scale', 'auto']
}

# Perform GridSearchCV for SVM
svc_grid_search = GridSearchCV(svc_pipeline, svc_params, cv=5, scoring='accuracy')
svc_grid_search.fit(X, y)

print(f"Best parameters for Support Vector Machine: {svc_grid_search.best_params_}")

# Evaluate the best models on the testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the models
nb_model = nb_grid_search.best_estimator_
svc_model = svc_grid_search.best_estimator_

# Predict and evaluate using Multinomial Naive Bayes
nb_model.fit(X_train, y_train)
nb_y_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_y_pred)
print(f"Multinomial Naive Bayes Accuracy: {nb_accuracy:.2%}")

# Predict and evaluate using SVM
svc_model.fit(X_train, y_train)
svc_y_pred = svc_model.predict(X_test)
svc_accuracy = accuracy_score(y_test, svc_y_pred)
print(f"Support Vector Machine Accuracy: {svc_accuracy:.2%}")

# Calculate confusion matrices for both models
nb_conf_matrix = confusion_matrix(y_test, nb_y_pred)
svc_conf_matrix = confusion_matrix(y_test, svc_y_pred)

# Display confusion matrices
cm_display_nb = ConfusionMatrixDisplay(confusion_matrix=nb_conf_matrix, display_labels=["positive", "neutral", "negative"])
cm_display_nb.plot()
plt.title("Confusion Matrix - Multinomial Naive Bayes")
plt.savefig("output/Confusion_Matrix_Multinomial_Naive_Bayes.png")

cm_display_svc = ConfusionMatrixDisplay(confusion_matrix=svc_conf_matrix, display_labels=["positive", "neutral", "negative"])
cm_display_svc.plot()
plt.title("Confusion Matrix - Support Vector Machine")
plt.savefig("output/Confusion_Matrix_Support_Vector_Machine.png")

# Save the best models and the TfidfVectorizer
print("Saving best models...")
joblib.dump(nb_model, 'output/models/tensorflow/best_nb_model.joblib')
joblib.dump(svc_model, 'output/models/tensorflow/best_svc_model.joblib')

print("Saving TfidfVectorizer...")
joblib.dump(nb_grid_search.best_estimator_.named_steps['tfidf'], 'output/models/tensorflow/tfidf_vectorizer_nb.pkl')
joblib.dump(svc_grid_search.best_estimator_.named_steps['tfidf'], 'output/models/tensorflow/tfidf_vectorizer_svc.pkl')

# Print classification reports
print("Classification report for Multinomial Naive Bayes:")
print(classification_report(y_test, nb_y_pred))

print("Classification report for Support Vector Machine:")
print(classification_report(y_test, svc_y_pred))

Loading dataset...
Dataset loaded.


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer