In [56]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Load domain dataset
domain_data = pd.read_csv("C:/Users/dell/OneDrive/Desktop/dataset/Book2.csv")

# Load sentiment dataset
sentiment_data = pd.read_csv("cleaned_dataset.csv")

# Drop rows with missing values
domain_data.dropna(inplace=True)
sentiment_data.dropna(inplace=True)

# Define feature extraction functions
def extract_domain_features(text):
    # Example: CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    return X

def extract_sentiment_features(text):
    # Example: CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    return X

# Separate features and target variable for sentiment dataset
X_sentiment = sentiment_data['Tweets']
y_sentiment = sentiment_data['Sentiment']

# Balance the sentiment dataset
oversampler = RandomOverSampler()
X_sentiment_balanced, y_sentiment_balanced = oversampler.fit_resample(X_sentiment.values.reshape(-1, 1), y_sentiment)

# Separate features and target variable for domain dataset
X_domain = domain_data['Tweets']
y_domain = domain_data['Category']

# Convert text data to numerical features
vectorizer_sentiment = CountVectorizer()
X_sentiment_vectorized = vectorizer_sentiment.fit_transform(X_sentiment_balanced.reshape(-1))

vectorizer_domain = CountVectorizer()
X_domain_vectorized = vectorizer_domain.fit_transform(X_domain)

# Splitting sentiment data into training and testing sets
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(X_sentiment_vectorized, y_sentiment_balanced, test_size=0.2, random_state=42)

# Naive Bayes model for sentiment
nb_model = MultinomialNB()
nb_model.fit(X_train_sentiment, y_train_sentiment)
nb_predictions_sentiment = nb_model.predict(X_test_sentiment)
nb_accuracy_sentiment = accuracy_score(y_test_sentiment, nb_predictions_sentiment)
print("Naive Bayes Accuracy (Sentiment):", nb_accuracy_sentiment)

# Random Forest model for sentiment
rf_model_sentiment = RandomForestClassifier()
rf_model_sentiment.fit(X_train_sentiment, y_train_sentiment)
rf_predictions_sentiment = rf_model_sentiment.predict(X_test_sentiment)
rf_accuracy_sentiment = accuracy_score(y_test_sentiment, rf_predictions_sentiment)
print("Random Forest Accuracy (Sentiment):", rf_accuracy_sentiment)

# SVM model for sentiment
svm_model = SVC()
svm_model.fit(X_train_sentiment, y_train_sentiment)
svm_predictions_sentiment = svm_model.predict(X_test_sentiment)
svm_accuracy_sentiment = accuracy_score(y_test_sentiment, svm_predictions_sentiment)
print("SVM Accuracy (Sentiment):", svm_accuracy_sentiment)

# Splitting domain data into training and testing sets
X_train_domain, X_test_domain, y_train_domain, y_test_domain = train_test_split(X_domain_vectorized, y_domain, test_size=0.2, random_state=42)

# Random Forest model for domain
rf_model_domain = RandomForestClassifier()
rf_model_domain.fit(X_train_domain, y_train_domain)
rf_predictions_domain = rf_model_domain.predict(X_test_domain)
rf_accuracy_domain = accuracy_score(y_test_domain, rf_predictions_domain)
print("Random Forest Accuracy (Domain):", rf_accuracy_domain)

# Naive Bayes model for domain
nb_model_domain = MultinomialNB()
nb_model_domain.fit(X_train_domain, y_train_domain)
nb_predictions_domain = nb_model_domain.predict(X_test_domain)
nb_accuracy_domain = accuracy_score(y_test_domain, nb_predictions_domain)
print("Naive Bayes Accuracy (Domain):", nb_accuracy_domain)

# SVM model for domain
svm_model_domain = SVC()
svm_model_domain.fit(X_train_domain, y_train_domain)
svm_predictions_domain = svm_model_domain.predict(X_test_domain)
svm_accuracy_domain = accuracy_score(y_test_domain, svm_predictions_domain)
print("SVM Accuracy (Domain):", svm_accuracy_domain)



Naive Bayes Accuracy (Sentiment): 0.8068356593563087
Random Forest Accuracy (Sentiment): 0.9648533181429793
SVM Accuracy (Sentiment): 0.9201936770150955
Random Forest Accuracy (Domain): 0.9447236180904522
Naive Bayes Accuracy (Domain): 0.9095477386934674
SVM Accuracy (Domain): 0.9447236180904522


In [57]:
def ensemble_predict(text):
    # Extract sentiment features from the text using the pre-fitted vectorizer
    sentiment_features = vectorizer_sentiment.transform([text])
    
    # Predict sentiment using the ensemble of models
    nb_sentiment = tuple(nb_model.predict(sentiment_features))
    rf_sentiment = tuple(rf_model_sentiment.predict(sentiment_features))
    svm_sentiment = tuple(svm_model.predict(sentiment_features))

    # Ensemble for sentiment prediction by taking majority vote
    sentiment_prediction = max(set([nb_sentiment, rf_sentiment, svm_sentiment]), key = list([nb_sentiment, rf_sentiment, svm_sentiment]).count)
    
    # Extract domain features from the text using the pre-fitted vectorizer
    domain_features = vectorizer_domain.transform([text])
    
    # Predict domain using the ensemble of models
    rf_domain = tuple(rf_model_domain.predict(domain_features))
    nb_domain = tuple(nb_model_domain.predict(domain_features))
    svm_domain = tuple(svm_model_domain.predict(domain_features))

    # Ensemble for domain prediction by taking majority vote
    domain_prediction = max(set([rf_domain, nb_domain, svm_domain]), key = list([rf_domain, nb_domain, svm_domain]).count)

    return domain_prediction, sentiment_prediction


In [54]:
# Get number of inputs from user
n = int(input("Enter the number of texts: "))

# Get inputs from user and make predictions
for i in range(n):
    user_text = input("Enter text {}: ".format(i+1))
    
    # Make predictions using ensemble model
    domain_pred, sentiment_pred = ensemble_predict(user_text)
    
    # Print the given tweet, predicted sentiment, and predicted domain
    print("Predicted Sentiment:", sentiment_pred)
    print("Predicted Domain:", domain_pred)
    print()  # Print a blank line for better readability between inputs


Enter the number of texts: 3
Enter text 1: This film is one of the best film I have seen in my life.
Predicted Sentiment: ('Positive',)
Predicted Domain: ('cinema',)

Enter text 2: The AI is the worst technology.
Predicted Sentiment: ('Negative',)
Predicted Domain: ('technology',)

Enter text 3: Tennis match is so boring.
Predicted Sentiment: ('Negative',)
Predicted Domain: ('sports',)



In [24]:
def ensemble_accuracy(y_true_domain, y_true_sentiment, domain_pred, sentiment_pred):
   
    # Calculate accuracy for domain prediction
    domain_accuracy = accuracy_score(y_true_domain, domain_pred)

    # Calculate accuracy for sentiment prediction
    sentiment_accuracy = accuracy_score(y_true_sentiment, sentiment_pred)

    return domain_accuracy, sentiment_accuracy

# True labels for domain and sentiment
y_true_domain = y_test_domain
y_true_sentiment = y_test_sentiment

# Predictions from ensemble model
domain_pred, sentiment_pred = ensemble_predict(text)

# Calculate accuracy
domain_accuracy, sentiment_accuracy = ensemble_accuracy(y_test_domain, y_test_sentiment, rf_predictions_domain, rf_predictions_sentiment)

print("Ensemble Domain Accuracy:", domain_accuracy)
print("Ensemble Sentiment Accuracy:", sentiment_accuracy)



Ensemble Domain Accuracy: 0.9396984924623115
Ensemble Sentiment Accuracy: 0.9634861862717174


In [44]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Combine predictions from all models for domain classification
ensemble_predictions_domain = [rf_predictions_domain, nb_predictions_domain, svm_predictions_domain]
# Ensemble prediction for domain by taking majority vote
ensemble_prediction_domain = [max(set(predictions), key=predictions.count) for predictions in zip(*ensemble_predictions_domain)]

# Combine predictions from all models for sentiment classification
ensemble_predictions_sentiment = [rf_predictions_sentiment, nb_predictions_sentiment, svm_predictions_sentiment]
# Ensemble prediction for sentiment by taking majority vote
ensemble_prediction_sentiment = [max(set(predictions), key=predictions.count) for predictions in zip(*ensemble_predictions_sentiment)]

# Compute accuracy for ensemble domain classification
ensemble_accuracy_domain = accuracy_score(y_test_domain, ensemble_prediction_domain)
# Compute precision, recall, and F1-score for ensemble domain classification
precision_domain = precision_score(y_test_domain, ensemble_prediction_domain, average='weighted')
recall_domain = recall_score(y_test_domain, ensemble_prediction_domain, average='weighted')
f1_domain = f1_score(y_test_domain, ensemble_prediction_domain, average='weighted')

# Compute accuracy for ensemble sentiment classification
ensemble_accuracy_sentiment = accuracy_score(y_test_sentiment, ensemble_prediction_sentiment)
# Compute precision, recall, and F1-score for ensemble sentiment classification
precision_sentiment = precision_score(y_test_sentiment, ensemble_prediction_sentiment, average='weighted')
recall_sentiment = recall_score(y_test_sentiment, ensemble_prediction_sentiment, average='weighted')
f1_sentiment = f1_score(y_test_sentiment, ensemble_prediction_sentiment, average='weighted')

# Print evaluation metrics for the ensemble
print("Ensemble Domain Classification Metrics:")
print("Accuracy:", ensemble_accuracy_domain)
print("Precision:", precision_domain)
print("Recall:", recall_domain)
print("F1-score:", f1_domain)
print("Classification Report for Domain Classification:")
print(classification_report(y_test_domain, ensemble_prediction_domain))

print("\nEnsemble Sentiment Classification Metrics:")
print("Accuracy:", ensemble_accuracy_sentiment)
print("Precision:", precision_sentiment)
print("Recall:", recall_sentiment)
print("F1-score:", f1_sentiment)
print("Classification Report for Sentiment Classification:")
print(classification_report(y_test_sentiment, ensemble_prediction_sentiment))


Ensemble Domain Classification Metrics:
Accuracy: 0.949748743718593
Precision: 0.9495877656681676
Recall: 0.949748743718593
F1-score: 0.9496329486454287
Classification Report for Domain Classification:
              precision    recall  f1-score   support

      cinema       0.94      0.93      0.93        67
      sports       0.94      0.94      0.94        63
  technology       0.97      0.99      0.98        69

    accuracy                           0.95       199
   macro avg       0.95      0.95      0.95       199
weighted avg       0.95      0.95      0.95       199


Ensemble Sentiment Classification Metrics:
Accuracy: 0.9335801765878667
Precision: 0.9347886691918005
Recall: 0.9335801765878667
F1-score: 0.9334193279561557
Classification Report for Sentiment Classification:
              precision    recall  f1-score   support

    Negative       0.92      0.96      0.94      5877
     Neutral       0.97      0.89      0.93      5756
    Positive       0.92      0.96      0.94

In [26]:
import joblib

# Saving the sentiment ensemble model
joblib.dump((nb_model, rf_model_sentiment, svm_model, vectorizer_sentiment), 'sentiment_ensemble_model.pkl')

# Saving the domain model
joblib.dump((rf_model_domain, vectorizer_domain), 'domain_model.pkl')

print("Sentiment ensemble model and domain model saved successfully.")


Sentiment ensemble model and domain model saved successfully.


In [55]:
import joblib

# Saving the sentiment ensemble model
joblib.dump((nb_model, rf_model_sentiment, svm_model, vectorizer_sentiment), 'sentiment_ensemble_model.pkl')

# Saving the domain ensemble model
joblib.dump((nb_model_domain, svm_model_domain, rf_model_domain, vectorizer_domain), 'domain_ensemble_model.pkl')

print("Sentiment ensemble model and domain ensemble model saved successfully.")


Sentiment ensemble model and domain ensemble model saved successfully.
