In [143]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from afinn import Afinn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from tensorflow.keras.regularizers import l2  # L2 regularization
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

In [144]:
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/faizelahmed/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/faizelahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [145]:
# Load the data
df = pd.read_csv('truthseeker_emotions.csv')

In [146]:
# Sentiment analysis
df['textblob'] = df['statement'].apply(lambda x: TextBlob(x).sentiment.polarity)
afinn = Afinn()
df['afinn'] = df['statement'].apply(lambda x: afinn.score(x))
analyzer = SentimentIntensityAnalyzer()
df['vader'] = df['statement'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

In [147]:
# Feature Engineering
df['statement_length'] = df['statement'].apply(len)
df['exclamation_count'] = df['statement'].apply(lambda x: x.count('!'))

In [148]:
# Preparing the input data
X = df['statement']
y = df['BinaryNumTarget']

In [149]:
# Train test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)  # Increased test size and stratified sampling


In [150]:
# Text vectorization
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [151]:
# Define Naive Bayes model
model1 = MultinomialNB()
model1.fit(X_train_vec, y_train)

In [152]:
# Define Passive Aggressive model with grid search and L2 regularization
param_grid = {'C': [0.01, 0.1, 1]}  # Reduced complexity
grid_search = GridSearchCV(PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, random_state=42), param_grid, cv=5)
model2 = grid_search.fit(X_train_vec, y_train)


In [153]:
# Define Logistic Regression model
model4 = LogisticRegression()
model4.fit(X_train_vec, y_train)


In [154]:
# Define Random Forest model
model5 = RandomForestClassifier()
model5.fit(X_train_vec, y_train)

In [155]:
# Preparing the sentiment and emotion scores data
X2 = df[['textblob', 'afinn', 'vader', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise', 'statement_length', 'exclamation_count']]
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state=42, stratify=y)  # Increased test size and stratified sampling


In [156]:
# Scale the data
scaler = StandardScaler()
X2_train_scaled = scaler.fit_transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

In [157]:
# Define simple deep neural network model with dropout and L2 regularization
model3 = Sequential([
    Dense(8, activation='relu', kernel_regularizer=l2(0.2), input_shape=(X2_train_scaled.shape[1],)),  # Reduced complexity and increased regularization
    Dropout(0.7),  # Increased dropout
    Dense(4, activation='relu', kernel_regularizer=l2(0.2)),  # Reduced complexity and increased regularization
    Dropout(0.7),  # Increased dropout
    Dense(1, activation='sigmoid')
])

In [158]:
# Compile DNN model
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [159]:
# Train DNN model with early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)  # Increased patience
model3.fit(X2_train_scaled, y_train, epochs=30, validation_split=0.2, callbacks=[early_stopping])  # Increased number of epochs

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


<keras.callbacks.History at 0x291d9da80>

In [160]:
# Predicting with all models
y_pred1 = model1.predict(X_test_vec)
y_pred2 = model2.predict(X_test_vec)
y_pred3 = (model3.predict(X2_test_scaled) > 0.5).astype(int)
y_pred4 = model4.predict(X_test_vec)
y_pred5 = model5.predict(X_test_vec)




In [161]:
# Ensemble prediction: majority vote
y_pred_ensemble = ((y_pred1 + y_pred2 + y_pred3.ravel() + y_pred4 + y_pred5) >= 3).astype(int)


In [162]:
# Evaluating the ensemble model
print("Accuracy: ", accuracy_score(y_test, y_pred_ensemble))
print("Precision: ", precision_score(y_test, y_pred_ensemble))
print("Recall: ", recall_score(y_test, y_pred_ensemble))
print("F1 Score: ", f1_score(y_test, y_pred_ensemble))
print("AUC-ROC: ", roc_auc_score(y_test, y_pred_ensemble))

Accuracy:  0.9996770988574267
Precision:  0.9993717378697081
Recall:  1.0
F1 Score:  0.9996857702255204
AUC-ROC:  0.999668045554364


In [163]:
# Cross-validation
cross_val_model = clone(grid_search.best_estimator_)
cv_scores = cross_val_score(cross_val_model, X_train_vec, y_train, cv=StratifiedKFold(n_splits=5))  # Stratified K-Fold
print("Cross-validated accuracy: ", cv_scores.mean())

Cross-validated accuracy:  0.9998296762517225
