In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack

In [52]:
# Load the dataset; the relative path is resolved against the kernel's
# current working directory.
DATA_CSV = "truthseeker_sentiments.csv"
df = pd.read_csv(DATA_CSV)

In [53]:
# Vectorize the free-text column with TF-IDF.
# English stop words are removed and terms that occur in more than 70% of the
# documents are ignored (max_df=0.7). CountVectorizer could be swapped in here.
# NOTE(review): the vectorizer is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook, so document statistics of
# the eventual test rows influence the features.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
text_column = df['text']
X_text_features = vectorizer.fit_transform(text_column)

In [54]:
# Everything except the target column and the raw text is kept as a feature
# and exported as a NumPy array, ready to be stacked next to the text features.
# NOTE(review): assumes all remaining columns are numeric - confirm against
# the CSV schema.
numerical_features = df.drop(columns=['BinaryNumTarget', 'text'])
X_numerical = numerical_features.to_numpy()

In [55]:
# Place the numerical columns to the left of the TF-IDF columns in a single
# sparse matrix. CSR is requested explicitly: hstack's default result is COO,
# which cannot be row-sliced or indexed (e.g. X_processed[:5] raises).
X_processed = hstack([X_numerical, X_text_features], format='csr')

In [56]:
# Rescale every column of the combined matrix (numerical and TF-IDF alike)
# to unit variance. Centering is switched off (with_mean=False) because
# StandardScaler cannot center sparse input.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-row statistics contribute to the scaling factors.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit(X_processed).transform(X_processed)

In [57]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    df['BinaryNumTarget'],
    test_size=0.25,
    random_state=42,
)

In [39]:
# Instantiate the Passive-Aggressive base classifier (it is fitted in a later
# cell). The estimator shuffles the training data between epochs, so a fixed
# seed is supplied to make its result reproducible, matching the seed already
# used for the train/test split and the SGD model.
pa_classifier = PassiveAggressiveClassifier(random_state=42)

In [68]:
# Logistic-regression base model trained with SGD. Despite the `nb_` prefix
# (kept because later cells use this name) it is not a Naive Bayes estimator.
#
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
# version 1.3 removed the old spelling, so the name accepted by the installed
# version is selected here instead of hard-coding one of them.
#
# max_iter only affects fit(); this notebook trains the model exclusively
# through partial_fit(), where each call performs a single pass over its batch.
import sklearn

_sk_major, _sk_minor = (int(part) for part in sklearn.__version__.split('.')[:2])
_logistic_loss = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'
nb_classifier = SGDClassifier(loss=_logistic_loss, alpha=0.0001, max_iter=1000, random_state=42)

In [69]:
# Mini-batch settings for the incremental SGD training loop.
# batch_size is the number of rows handed to each partial_fit call; tune it
# to the memory that is available.
batch_size = 1_000
n_samples = X_train.shape[0]  # total number of training rows

In [70]:
# Incrementally train the SGD model, one mini-batch at a time, in a single
# pass over the training rows.

# The complete class list must be known by the first partial_fit call; it does
# not change between batches, so it is computed once instead of per iteration.
label_set = np.unique(y_train)

for start in range(0, n_samples, batch_size):
    stop = start + batch_size
    # partial_fit accepts SciPy sparse input directly, so the batch is passed
    # as-is rather than being expanded into a large dense array.
    X_batch = X_train[start:stop]
    # y_train still carries the shuffled row labels produced by
    # train_test_split; .iloc states explicitly that the slice is positional
    # and therefore lines up with the rows taken from X_train.
    y_batch = y_train.iloc[start:stop]
    nb_classifier.partial_fit(X_batch, y_batch, classes=label_set)

In [49]:
# Fit the Passive-Aggressive base classifier on the whole training split in
# one call (the sparse matrix is passed directly).
pa_classifier.fit(X=X_train, y=y_train)

PassiveAggressiveClassifier()

In [62]:
# Define the feed-forward network: two ReLU hidden layers (128 and 64 units)
# followed by one sigmoid unit that outputs a score in [0, 1].
# The input width equals the number of columns of the training matrix.
n_features = X_train.shape[1]
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(n_features,)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [63]:
# Configure training: Adam optimizer with its default settings, binary
# cross-entropy as the loss, and accuracy reported as an extra metric.
nn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [64]:
# Settings for the manual mini-batch training of the neural network.
num_epochs = 10                  # full passes over the training rows
batch_size_nn = 32               # rows per train_on_batch call; tune to available memory
n_samples_nn = X_train.shape[0]  # total number of training rows

In [65]:
# Manual training loop for the network. Each epoch walks over the training
# rows in the same fixed order (no reshuffling), one mini-batch at a time.
for epoch in range(num_epochs):
    for batch_start in range(0, n_samples_nn, batch_size_nn):
        batch_rows = slice(batch_start, batch_start + batch_size_nn)
        # Only the current mini-batch is converted to a dense array before
        # being handed to Keras; the full matrix stays sparse.
        X_batch_nn = X_train[batch_rows].toarray()
        y_batch_nn = y_train[batch_rows]
        nn_model.train_on_batch(X_batch_nn, y_batch_nn)

In [66]:
# Disabled on purpose. The traceback stored with this cell reports
# "TypeError: 'SparseTensor' object is not subscriptable" raised from Keras'
# array-slicing code - presumably the result of calling fit() directly on the
# SciPy sparse X_train. The network is trained with the manual loop that
# densifies one mini-batch at a time instead.
#nn_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)



InvalidArgumentError: Graph execution error:

TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "C:\Users\natas\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 269, in __call__
    return func(device, token, args)

  File "C:\Users\natas\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 147, in __call__
    outputs = self._call(device, args)

  File "C:\Users\natas\anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 154, in _call
    ret = self._func(*args)

  File "C:\Users\natas\anaconda3\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\natas\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 511, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "C:\Users\natas\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 511, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "C:\Users\natas\anaconda3\lib\site-packages\keras\engine\data_adapter.py", line 507, in slice_array
    return training_utils.slice_arrays(

  File "C:\Users\natas\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in slice_arrays
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

  File "C:\Users\natas\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in <listcomp>
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

  File "C:\Users\natas\anaconda3\lib\site-packages\keras\engine\training_utils.py", line 47, in <listcomp>
    entries = [[x[i : i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_33907]

In [71]:
# Collect one score per base model for every training row; these scores
# become the input features of the stacking model.
#   - SGD model: second column of predict_proba
#   - Passive-Aggressive: signed decision_function value
#   - neural network: sigmoid output, flattened to 1-D
# NOTE(review): these are in-sample predictions (each base model scores the
# rows it was trained on), so the meta-learner may over-trust the base models;
# out-of-fold predictions are the usual remedy.
nb_train_proba = nb_classifier.predict_proba(X_train)
nb_predictions = nb_train_proba[:, 1]
pa_predictions = pa_classifier.decision_function(X_train)
nn_train_output = nn_model.predict(X_train)
nn_predictions = nn_train_output.flatten()



In [72]:
# Assemble the meta-feature table for training: one column per base model,
# one row per training sample. The column labels are historical; the
# 'NaiveBayes' column holds the scores stored in nb_predictions.
train_meta_columns = {
    'NaiveBayes': nb_predictions,
    'PassiveAggressive': pa_predictions,
    'NeuralNetwork': nn_predictions,
}
ensemble_train = pd.DataFrame(train_meta_columns)

In [73]:
# Fit the meta-learner: a logistic regression (default settings) that maps the
# three base-model scores to the final label.
ensemble_model = LogisticRegression()
ensemble_model.fit(X=ensemble_train, y=y_train)

LogisticRegression()

In [75]:
# Score the held-out rows with each base model, using the same three outputs
# that were used to build the training meta-features.
nb_test_proba = nb_classifier.predict_proba(X_test)
nb_test_predictions = nb_test_proba[:, 1]
pa_test_predictions = pa_classifier.decision_function(X_test)
nn_test_output = nn_model.predict(X_test)
nn_test_predictions = nn_test_output.flatten()



In [76]:
# Meta-feature table for the held-out rows; the column names and their order
# must match the table the meta-learner was fitted on.
test_meta_columns = {
    'NaiveBayes': nb_test_predictions,
    'PassiveAggressive': pa_test_predictions,
    'NeuralNetwork': nn_test_predictions,
}
ensemble_test = pd.DataFrame(test_meta_columns)

In [77]:
# Predict the final labels with the stacked model and report plain accuracy
# on the held-out split.
ensemble_test_predictions = ensemble_model.predict(ensemble_test)
ensemble_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=ensemble_test_predictions,
)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.9939486078817147
