In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load data from CSV file
file_path = '/content/drive/MyDrive/data_with_predictions.csv'
df = pd.read_csv(file_path)

# Remove rows where any column has NaN values
df = df.dropna()

# Define features and target for Gaussian Naive Bayes
X =df[['Open ','Shares Traded ', 'Turnover']]
y = df['Prize Action']

# Scale the features for Gaussian Naive Bayes
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets for Gaussian Naive Bayes
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize Gaussian Naive Bayes classifier
clf_nb = BernoulliNB()

# Fit the Gaussian Naive Bayes model
clf_nb.fit(X_train_nb, y_train_nb)
y_pred_nb=clf_nb.predict(X_test_nb)
# Predict probabilities on the test set
y_pred_prob_nb = clf_nb.predict_proba(X_test_nb)[:, 1]  # Probability of the positive class
accuracy = accuracy_score(y_test_nb, y_pred_nb)
precision = precision_score(y_test_nb, y_pred_nb)
recall = recall_score(y_test_nb, y_pred_nb)
f1= f1_score(y_test_nb, y_pred_nb)
# conf_matrix_rf = confusion_matrix(y_test_rf, y_pred_rf)

# Print evaluation metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
# print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 55.62%
Precision: 56.35%
Recall: 42.37%
F1 Score: 48.37%


In [85]:
print(len(X_test_nb))

5464


In [87]:
X_lr = pd.DataFrame({
    'GNB_Prob': y_pred_nb,
    'Sentiment': X_test_nb[:, -1]
})

# Split data into training and testing sets for Logistic Regression
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_test_nb, test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
clf_lr = LogisticRegression()

# Fit the Logistic Regression model
clf_lr.fit(X_train_lr, y_train_lr)

# Predict on the test set
y_pred_lr = clf_lr.predict(X_test_lr)

# Evaluate the Logistic Regression model
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
precision_lr = precision_score(y_test_lr, y_pred_lr)
recall_lr = recall_score(y_test_lr, y_pred_lr)
f1_lr = f1_score(y_test_lr, y_pred_lr)
conf_matrix_lr = confusion_matrix(y_test_lr, y_pred_lr)

# Print evaluation metrics
print("Logistic Regression Classifier Metrics:")
print(f"Accuracy: {accuracy_lr * 100:.2f}%")
print(f"Precision: {precision_lr * 100:.2f}%")
print(f"Recall: {recall_lr * 100:.2f}%")
print(f"F1 Score: {f1_lr * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_lr}")

Logistic Regression Classifier Metrics:
Accuracy: 56.08%
Precision: 55.13%
Recall: 41.35%
F1 Score: 47.25%
Confusion Matrix:
[[398 175]
 [305 215]]


In [88]:

# Add predicted probabilities and 'Sentiment' as features for Logistic Regression
X_lr = pd.DataFrame({
    'RF_Prob': y_pred_prob_rf,
    'Sentiment':  X_test_rf[:, -1]
})

# Split data into training and testing sets for Logistic Regression
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_test_rf, test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
clf_lr = LogisticRegression()

# Fit the Logistic Regression model
clf_lr.fit(X_train_lr, y_train_lr)

# Predict on the test set
y_pred_lr = clf_lr.predict(X_test_lr)

# Evaluate the model
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
precision_lr = precision_score(y_test_lr, y_pred_lr)
recall_lr = recall_score(y_test_lr, y_pred_lr)
f1_lr = f1_score(y_test_lr, y_pred_lr)
conf_matrix_lr = confusion_matrix(y_test_lr, y_pred_lr)

# Print evaluation metrics
print("Logistic Regression Classifier Metrics:")
print(f"Accuracy: {accuracy_lr * 100:.2f}%")
print(f"Precision: {precision_lr * 100:.2f}%")
print(f"Recall: {recall_lr * 100:.2f}%")
print(f"F1 Score: {f1_lr * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_lr}")

Logistic Regression Classifier Metrics:
Accuracy: 59.19%
Precision: 54.37%
Recall: 88.46%
F1 Score: 67.35%
Confusion Matrix:
[[187 386]
 [ 60 460]]
