# Logistic Regression

In [None]:
import pandas as pd

In [None]:

# Training split (class-balanced file) used to fit the model.
training = pd.read_csv(filepath_or_buffer="balanced_training_embed.csv")

# Validation split read from its own file.
valid = pd.read_csv(filepath_or_buffer="valid_embed.csv")

# Preprocessing

We define a set of sentiment keywords and flag whether each text contains at least one of them; this flag (`has_keyword`) is the predictor. We also collapse the original three-class label into a binary one (Neutral vs. Not Neutral) to keep prediction with a logistic regression model simple.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Convert the original labels:
#  LABEL_0: "Bearish"
#  LABEL_1: "Bullish"
#  LABEL_2: "Neutral"

# Into these binary labels:
#  Label 0: Bearish/Bullish (Not Neutral)
#  Label 1: Neutral

# Keyword groups: each base word maps to the variants (inflections, casing,
# punctuation-attached forms and misspellings) that count as a match.
# Some variants appear under more than one base word; that is harmless
# because the groups are merged into a single set below.
expanded_keywords = {
    "beat": ["beat", "beating", "beats", "beaten", "beated", "beat."],
    "buy": ["buy", "sell", "buying", "purchase", "bought", "buys"],
    "cut": ["cut", "cutting", "cuts", "Cut", "trimmed", "cut."],
    "drop": ["drop", "drops", "dropping", "dropped", "Drop", "drop-"],
    "gain": ["gain", "gaining", "gained", "gains", "re-gain", "Gain"],
    "miss": ["miss", "missed", "forget", "misssed", "miss.I", "miss-out"],
    "sell": ["sell", "buy", "resell", "re-sell", "selling", "selll"],
    "strong": ["strong", "stong", "strongest", "weak", "stronger", "storng"],
    "upgrade": ["upgrade", "upgrades", "upgrading", "updgrade", "ugrade", "uprade"],
    "weak": ["weak", "weaker", "weakest", "feeble", "weak.The", "strong"],
}

# Merge every variant list into one set so membership checks are O(1)
# and duplicates across groups collapse automatically.
all_keywords = set().union(*expanded_keywords.values())

def find_keywords(text, keywords=None):
    """Return 1 if any whitespace-separated token of ``text`` is a keyword, else 0.

    Matching is exact: tokens are compared as-is, so it is case-sensitive
    and punctuation attached to a token is not stripped.

    Parameters
    ----------
    text : object
        Text to scan. Null values (``None``/``NaN``) are treated as an
        empty string; any other non-string value is converted with ``str``.
    keywords : container of str, optional
        Tokens that count as a match. Defaults to the module-level
        ``all_keywords`` set.

    Returns
    -------
    int
        1 when at least one token is in ``keywords``, otherwise 0.
    """
    if keywords is None:
        keywords = all_keywords
    # Convert NaN/None to an empty string so .split() yields no tokens
    text = str(text) if pd.notnull(text) else ""
    return int(any(token in keywords for token in text.split()))

# NOTE: this cell overwrites `training` and `valid` with reduced frames, so
# re-running it without reloading the CSVs fails ('clean_text' is gone).

# Add the 'has_keyword' column to both training and valid dataframes
training["has_keyword"] = training["clean_text"].apply(find_keywords)
valid["has_keyword"] = valid["clean_text"].apply(find_keywords)

# Convert label 2 (Neutral) into 1; labels 0 and 1 (Bearish/Bullish) become 0
training['binary_label'] = (training['label'] == 2).astype(int)
valid['binary_label'] = (valid['label'] == 2).astype(int)

# Keep only the predictor and the binary target
training = training[['has_keyword', 'binary_label']]
valid = valid[['has_keyword', 'binary_label']]

X_train = training[['has_keyword']]
y_train = training['binary_label']

# Evaluate on the separate validation frame. Building the test set from
# `training` would score the model on the same rows it was fitted on.
X_test = valid[['has_keyword']]
y_test = valid['binary_label']


# **Training the Logistic Regression Model**


Using the predictor "has_keyword" to predict if a tweet or headline is Neutral or Not Neutral.

In [None]:
# Fit a single-predictor logistic regression (has_keyword -> binary_label).
lr_all = LogisticRegression(solver='liblinear')
lr_all.fit(X=X_train, y=y_train)

# Report the fitted parameters; intercept_ is 1-D and coef_ is 2-D,
# hence the [0] / [0][0] indexing for the single binary model.
print("LR Intercept: ", lr_all.intercept_[0])
print("LR has_keyword Coefficient: ", lr_all.coef_[0][0])

LR Intercept:  -0.37583306807178063
LR has_keyword Coefficient:  -1.0159869134906263


# **Using CV for Model Evaluation**



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

X = X_train
y = y_train

# Library 5-fold CV AUC. Keep the scores and report them: as a bare
# expression in the middle of the cell the result would be discarded.
cv_auc_scores = cross_val_score(lr_all, X, y, cv=5, scoring='roc_auc')
print(f"cross_val_score: mean AUC={cv_auc_scores.mean():.3f}")

# Manual stratified, shuffled 5-fold CV to report AUC and accuracy per fold.
skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_number, (train_index, test_index) in enumerate(skfolds.split(X, y), start=1):
    # Fresh, unfitted copy so every fold trains from scratch
    clone_lr = clone(lr_all)

    X_train_folds = X.iloc[train_index]
    y_train_folds = y.iloc[train_index]
    X_test_fold   = X.iloc[test_index]
    y_test_fold   = y.iloc[test_index]

    clone_lr.fit(X_train_folds, y_train_folds)

    # use probabilities for AUC; 0.5 cutoff for accuracy
    y_prob = clone_lr.predict_proba(X_test_fold)[:, 1]
    y_pred = (y_prob >= 0.5).astype(int)

    auc_sample = metrics.roc_auc_score(y_test_fold, y_prob)
    acc_sample = metrics.accuracy_score(y_test_fold, y_pred)

    print(f"Fold {fold_number}: AUC={auc_sample:.3f}  Accuracy={acc_sample:.3f}")


Fold 1: AUC=0.572  Accuracy=0.627
Fold 2: AUC=0.554  Accuracy=0.627
Fold 3: AUC=0.558  Accuracy=0.627
Fold 4: AUC=0.554  Accuracy=0.627
Fold 5: AUC=0.569  Accuracy=0.627


# **Calculating Evaluation Metrics On Test Data**


In [None]:
# Build a per-row table: true label, both class probabilities, predicted class.
# predict_proba returns one column per class; columns 0 and 1 are used below.
proba = lr_all.predict_proba(X_test)

pred_val_sample = pd.DataFrame(
    dict(
        label=y_test,
        lr_predict_class0_proba=proba[:, 0],
        lr_predict_class1_proba=proba[:, 1],
        lr_predict_binary=lr_all.predict(X_test),
    )
)

pred_val_sample.head()

Unnamed: 0,label,lr_predict_class0_proba,lr_predict_class1_proba,lr_predict_binary
0,0,0.592868,0.407132,0
1,0,0.592868,0.407132,0
2,0,0.800883,0.199117,0
3,0,0.800883,0.199117,0
4,0,0.800883,0.199117,0


In [None]:
from sklearn import metrics
import pandas as pd

# True and predicted classes, pulled out once and reused by every metric below
actual_labels = pred_val_sample['label']
predicted_labels = pred_val_sample['lr_predict_binary']

# Confusion matrix. labels=[0, 1] pins the row/column order and guarantees a
# 2x2 result even if one class is absent, so the hard-coded axis names and
# the [0, ...] indexing used for specificity stay valid.
conf_lr = metrics.confusion_matrix(
    y_true=actual_labels,
    y_pred=predicted_labels,
    labels=[0, 1]
)

conf_lr_df = pd.DataFrame(
    conf_lr,
    index=['Actual: 0 (Not Neutral)', 'Actual: 1 (Neutral)'],
    columns=['Predicted: 0 (Not Neutral)', 'Predicted: 1 (Neutral)']
)

print("Confusion Matrix:\n", conf_lr_df, '\n')

# Prediction Accuracy
accuracy = metrics.accuracy_score(
    y_true=actual_labels,
    y_pred=predicted_labels
)
print("Prediction Accuracy:", accuracy)

# Recall Score (Sensitivity / True Positive Rate) for class 1 (Neutral)
recall = metrics.recall_score(
    y_true=actual_labels,
    y_pred=predicted_labels
)
print("Recall Score:", recall)

# Specificity Score (True Negative Rate): TN / (TN + FP), i.e. row 0 of the matrix
specificity = conf_lr[0, 0] / conf_lr[0, :].sum()
print("Specificity Score:", specificity)

# Prediction Error
prediction_error = 1 - accuracy
print("Prediction Error:", prediction_error)


Confusion Matrix:
                          Predicted: 0 (Not Neutral)  Predicted: 1 (Neutral)
Actual: 0 (Not Neutral)                        3365                       0
Actual: 1 (Neutral)                            2000                       0 

Prediction Accuracy: 0.6272134203168686
Recall Score: 0.0
Specificity Score: 1.0
Prediction Error: 0.37278657968313145


# **Predicted Probability Histogram**


In [None]:
# Predicted Probability Histogram
import plotly.express as px


# Overlay the class-1 (Neutral) predicted-probability distributions of the
# two true classes so their separation, or lack of it, is visible.
px.histogram(
    data_frame=pred_val_sample,
    x='lr_predict_class1_proba',
    color='label',
    color_discrete_map={0: 'red', 1: 'blue'},  # class 0 / class 1
    labels={'lr_predict_class1_proba': 'LR Predicted Probabilities'},
    title='Distribution of Predicted Probabilities',
    barmode='overlay',
    opacity=0.5,
    nbins=20,
)

# **Calculating ROC Curve and AUC On Test Data**


In [None]:
from sklearn import metrics
import pandas as pd
import plotly.express as px

# ROC inputs: true binary labels and the predicted probability of class 1.
# Scores must be probabilities, not hard 0/1 predictions.
lr_fpr_sample, lr_tpr_sample, lr_thresholds_sample = metrics.roc_curve(
    y_true=pred_val_sample['label'],
    y_score=pred_val_sample['lr_predict_class1_proba'],
)

# One row per threshold, tagged with the model name
roc_lr_sample = pd.DataFrame(
    dict([
        ('False Positive Rate', lr_fpr_sample),
        ('True Positive Rate', lr_tpr_sample),
        ('Model', 'Logistic Regression'),
    ])
)

fig = px.line(
    data_frame=roc_lr_sample,
    x='False Positive Rate',
    y='True Positive Rate',
    title='ROC Curve',
    width=700,
    height=500,
)
fig.show()

# Area under the ROC curve, computed from the same labels and scores
lr_auc_sample = metrics.roc_auc_score(
    y_true=pred_val_sample['label'],
    y_score=pred_val_sample['lr_predict_class1_proba'],
)

print('Logistic Regression AUC:', round(lr_auc_sample, 3))


Logistic Regression AUC: 0.562
