# In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
import numpy as np
import pandas as pd


# Fetch the NLTK data packages used by word_tokenize, pos_tag and the
# stopwords corpus.
# Recent NLTK releases load the tokenizer and tagger from the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' packages rather than the legacy pickled
# 'punkt' / 'averaged_perceptron_tagger' ones, and raise LookupError when
# those are missing. Requesting both generations keeps the script working
# regardless of the installed NLTK version.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(nltk_resource)


def pos_tagging(text):
    """Return the part-of-speech tags of *text*, one per token.

    The text is tokenized with ``word_tokenize`` and the tokens are labelled
    with ``pos_tag``; only the tags are kept, in token order.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return [label for _, label in tagged_tokens]

# All text-derived features below are computed from the same column.
messages = data['cleaned_message']

# Per-message list of part-of-speech tags.
data['pos_tags'] = messages.apply(pos_tagging)

# TF-IDF over unigrams and bigrams, capped at 2000 terms, with the built-in
# English stop-word list; densified to a 2-D array for the DataFrame below.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=2000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
tfidf_features = tfidf_matrix.toarray()

# Simple hand-crafted features: number of whitespace-separated words, and a
# 0/1 flag for whether any character of the message is a digit.
data['word_count'] = messages.apply(lambda msg: len(msg.split()))
data['contains_numbers'] = messages.apply(
    lambda msg: int(any(ch.isdigit() for ch in msg))
)

# Assemble the feature table: one column per TF-IDF term plus the per-message
# columns copied over from `data`.
# The frame is built on data.index because the column assignments below are
# aligned by index label. With the default RangeIndex, a `data` whose index is
# not exactly 0..n-1 (e.g. after rows were filtered out) would silently yield
# NaN or mismatched rows.
# NOTE(review): a TF-IDF term spelled like one of the copied column names
# (e.g. 'hour', 'sentiment') would be overwritten by the assignment below;
# confirm this cannot occur, or prefix the term columns.
feature_df = pd.DataFrame(
    tfidf_features,
    index=data.index,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
for extra_column in ('word_count', 'contains_numbers', 'sentiment', 'hour', 'day_of_week'):
    feature_df[extra_column] = data[extra_column]

# Standardize every column with StandardScaler. The scaled array is kept
# separately in `scaled_features`; feature_df itself stays unscaled.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df)


# Target column: random 0/1 labels drawn with a fixed seed.
# NOTE(review): because the labels are random, metrics computed from them
# reflect chance; presumably a placeholder until real movement data is joined.
np.random.seed(42)
n_rows = len(feature_df)
feature_df['stock_movement'] = np.random.choice([0, 1], size=n_rows)

# Separate the target vector from the predictor columns.
y = feature_df['stock_movement']
X = feature_df.drop(columns='stock_movement')
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# roc_auc_score is used below; it was previously missing from this import,
# which made the ROC AUC line fail with NameError.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline gradient-boosted tree classifier.
lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100)
lgbm_model.fit(X_train, y_train)

# Hard class predictions, and the predicted probability of class 1 (needed
# for ROC AUC).
y_pred = lgbm_model.predict(X_test)
y_prob = lgbm_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the test-set confusion matrix as an annotated heatmap; both axes use
# the same tick labels for classes 0 and 1.
class_names = ['Down', 'Up']
conf_matrix = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_dist = {
    'num_leaves': [31, 40, 50],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# LightGBM only applies row subsampling (bagging) when subsample_freq > 0.
# With the default of 0 the 'subsample' values above would be ignored and the
# search would spend iterations on configurations that train identically.
# subsample_freq=1 re-samples rows at every boosting iteration.
random_search = RandomizedSearchCV(
    estimator=lgb.LGBMClassifier(subsample_freq=1),
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)

# Evaluate the best model (refit on the full training split by the search)
# on the held-out test set.
best_lgbm_model = random_search.best_estimator_
y_pred_best = best_lgbm_model.predict(X_test)

# Test-set accuracy of the tuned model.
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Improved Accuracy:", accuracy_best)
