In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Baseline: predict the next row's S&P 500 close with ordinary least squares.

# Load the raw dataset
data = pd.read_csv('/kaggle/input/financial-data/financial_regression.csv')

# Derive the daily range only when the CSV does not already provide it
if 'high-low' not in data.columns:
    data['high-low'] = data['sp500 high'] - data['sp500 low']  # Adjust as per the columns you have

# Target: the close of the following row.
# NOTE(review): shift(-1) only means "next day" if rows are in date order - confirm.
data['next_day_close'] = data['sp500 close'].shift(-1)

# Drop every row whose next close is unknown (the final row, and any row
# followed by a missing close); such rows have no label to learn from.
data = data.dropna(subset=['next_day_close'])

# Define features and target column
features = ['sp500 open', 'sp500 high', 'sp500 low', 'sp500 volume', 'high-low']  # Example features
target = 'next_day_close'  # Target variable

X = data[features]
# The target is NaN-free after the dropna above, so it is deliberately not
# imputed: mean-filling labels would fabricate targets, and the reshape it
# needed turned y into an (n, 1) column vector.
y = data[target]

# Chronological hold-out: the last 20% of rows form the test set. A shuffled
# split would interleave test days with training days, which leaks neighbouring
# prices into training and overstates forecasting accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Learn the imputation means from the training rows only, then reuse them on
# the test rows, so held-out data never influences preprocessing.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize and train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Tune the regularization strength of a Ridge model for next-row close prediction.

# Load the raw dataset here instead of reusing `data` from an earlier cell.
# Re-applying shift(-1)/dropna to an already-filtered frame would drop another
# row on every run and pair rows with the wrong "next" close.
data = pd.read_csv('/kaggle/input/financial-data/financial_regression.csv')

# Derive the daily range only when the CSV does not already provide it
if 'high-low' not in data.columns:
    data['high-low'] = data['sp500 high'] - data['sp500 low']  # Adjust as per the columns you have

# Target: the close of the following row.
# NOTE(review): shift(-1) only means "next day" if rows are in date order - confirm.
data['next_day_close'] = data['sp500 close'].shift(-1)

# Drop every row whose next close is unknown; such rows have no label.
data = data.dropna(subset=['next_day_close'])

# Define features and target column
features = ['sp500 open', 'sp500 high', 'sp500 low', 'sp500 volume', 'high-low']  # Example features
target = 'next_day_close'  # Target variable

X = data[features]
# NaN-free after the dropna above, so the target is not imputed and stays 1-D.
y = data[target]

# Chronological hold-out: the last 20% of rows form the test set, so the model
# is never evaluated on days that sit between its training days.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Learn the imputation means from the training rows only, then reuse them on
# the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize a model (Ridge or Lasso)
model = Ridge()  # You can replace Ridge with Lasso or LinearRegression

# Set up the parameter grid for tuning
param_grid = {
    'alpha': [0.1, 1, 10, 100],  # Regularization strength for Ridge and Lasso
}

# Five ordered folds: each validation fold comes after the rows it was trained
# on, matching how the model would be used to forecast.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_mean_squared_error',
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and best score (the scorer is negated MSE, so flip the sign)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score (MSE):", -grid_search.best_score_)

# Get the best model from the grid search (refit on the full training set)
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Compare three tree ensembles and their simple average on next-row close prediction.

# Load the raw dataset
data = pd.read_csv('/kaggle/input/financial-data/financial_regression.csv')

# Derive the daily range only when the CSV does not already provide it
if 'high-low' not in data.columns:
    data['high-low'] = data['sp500 high'] - data['sp500 low']  # Adjust as per the columns you have

# Target: the close of the following row.
# NOTE(review): shift(-1) only means "next day" if rows are in date order - confirm.
data['next_day_close'] = data['sp500 close'].shift(-1)

# Drop every row whose next close is unknown; such rows have no label.
data = data.dropna(subset=['next_day_close'])

# Define features and target column
features = ['sp500 open', 'sp500 high', 'sp500 low', 'sp500 volume', 'high-low']  # Example features
target = 'next_day_close'  # Target variable

X = data[features]
# NaN-free after the dropna above, so the target is not imputed. Keeping it 1-D
# also avoids the DataConversionWarning the scikit-learn ensembles raise for a
# column-vector target.
y = data[target]

# Chronological hold-out: the last 20% of rows form the test set. A shuffled
# split would let the trees memorise days adjacent to each test day.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Learn the imputation means from the training rows only, then reuse them on
# the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Initialize models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)

# Train the models
for estimator in (rf_model, gb_model, xgb_model):
    estimator.fit(X_train, y_train)

# Make predictions
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)

# Evaluate the models
rf_mse = mean_squared_error(y_test, rf_pred)
gb_mse = mean_squared_error(y_test, gb_pred)
xgb_mse = mean_squared_error(y_test, xgb_pred)

rf_r2 = r2_score(y_test, rf_pred)
gb_r2 = r2_score(y_test, gb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

# Print results
print(f"Random Forest MSE: {rf_mse}, R2: {rf_r2}")
print(f"Gradient Boosting MSE: {gb_mse}, R2: {gb_r2}")
print(f"XGBoost MSE: {xgb_mse}, R2: {xgb_r2}")

# Ensemble: unweighted mean of the three models' predictions
ensemble_pred = (rf_pred + gb_pred + xgb_pred) / 3

# Evaluate ensemble model
ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

print(f"Ensemble Model MSE: {ensemble_mse}, R2: {ensemble_r2}")
