In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [None]:
# Load the competition's training and test tables from the Kaggle input mount.
train_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv'
)
test_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv'
)

In [None]:
# Display the column labels of the training frame.
train_data.columns

In [None]:
# Display the column labels of the test frame (compare with the training columns).
test_data.columns

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Print the per-column dtype / non-null summary of the training frame.
train_data.info()

In [None]:
# Summary statistics of the numeric columns in the training frame.
train_data.describe()

In [None]:
# Per-column flag: True if the column contains at least one null value.
train_data.isnull().any() # No missing values

In [None]:
#test_data.isnull().any() # No missing values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the distribution of the numerical columns.
# DataFrame.hist creates its own figure, so the size is passed through its
# `figsize` argument; a separate plt.figure() call beforehand would only leave
# an extra, empty figure in the output.
train_data.hist(bins=50, figsize=(12, 10))
plt.tight_layout()
plt.show()

In [None]:
# Plot the timestamp column against the row index. With no missing values,
# a line without jumps suggests the time series is complete (no gaps).
train_data['timestamp'].plot()

## Correlation analysis

In [None]:
# Disabled cell: the correlation-heatmap code below is wrapped in a string
# literal, so it is not executed (the cell only evaluates the string itself).
"""
# Compute correlation matrix
corr = train_data.corr()

# Plotting the heatmap of correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.show()
"""


In [None]:
# Display the training column labels again as a reference for the feature selection below.
train_data.columns

# The columns 'open', 'high', 'low' and 'close' are the same, so only one of them needs to be kept

In [None]:
# Pairplot to visualize relationships between numerical columns and target

#sns.pairplot(train_data[['open', 'target']])
#plt.show()

In [None]:
# Disabled cell: pairplot of the volume/trade-count columns against the target.
# The code is wrapped in a string literal, so it is not executed.
"""

sns.pairplot(train_data[['volume','quote_asset_volume', 
                         'number_of_trades', 'taker_buy_base_volume', 
                         'taker_buy_quote_volume', 'target']])
"""

## Look at the important features via gradient boosting (XGBoost)

In [None]:
# Disabled cell: fits an XGBClassifier on a stratified 80/20 split of the raw
# features (everything except 'target' and 'timestamp') and plots the top-10
# gain-based feature importances. The code is wrapped in a string literal, so
# it is not executed.
"""
Look at the imporatn features
# Via gradient boosting
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


X = train_data.drop(columns=['target', 'timestamp'])
y = train_data['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2)

model = xgb.XGBClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Plot feature importance
xgb.plot_importance(model, max_num_features=10, importance_type='gain')  # or 'weight', 'cover'
plt.title("XGBoost Feature Importance")
plt.tight_layout()
plt.show()
"""



# Preprocessing of the Data

In [None]:
# Preprocessing helper: drops redundant price columns, adds time features and standardizes the numeric columns
from sklearn.preprocessing import StandardScaler
# 1 Remove the columns 'open', 'high', 'low' (they duplicate 'close')
def preprocess(df, scaler=None, fit=True, time_origin=None):
    """Build the model-ready feature frame from a raw competition table.

    Steps:
      1. Drop 'open', 'high' and 'low', keeping 'close' as the only price column.
      2. Derive 'hour', 'minute', 'dayofweek' and 'time_since_start' from the
         'timestamp' column (interpreted as epoch seconds), then drop 'timestamp'.
      3. Standardize the numeric feature columns.

    The input frame is NOT modified; a new frame is returned, so the function
    can be called repeatedly on the same raw frame (safe for cell re-runs).
    Columns that are not listed above (e.g. 'target', 'row_id') pass through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least 'timestamp', 'open', 'high', 'low',
        'close', 'volume', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_base_volume' and 'taker_buy_quote_volume'.
    scaler : object, optional
        Scaler exposing ``fit_transform`` / ``transform``. When omitted, a new
        ``StandardScaler`` is created. Pass the same instance for the training
        and the test frame to scale both with the training statistics.
    fit : bool, default True
        If True the scaler is fitted on this frame (``fit_transform``); if
        False an already fitted scaler is applied (``transform``).
    time_origin : datetime-like, optional
        Reference instant for 'time_since_start'. Defaults to the earliest
        timestamp of ``df``. Pass the training origin when preprocessing test
        data so both frames share the same time axis.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered, standardized features.

    Raises
    ------
    ValueError
        If ``fit`` is False and no scaler is supplied.
    """
    if not fit and scaler is None:
        raise ValueError("fit=False requires an already fitted scaler")

    # 1 Remove the redundant price columns; drop() returns a new frame, which
    #   keeps the caller's `df` untouched.
    out = df.drop(columns=['open', 'high', 'low'])

    # 2 Create new columns based on the timestamp
    datetimes = pd.to_datetime(out['timestamp'], unit='s')
    origin = datetimes.min() if time_origin is None else pd.Timestamp(time_origin)
    out['hour'] = datetimes.dt.hour
    out['minute'] = datetimes.dt.minute
    out['dayofweek'] = datetimes.dt.dayofweek
    out['time_since_start'] = (datetimes - origin).dt.total_seconds()
    out = out.drop(columns=['timestamp'])

    # 3 Normalize each numeric column
    num_columns = ['close', 'volume', 'quote_asset_volume', 'number_of_trades',
       'taker_buy_base_volume', 'taker_buy_quote_volume', 'hour',
       'minute', 'dayofweek', 'time_since_start']
    if scaler is None:
        scaler = StandardScaler()
    if fit:
        out[num_columns] = scaler.fit_transform(out[num_columns])
    else:
        # Reuse previously fitted statistics (e.g. from the training frame).
        out[num_columns] = scaler.transform(out[num_columns])

    return out

# Model training and Results

## Linear regression

In [None]:
# NOTE: very bad (presumably refers to the score of this baseline - confirm).
# Disabled cell: linear-regression baseline thresholded at 0.5 and scored with
# F1 on a stratified 80/20 split. The code is wrapped in a string literal, so
# it is not executed.
# If re-enabled: preprocess() already removes 'timestamp', so dropping
# 'timestamp' again from its result (see the X = ... line) would raise a KeyError.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score

# Load data
train_data = pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv')
test_data = pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv')

# Preprocess the data (assuming the preprocessing function is defined)
train_df = preprocess(train_data)

# Select features and target
X = train_df.drop(columns=['target', 'timestamp'])  # Features (excluding target and timestamp)
y = train_data['target']  # Target variable

# Train-test split (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize Linear Regression model
model = LinearRegression()

# Fit model on training data
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Convert predictions to binary (0 or 1) since Linear Regression gives continuous outputs
y_pred_binary = (y_pred > 0.5).astype(int)  # Assuming the threshold for binary classification is 0.5

# Evaluate using F1-score
f1 = f1_score(y_val, y_pred_binary)
print(f"F1-Score (Linear Regression): {f1:.4f}")

"""


## Xgboost

In [None]:


# Disabled cell: hold-out validation of the XGBoost model (stratified 80/20
# split, 500 trees, max depth 15, F1 on the validation part). The code is
# wrapped in a string literal, so it is not executed.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score

# Preprocess the data 
train_data = pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv')

train_df = preprocess(train_data).copy()


# Assuming `train_data` is your dataset
X = train_df.drop(columns=['target'])  # Features (excluding target and timestamp)
y = train_data['target']  # Target variable


# Train-test split (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


import xgboost as xgb
from sklearn.metrics import f1_score

# Initialize XGBoost model
model = xgb.XGBClassifier(n_estimators=500, 
                          max_depth=15, 
                          random_state=42, 
                          scale_pos_weight=1)

# Fit model on training data
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate using F1-score
f1 = f1_score(y_val, y_pred)
print(f"F1-Score (XGBoost): {f1:.4f}")
"""

In [None]:
# Train the model on the full training set and predict the test set



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score

import xgboost as xgb


# --- Training on the full dataset -------------------------------------------
# Load the raw training table and turn it into the model-ready feature frame.
train_data = pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv')

train_df = preprocess(train_data)

# Features and label are both taken from the preprocessed frame so that rows
# stay aligned regardless of whether preprocess() modifies its input in place.
X = train_df.drop(columns=['target'])  # every column except the label
y = train_df['target']

# Initialize XGBoost model
model = xgb.XGBClassifier(n_estimators=500,
                          max_depth=15,
                          random_state=42,
                          scale_pos_weight=1)

# Fit on all training rows (no hold-out split in this cell).
model.fit(X, y)


# --- Inference ---------------------------------------------------------------
test_data = pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv')
# Keep the submission ids from the raw table before any feature engineering.
row_ids = test_data['row_id']
test_df = preprocess(test_data)

# NOTE(review): train and test are preprocessed by two independent calls, so
# normalization statistics and 'time_since_start' are computed per frame and
# the test features are not on the same scale as the training features.
# Consider sharing the training statistics between both calls.

# Select exactly the training feature columns, in training order. This drops
# 'row_id' and fails early with a KeyError if a feature is missing.
X_test = test_df[list(X.columns)]
y_test = model.predict(X_test)  # predicted labels for the test rows


# Write the submission file (the model itself is not persisted).
output = pd.DataFrame({
    'row_id': row_ids,
    'target': y_test
})
output.to_csv('submission.csv', index=False)
print('submission saved')