In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# download loop in this cell splits on ',' and ':' and URL-unquotes the second part.
# NOTE(review): the embedded URL is a signed link (X-Goog-Date=20240420T210157Z,
# X-Goog-Expires=259200 seconds), so it has presumably expired - regenerate it
# before re-running this cell, and avoid committing fresh signed URLs.
DATA_SOURCE_MAPPING = 'time-series-classification-part-2new:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F71247%2F7784049%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240420%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240420T210157Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D365275dce04b25cee90aca6409df0c9b4ce4dbdf6548aeb92ee365040276193d3db36ca7209f718ab327aaabb502628cb6690a73d10a99dd65a89abeae380aee84bd827090d74db958d944a2b7ad58ef7b56d6ecb5c4448d2f0abc071da45f5bcc9dc33cd055cdd1404a47907b0aa5759a238827f18c8c30013973ec9cf20af5636fbb3c833e8bf179f120086823dece25b43d64fed22a6f6aadfbb2b99007c5dffa3a648f81c77517dc8d767ab5318e91657c677ef601860e6690bef5eb2a57d648fde113af63368525ef77bcddfb5fc40806f3a6e3f15dca89d5507049ca47acd9ead8d69ced86c278eb992e27ba7f5c5fed9e6c64df2b6d91c87a6f0a0d6d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING into a temporary
# file and unpack it under KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray unencoded ':' later in the entry
    # cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            content_length = fileres.headers['content-length']
            # The header can be missing or malformed; fall back to 0, which
            # simply disables the progress bar instead of raising.
            try:
                total_length = int(content_length)
            except (TypeError, ValueError):
                total_length = 0
            size_label = content_length if content_length else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                # Clamp to the bar width and avoid dividing by a zero/unknown length.
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind to a distinct name so the `tarfile` module is not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading time-series-classification-part-2new, 106822 bytes compressed
Downloaded and uncompressed: time-series-classification-part-2new
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/time-series-classification-part-2new/exchange_rate_with_missing.csv


In [None]:
import pandas as pd

# Load the exchange-rate CSV from the Kaggle input directory.
raw_csv_path = '/kaggle/input/time-series-classification-part-2new/exchange_rate_with_missing.csv'
df = pd.read_csv(raw_csv_path)

# Parse the 'date' column as datetimes and use it as the index.
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')



In [None]:
# Count missing values per column before any filling is applied.
# (The variable name says "post" but this is the pre-fill count; the name is
# kept unchanged and is reassigned by the post-fill check later on.)
missing_values_count_post = df.isna().sum()
print(missing_values_count_post)

0     4157
1     4175
2     4161
3     4162
4     4022
5     4096
6     4297
OT    4231
dtype: int64


In [None]:
# Fill gaps in every column with an order-5 spline interpolation.
df = df.interpolate(method='spline', order=5)

In [None]:
# Alternative gap-filling strategies, kept for reference but disabled
# (the spline interpolation cell above is what actually runs):
#df.interpolate(method='linear',inplace=True)
#df.fillna(method='ffill', inplace=True)
#df.fillna(method='bfill', inplace=True)

In [None]:
# Count the missing values that remain in each column after filling.
missing_values_count_post = df.isna().sum()
print(missing_values_count_post)

0      0
1      0
2      0
3      0
4      0
5      0
6      0
OT    18
dtype: int64


In [None]:
# Persist the gap-filled frame, including its date index (written by default).
df.to_csv('output.csv')


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Build lagged copies of the target column '6', then fit a random forest that
# predicts '6' from the other series plus those lags.

# Number of lags to include
n_lags = 6
# lag_k holds the value of '6' from k rows earlier (lag_1 = most recent).
for lag in range(1, n_lags + 1):
    df[f'lag_{lag}'] = df['6'].shift(lag)

# Drop every row that still contains a NaN: the first n_lags rows created by
# shifting, plus any gaps the earlier interpolation left unfilled.
# NOTE: this mutates df in place, so re-running the cell drops further rows.
df.dropna(inplace=True)

# Feature set: the other series followed by lag_1..lag_n (in that order).
features = ['0','1',  '2', '3','4', '5', 'OT'] + [f'lag_{i}' for i in range(1, n_lags + 1)]
X = df[features]

# Target as a 1-D NumPy array. Series.ravel() is deprecated since pandas 2.2;
# to_numpy() returns the same 1-D array without the warning.
Y = df['6'].to_numpy()

# Split the data into training and test sets.
# NOTE(review): train_test_split shuffles rows by default, so the held-out 10%
# is interleaved in time with the training rows - confirm this is intended for
# time-series data (scores on X_test may be optimistic).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# Train the model with the new features
model = RandomForestRegressor(n_estimators=382, max_depth=28, random_state=42)
model.fit(X_train, y_train)



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Exhaustive hyper-parameter search for the random forest, scored by MAE with
# 3-fold cross-validation on the training split, followed by a check of the
# best estimator on the held-out test split.
# NOTE: the grid has 4 * 4 * 3 * 3 * 3 = 432 combinations (1296 fits with cv=3),
# so this cell is slow.

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300,400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was removed in scikit-learn 1.3 and makes those fits fail; for a
    # regressor it meant "use all features", which is spelled 1.0.
    'max_features': [1.0, 'sqrt', 'log2']
}

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Initialize the GridSearchCV object (scores are negated MAE: higher is better)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score (sign flipped back to a plain MAE)
print("Best parameters:", grid_search.best_params_)
print("Best score (MAE):", -grid_search.best_score_)

# Use the best estimator to make predictions
best_rf = grid_search.best_estimator_
predictions = best_rf.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print("Test MAE with best estimator:", mae)


'from sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import mean_absolute_error\n\n# Define the parameter grid\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [None, 10, 20, 30],\n    \'min_samples_split\': [2, 10, 20],\n    \'min_samples_leaf\': [1, 2, 4],\n    \'max_features\': [\'auto\', \'sqrt\', \'log2\']\n}\n\n# Initialize the RandomForestRegressor\nrf = RandomForestRegressor(random_state=42)\n\n# Initialize the GridSearchCV object\ngrid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring=\'neg_mean_absolute_error\', verbose=2, n_jobs=-1)\n\n# Fit the grid search to the data\ngrid_search.fit(X_train, y_train)\n\n# Print the best parameters and best score\nprint("Best parameters:", grid_search.best_params_)\nprint("Best score (MAE):", -grid_search.best_score_)\n\n# Use the best estimator to make predictions\nbest_rf = grid_search.best_estimator_\npredictions = best_rf.predict(X_test)\nmae = mean_absolute_error(y_t

In [None]:

# Recursive 100-step forecast of column '6' with the fitted `model`.
#
# The feature vector is [other series..., lag_1, ..., lag_n]. Only the lag
# slots are updated between steps; the other series are held at their last
# observed values because their future values are unknown.
# NOTE(review): the first prediction is made from the last observed row's own
# features, so forecast[0] lines up with that row's timestamp - confirm this
# matches the horizon the submission expects.

# Lag columns in the order they appear in `features` (lag_1 first).
lag_columns = [name for name in features if name.startswith('lag_')]

# Keep the last row as a one-row DataFrame so the model receives the same
# column names it was fitted with.
current_input = df[features].iloc[[-1]].copy()

forecast = []
for _ in range(100):
    # Predict the next value
    next_value = model.predict(current_input)[0]
    forecast.append(next_value)

    # lag_1 is the most recent value, so the new prediction becomes lag_1 and
    # every existing lag moves one slot older (the oldest is dropped).
    # The previous np.roll over the whole row also rotated the non-lag
    # features and wrote the prediction into the oldest lag slot.
    previous_lags = current_input[lag_columns].to_numpy()[0]
    updated_lags = np.concatenate(([next_value], previous_lags[:-1]))
    current_input.loc[:, lag_columns] = updated_lags.reshape(1, -1)

print(forecast)


In [None]:
import pandas as pd


# Build the submission table: 'Id' runs from 0 to 99, 'Prediction' holds the
# forecast values.
submission_columns = {'Id': range(100), 'Prediction': forecast}
predictions_df = pd.DataFrame(submission_columns)

# Save the table as a CSV file without the DataFrame index.
predictions_df.to_csv('predictions.csv', index=False)