# In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
import numpy as np
from datetime import datetime
from geopy.distance import geodesic
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

# Load the datasets.
# The locations are kept in named constants so they can be changed in one
# place. The defaults are absolute '/content/...' paths (they look like
# Google Colab paths -- confirm); pd.read_csv raises FileNotFoundError when
# the files are not present at these locations.
TRAIN_CSV_PATH = '/content/train.csv'
TEST_CSV_PATH = '/content/test.csv'

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Function to engineer features from the DATE column
def create_date_features(df):
    """Add calendar and cyclical features derived from the ``DATE`` column.

    ``DATE`` is parsed in place using the ``%d-%m-%Y`` format, then the frame
    gains ``DAY_OF_WEEK``, ``MONTH``, ``DAY_OF_YEAR``, ``YEAR``,
    ``IS_WEEKEND`` and sine/cosine encodings of the day of year and month.

    Args:
        df: DataFrame with a ``DATE`` column of ``dd-mm-YYYY`` strings.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        ValueError: from ``pd.to_datetime`` when a value does not match the
            expected format.
    """
    # Adjust the date format to match the actual format in your dataset
    # (use dayfirst=True instead of format= if the date format varies).
    df['DATE'] = pd.to_datetime(df['DATE'], format='%d-%m-%Y')

    dates = df['DATE'].dt
    df['DAY_OF_WEEK'] = dates.dayofweek
    df['MONTH'] = dates.month
    df['DAY_OF_YEAR'] = dates.dayofyear
    df['YEAR'] = dates.year

    # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    # Vectorized comparison instead of a per-row Python lambda.
    df['IS_WEEKEND'] = (df['DAY_OF_WEEK'] >= 5).astype(int)

    # Cyclical encodings so that the end of a period sits next to its start.
    day_angle = 2 * np.pi * df['DAY_OF_YEAR'] / 365.25
    df['DAY_SIN'] = np.sin(day_angle)
    df['DAY_COS'] = np.cos(day_angle)

    month_angle = 2 * np.pi * df['MONTH'] / 12
    df['MONTH_SIN'] = np.sin(month_angle)
    df['MONTH_COS'] = np.cos(month_angle)
    return df


# Function to calculate distance between locations
def calculate_distances(df):
    """Add pairwise station distances and elevation differences.

    For the station pairs A-B, A-C and B-C the frame gains a
    ``DISTANCE_<pair>`` column (``geodesic(...).km`` between the two
    latitude/longitude points of each row) and an ``ELEV_DIFF_<pair>``
    column (first station's elevation minus the second's).

    Args:
        df: DataFrame with ``LATITUDE_*``, ``LONGITUDE_*`` and
            ``ELEVATION_*`` columns for stations A, B and C.

    Returns:
        The same DataFrame object, modified in place.
    """
    # (target column, (lat, lon) of the first point, (lat, lon) of the second)
    distance_specs = (
        ('DISTANCE_AB', ('LATITUDE_A', 'LONGITUDE_A'), ('LATITUDE_B', 'LONGITUDE_B')),
        ('DISTANCE_AC', ('LATITUDE_A', 'LONGITUDE_A'), ('LATITUDE_C', 'LONGITUDE_C')),
        ('DISTANCE_BC', ('LATITUDE_B', 'LONGITUDE_B'), ('LATITUDE_C', 'LONGITUDE_C')),
    )
    # (target column, minuend column, subtrahend column)
    elevation_specs = (
        ('ELEV_DIFF_AB', 'ELEVATION_A', 'ELEVATION_B'),
        ('ELEV_DIFF_AC', 'ELEVATION_A', 'ELEVATION_C'),
        ('ELEV_DIFF_BC', 'ELEVATION_B', 'ELEVATION_C'),
    )

    def _km_between(row, start_cols, end_cols):
        # Row-wise helper: geodesic works on one pair of points at a time.
        # NOTE(review): raw coordinate values are passed straight through;
        # behaviour for rows with missing coordinates depends on geopy --
        # confirm the inputs are complete at this stage.
        start = (row[start_cols[0]], row[start_cols[1]])
        end = (row[end_cols[0]], row[end_cols[1]])
        return geodesic(start, end).km

    for target, start_cols, end_cols in distance_specs:
        df[target] = df.apply(_km_between, axis=1, args=(start_cols, end_cols))

    for target, upper, lower in elevation_specs:
        df[target] = df[upper] - df[lower]

    return df

# Function to calculate weather-based features
def calculate_weather_features(df):
    """Add temperature, precipitation and snow-depth summary features.

    Columns added, in order: per-station ``TEMP_DIFF_*`` (TMAX - TMIN) and
    ``AVG_TEMP_*`` (row mean of TMAX and TMIN), ``CUMULATIVE_PRCP`` and
    ``CUMULATIVE_SNWD`` (row sums over the three stations), ``TEMP_RANGE``
    (highest TMAX minus lowest TMIN in the row) and per-station
    ``TEMP_ANOMALY_*`` (AVG_TEMP minus a baseline).

    The baseline is the mean of the three ``TAVG_*`` column means of the
    frame passed in, so it is specific to that frame.

    Args:
        df: DataFrame with ``TMAX_*``, ``TMIN_*``, ``TAVG_*``, ``PRCP_*``
            and ``SNWD_*`` columns for stations A, B and C.

    Returns:
        The same DataFrame object, modified in place.
    """
    # (target, high column, low column)
    spread_specs = (
        ('TEMP_DIFF_A', 'TMAX_A', 'TMIN_A'),
        ('TEMP_DIFF_B', 'TMAX_B', 'TMIN_B'),
        ('TEMP_DIFF_C', 'TMAX_C', 'TMIN_C'),
    )
    for target, high, low in spread_specs:
        df[target] = df[high] - df[low]

    # (target, columns averaged per row)
    midpoint_specs = (
        ('AVG_TEMP_A', ['TMAX_A', 'TMIN_A']),
        ('AVG_TEMP_B', ['TMAX_B', 'TMIN_B']),
        ('AVG_TEMP_C', ['TMAX_C', 'TMIN_C']),
    )
    for target, columns in midpoint_specs:
        df[target] = df[columns].mean(axis=1)

    # Totals across the three stations for each row.
    total_specs = (
        ('CUMULATIVE_PRCP', ['PRCP_A', 'PRCP_B', 'PRCP_C']),
        ('CUMULATIVE_SNWD', ['SNWD_A', 'SNWD_B', 'SNWD_C']),
    )
    for target, columns in total_specs:
        df[target] = df[columns].sum(axis=1)

    row_high = df[['TMAX_A', 'TMAX_B', 'TMAX_C']].max(axis=1)
    row_low = df[['TMIN_A', 'TMIN_B', 'TMIN_C']].min(axis=1)
    df['TEMP_RANGE'] = row_high - row_low

    # Calculate anomalies against the mean of the per-column TAVG means.
    column_means = df[['TAVG_A', 'TAVG_B', 'TAVG_C']].mean()
    baseline = column_means.mean()
    anomaly_specs = (
        ('TEMP_ANOMALY_A', 'AVG_TEMP_A'),
        ('TEMP_ANOMALY_B', 'AVG_TEMP_B'),
        ('TEMP_ANOMALY_C', 'AVG_TEMP_C'),
    )
    for target, source in anomaly_specs:
        df[target] = df[source] - baseline

    return df

# Function to create interaction features
def create_interaction_features(df):
    """Add element-wise product (interaction) features.

    Three groups of products are added, in this order: latitude x longitude
    per station, elevation x average temperature per station, and
    precipitation products for the station pairs A-B, A-C and B-C.

    Args:
        df: DataFrame with ``LATITUDE_*``, ``LONGITUDE_*``, ``ELEVATION_*``,
            ``TAVG_*`` and ``PRCP_*`` columns for stations A, B and C.

    Returns:
        The same DataFrame object, modified in place.
    """
    # target column -> (left factor, right factor); insertion order is the
    # order in which the new columns are appended to the frame.
    products = {
        'LAT_LONG_INTERACT_A': ('LATITUDE_A', 'LONGITUDE_A'),
        'LAT_LONG_INTERACT_B': ('LATITUDE_B', 'LONGITUDE_B'),
        'LAT_LONG_INTERACT_C': ('LATITUDE_C', 'LONGITUDE_C'),
        'ELEV_TEMP_INTERACT_A': ('ELEVATION_A', 'TAVG_A'),
        'ELEV_TEMP_INTERACT_B': ('ELEVATION_B', 'TAVG_B'),
        'ELEV_TEMP_INTERACT_C': ('ELEVATION_C', 'TAVG_C'),
        'PRCP_INTERACT_AB': ('PRCP_A', 'PRCP_B'),
        'PRCP_INTERACT_AC': ('PRCP_A', 'PRCP_C'),
        'PRCP_INTERACT_BC': ('PRCP_B', 'PRCP_C'),
    }
    for target, (left, right) in products.items():
        df[target] = df[left] * df[right]

    return df

# Function to create statistical summary features
def create_statistical_features(df):
    """Add per-row mean, variance and median across the three stations.

    The statistics are computed over ``TAVG_A/B/C`` (``*_TEMP`` columns) and
    over ``PRCP_A/B/C`` (``*_PRCP`` columns). Variance uses pandas' default
    for ``DataFrame.var``.

    Args:
        df: DataFrame with ``TAVG_*`` and ``PRCP_*`` columns for stations
            A, B and C.

    Returns:
        The same DataFrame object, modified in place.
    """
    # (source columns, mean target, variance target, median target)
    summaries = (
        (['TAVG_A', 'TAVG_B', 'TAVG_C'], 'MEAN_TEMP', 'VAR_TEMP', 'MEDIAN_TEMP'),
        (['PRCP_A', 'PRCP_B', 'PRCP_C'], 'MEAN_PRCP', 'VAR_PRCP', 'MEDIAN_PRCP'),
    )
    for columns, mean_col, var_col, median_col in summaries:
        readings = df[columns]
        df[mean_col] = readings.mean(axis=1)
        df[var_col] = readings.var(axis=1)
        df[median_col] = readings.median(axis=1)

    return df

# Function to create lagged features
def create_lagged_features(df):
    """Add one-row lags and 3-row rolling means of TAVG and PRCP.

    Lags come from ``shift(1)`` and rolling means from
    ``rolling(window=3).mean()``, both applied in the frame's current row
    order; the leading rows therefore hold NaN.

    NOTE(review): nothing here sorts by date, so the lags are only
    "previous day" values if the rows already arrive in chronological
    order -- confirm against the input files.

    Args:
        df: DataFrame with ``TAVG_*`` and ``PRCP_*`` columns for stations
            A, B and C.

    Returns:
        The same DataFrame object, modified in place.
    """
    # (target column, source column) -- previous-row value
    lag_specs = (
        ('TAVG_A_LAG1', 'TAVG_A'),
        ('TAVG_B_LAG1', 'TAVG_B'),
        ('TAVG_C_LAG1', 'TAVG_C'),
        ('PRCP_A_LAG1', 'PRCP_A'),
        ('PRCP_B_LAG1', 'PRCP_B'),
        ('PRCP_C_LAG1', 'PRCP_C'),
    )
    # (target column, source column) -- mean of the current and two prior rows
    rolling_specs = (
        ('ROLLING_MEAN_TEMP_A', 'TAVG_A'),
        ('ROLLING_MEAN_TEMP_B', 'TAVG_B'),
        ('ROLLING_MEAN_TEMP_C', 'TAVG_C'),
        ('ROLLING_MEAN_PRCP_A', 'PRCP_A'),
        ('ROLLING_MEAN_PRCP_B', 'PRCP_B'),
        ('ROLLING_MEAN_PRCP_C', 'PRCP_C'),
    )

    for target, source in lag_specs:
        df[target] = df[source].shift(1)

    for target, source in rolling_specs:
        window = df[source].rolling(window=3)
        df[target] = window.mean()

    return df

# Function to create clusters based on geographical data
def create_location_clusters(df, n_clusters=3, kmeans=None):
    """Label each row with a K-Means cluster of station A's location.

    Clustering uses ``LATITUDE_A``, ``LONGITUDE_A`` and ``ELEVATION_A`` and
    the labels are written to ``LOCATION_CLUSTER``.

    By default a new ``KMeans`` model is fitted on ``df``. Cluster ids from
    two independently fitted models are arbitrary labels and need not
    correspond, so to get comparable ids on several frames pass the same
    model through ``kmeans``: the first call fits it, later calls only
    predict with it.

    Args:
        df: DataFrame with the three station-A location columns.
        n_clusters: Number of clusters for a newly created model. Ignored
            when ``kmeans`` is given.
        kmeans: Optional clustering model exposing ``fit_predict`` and
            ``predict``. If it already has ``cluster_centers_`` it is treated
            as fitted and reused as-is; otherwise it is fitted on ``df``.

    Returns:
        The same DataFrame object, modified in place.
    """
    location_columns = ['LATITUDE_A', 'LONGITUDE_A', 'ELEVATION_A']
    locations = df[location_columns]

    if kmeans is None:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)

    # scikit-learn sets cluster_centers_ during fitting, so its presence
    # marks a model that must be reused rather than refitted.
    if hasattr(kmeans, 'cluster_centers_'):
        df['LOCATION_CLUSTER'] = kmeans.predict(locations)
    else:
        df['LOCATION_CLUSTER'] = kmeans.fit_predict(locations)
    return df

# Ensemble of regressors used for imputation: three tree-based models that
# share the same small configuration (10 trees, fixed seed), followed by a
# Bayesian ridge regression with default settings.
estimators = [
    *(
        tree_model(n_estimators=10, random_state=0)
        for tree_model in (
            RandomForestRegressor,
            GradientBoostingRegressor,
            ExtraTreesRegressor,
        )
    ),
    BayesianRidge(),
]

# Ensemble Iterative Imputer Function
def ensemble_iterative_imputer(X, estimators, n_iter=10):
    """Impute missing values with several IterativeImputers and average them.

    One ``IterativeImputer`` (``max_iter=n_iter``, ``random_state=0``) is
    fitted on ``X`` per estimator; the element-wise mean of the imputed
    matrices is returned.

    Args:
        X: Numeric DataFrame that may contain NaN.
        estimators: Iterable of regressors, one per imputer.
        n_iter: Maximum number of imputation rounds per imputer.

    Returns:
        DataFrame with the same columns and the same index as ``X``. Columns
        of ``X`` without a single observed value cannot be imputed and stay
        NaN.

    Raises:
        ValueError: if ``estimators`` is empty.
    """
    estimators = list(estimators)
    if not estimators:
        raise ValueError("estimators must contain at least one estimator")

    imputations = [
        IterativeImputer(
            estimator=estimator, max_iter=n_iter, random_state=0
        ).fit_transform(X)
        for estimator in estimators
    ]

    # Average the imputations
    averaged_imputations = np.mean(imputations, axis=0)

    # Keep X's index: callers concatenate the result with other columns of
    # the source frame, and pandas aligns on index labels.
    if averaged_imputations.shape[1] == X.shape[1]:
        return pd.DataFrame(averaged_imputations, columns=X.columns, index=X.index)

    # With its defaults IterativeImputer discards features that had no
    # observed value at fit time, so the matrix is narrower than X. Label
    # the surviving columns and restore the dropped ones as all-NaN.
    observed_columns = X.columns[X.notna().any(axis=0)]
    imputed = pd.DataFrame(
        averaged_imputations, columns=observed_columns, index=X.index
    )
    return imputed.reindex(columns=X.columns)

# Function to impute missing values, excluding non-numeric columns
def impute_missing_values(df):
    """Impute NaNs in the numeric columns and re-attach the other columns.

    Numeric columns are imputed with ``ensemble_iterative_imputer`` using
    the module-level ``estimators``; non-numeric columns are passed through
    untouched. In the result the numeric columns come first, followed by
    the non-numeric ones.

    Args:
        df: DataFrame to impute.

    Returns:
        A new DataFrame with the same rows and index as ``df``.
    """
    # Separate numeric and non-numeric data
    numeric_df = df.select_dtypes(include=[np.number])
    non_numeric_df = df.select_dtypes(exclude=[np.number])

    # Impute missing values in numeric data using Ensemble Iterative Imputer
    imputed_numeric_df = ensemble_iterative_imputer(numeric_df, estimators)

    # The imputer may hand back a frame with a fresh 0..n-1 index. Rows are
    # still in the original order, so restore the original labels; without
    # this, concat (which aligns on index) would misalign rows or pad with
    # NaN whenever df does not use a default index.
    imputed_numeric_df.index = numeric_df.index

    # Concatenate the imputed numeric data with non-numeric data
    return pd.concat([imputed_numeric_df, non_numeric_df], axis=1)



# Engineer the features and impute missing values.
# Every step is applied to the training frame and then to the test frame
# before the next step starts; imputation runs last.
# NOTE(review): each step sees the two frames independently, so anything
# that is fitted inside a step (the K-Means clusters, the imputers, the
# temperature baseline) is fitted separately for train and test -- confirm
# this is intended.
pipeline_steps = (
    create_date_features,
    calculate_distances,
    calculate_weather_features,
    create_interaction_features,
    create_statistical_features,
    create_lagged_features,
    create_location_clusters,
    impute_missing_values,
)
for step in pipeline_steps:
    train_df = step(train_df)
    test_df = step(test_df)

# Save the transformed datasets
for frame, destination in (
    (train_df, 'transformed_train.csv'),
    (test_df, 'transformed_test.csv'),
):
    frame.to_csv(destination, index=False)

# Optionally, display the first few rows of the transformed data
for frame in (train_df, test_df):
    print(frame.head())


# Recorded output of the cell above:
# FileNotFoundError: [Errno 2] No such file or directory: '/content/train.csv'