In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shenba/time-series-datasets")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'time-series-datasets' dataset.
Path to dataset files: /kaggle/input/time-series-datasets


In [24]:
# ================================
# Electric Production Data Preprocessing
# ================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

# Load dataset (adjust the path as per your Colab output)
df = pd.read_csv("/root/.cache/kagglehub/datasets/shenba/time-series-datasets/versions/1/Electric_Production.csv")

print("---- Dataset Info ----")
print(df.info(), "\n")

print("---- Missing Values ----")
print(df.isnull().sum(), "\n")

# Handle missing data (if any)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Add derived time features (if needed)
# You already have Year, Month, Quarter — so we’ll keep them as they are.

# Check for categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols, "\n")

# Encode categorical variables (if any exist)
if len(categorical_cols) > 0:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))
    df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)
else:
    print("No categorical columns to encode.\n")

# Feature Scaling (Standardization)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
print("After Scaling:\n", scaled_df.head(), "\n")

# Dimensionality Reduction (PCA)
pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(pca_features, columns=['PC1', 'PC2'])
print("Explained Variance Ratio (PCA):", pca.explained_variance_ratio_, "\n")

# Feature Selection (SelectKBest)
X = df.drop(columns=['IPG2211A2N'])
y = df['IPG2211A2N']

selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X, y)
scores = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
print("Feature Selection Scores:\n", scores.sort_values(by='Score', ascending=False), "\n")

# Summary of Transformations
print("""
==================== SUMMARY ====================
Missing values handled using mean imputation (if any).
No categorical encoding was needed (dataset fully numeric).
Feature scaling applied using StandardScaler (Z-score normalization).
PCA reduced dataset to 2 principal components for visualization.
Feature selection (SelectKBest) showed importance scores of numeric features.
==================================================
""")


---- Dataset Info ----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   DATE        397 non-null    object 
 1   IPG2211A2N  397 non-null    float64
dtypes: float64(1), object(1)
memory usage: 6.3+ KB
None 

---- Missing Values ----
DATE          0
IPG2211A2N    0
dtype: int64 

Categorical columns: Index(['DATE'], dtype='object') 

After Scaling:
    IPG2211A2N  DATE_1/1/1986  DATE_1/1/1987  DATE_1/1/1988  DATE_1/1/1989  \
0   -1.063349      -0.050252      -0.050252      -0.050252      -0.050252   
1   -1.182632      -0.050252      -0.050252      -0.050252      -0.050252   
2   -1.717612      -0.050252      -0.050252      -0.050252      -0.050252   
3   -2.041574      -0.050252      -0.050252      -0.050252      -0.050252   
4   -2.181881      -0.050252      -0.050252      -0.050252      -0.050252   

   DATE_1/1/1990  DATE_1/1/1991  DATE_1/1/19