In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('AirPassengers.csv', encoding="latin1")

# Display the first few rows of the dataset
print(data.head())

# 1. Data Cleaning
# Updated num_features to include only available numeric columns
num_features = ['#Passengers'] # Only '#Passengers' is a numeric feature
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# No categorical features available in this dataset based on the column list
cat_features = []
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps of Input Data
# Only include the numerical transformer as there are no categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        # Removed categorical transformer as there are no categorical features
        # ("cat", cat_transformer, cat_features)
    ],
    remainder='passthrough' # Keep other columns (like 'Month')
)
preprocessor.set_output(transform="pandas")

# Apply the transformations to the Input data
data_preprocessed = preprocessor.fit_transform(data)

# Generate more readable column names - this might need adjustment if column names are complex after preprocessing
# For this simple case with only numerical features and passthrough, original names might be kept or slightly modified
# Let's inspect the columns after preprocessing to decide on renaming
print(data_preprocessed.head())


# 3. Data Splitting
# Assuming '#Passengers' is the target variable for this dataset based on the column list
X = data_preprocessed.drop(columns=['num__#Passengers']) # Drop the scaled target variable
y = data_preprocessed['num__#Passengers'] # Use the scaled target variable


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows
print(X_train.head())
print(y_train.head())

     Month  #Passengers
0  1949-01          112
1  1949-02          118
2  1949-03          132
3  1949-04          129
4  1949-05          121
   num__#Passengers remainder__Month
0         -1.407779          1949-01
1         -1.357590          1949-02
2         -1.240483          1949-03
3         -1.265578          1949-04
4         -1.332496          1949-05
    remainder__Month
124          1959-05
31           1951-08
98           1957-03
36           1952-01
16           1950-05
124    1.168570
31    -0.680044
98     0.633225
36    -0.914258
16    -1.299037
Name: num__#Passengers, dtype: float64
