<a href="https://colab.research.google.com/github/SatoJin02/BDA_course25/blob/main/s1290074_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0: Library Imports and Data Load

We use only lightweight, widely available scientific packages: pandas, numpy, and scikit-learn.
IterativeImputer is experimental, so we explicitly enable it.

In [5]:
# ===============================================
# Import the required libraries
# ===============================================
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
# The experimental switch has to be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer  # <- required: add this!
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
import warnings
# NOTE(review): no category is given, so this ignores every warning raised
# after this point (deprecation notices included) -- confirm that is intended.
warnings.filterwarnings("ignore")

In [6]:
# ===============================================
# Load Dataset
# ===============================================
# Read the raw CSV into memory; later cells refer to this frame as `Data`.
file_path = "/content/ETL_DATA_new.csv"
Data = pd.read_csv(filepath_or_buffer=file_path)

# Status banner, frame dimensions and the preview heading, one per line.
print(
    "Data Loaded Successfully!",
    f"Data shape: {Data.shape}",
    "\nPreview:",
    sep="\n",
)
display(Data.head())

Data Loaded Successfully!
Data shape: (46005, 1833)

Preview:


Unnamed: 0,Unnamed: 1,TimeStamp,Point(139.0794379 36.3727776),Point(139.1051411 36.3963822),Point(139.0960211 36.4047323),Point(139.0428727 36.3816035),Point(138.9955116 36.33801589999999),Point(139.342672 36.4105658),Point(139.3526243 36.3695416),Point(139.1945766 36.31351160000001),...,Point(139.9418164 36.7656467),Point(140.0549894 36.9688923),Point(139.8775674 36.3847082),Point(139.9101767 36.4393022),Point(139.9074816 36.4445767),Point(140.0934838 36.4673588),Point(139.7422865 36.2305774),Point(139.7151723 36.822353),Point(140.1510903 36.6598314),Unnamed: 1832
0,0,2018-01-01 01:00:00,,,5.0,13.0,18.0,20.0,,,...,,,,,,6.0,,,4.0,
1,1,2018-01-01 02:00:00,,,11.0,12.0,22.0,15.0,,,...,,6.0,,,,9.0,,,5.0,
2,2,2018-01-01 03:00:00,,,7.0,12.0,19.0,16.0,,,...,,0.0,,,,10.0,,,6.0,
3,3,2018-01-01 04:00:00,,,5.0,11.0,16.0,11.0,,,...,,2.0,,,,11.0,,,11.0,
4,4,2018-01-01 05:00:00,,,6.0,11.0,10.0,8.0,,,...,,4.0,,,,8.0,,,6.0,


# Step 1: Data Selection

Goal:

1. Identify columns whose names contain sensor-location keywords (e.g., point, lat, lon) and move them to the front of the table.

2. Remove columns where 80 % or more of the cells are missing (only columns with less than 80 % missing are kept).

3. Retain only numeric columns for later imputation (this restriction is applied at the start of Step 2).

In [7]:
# ===============================================
# SELECTION → Target Data Creation
# ===============================================

# Substrings (matched case-insensitively) that mark a column as location-related.
location_keywords = ['lat', 'lon', 'location', 'station', 'point']


def _mentions_location(column_name):
    """Return True when the lower-cased column name contains any location keyword."""
    lowered = column_name.lower()
    return any(keyword in lowered for keyword in location_keywords)


# Reorder only: location columns go first, every other column follows in its
# original order. No column is dropped by this step.
keep_cols = [name for name in Data.columns if _mentions_location(name)]
if keep_cols:
    other_cols = [name for name in Data.columns if name not in keep_cols]
    Data = Data[keep_cols + other_cols]

# Keep a column only when strictly less than 80 % of its cells are missing.
missing_fraction = Data.isnull().mean()
Data = Data.loc[:, missing_fraction < 0.8]
print("\nAfter SELECTION:", Data.shape)


After SELECTION: (46005, 1129)


# Step 2: Pre-processing and Missing-Value Imputation
(a) Replace abnormal values

Values outside 0–250 are considered abnormal and set to NaN.

(b) Choose columns for imputation

We limit to 5 – 10 numeric columns to avoid memory/time issues.

(c) Apply multiple imputation techniques

In [10]:
# ===============================================
# PREPROCESSING
# ===============================================

# (a) Replace abnormal values: anything outside the closed range [0, 250]
# becomes NaN. `between` is False for NaN, so existing gaps simply stay NaN.
# NOTE(review): every numeric column is masked, which presumably also covers
# any row-index column saved in the CSV -- confirm that this is intended.
num_cols = Data.select_dtypes(include=[np.number]).columns
for col in num_cols:
    Data[col] = Data[col].where(Data[col].between(0, 250))


# (b) Only the first 8 numeric columns are imputed, to bound memory and time.
target_cols = num_cols[:8]
print("\nColumns selected for imputation:", target_cols)


def _fit_impute(imputer, frame, cols):
    """Return a copy of ``frame`` in which ``cols`` were filled by ``imputer``.

    Parameters
    ----------
    imputer : object exposing ``fit_transform`` (e.g. a scikit-learn imputer)
    frame : pandas.DataFrame
        Source data; it is copied, never modified.
    cols : sequence of column labels
        Columns that are fitted on and overwritten with the imputed values.
    """
    filled = frame.copy()
    filled[cols] = imputer.fit_transform(filled[cols])
    return filled


# (c) Apply every technique to its own copy of Data, keyed by a display name.
imputed_results = {}

# ---- 1. Mean ----
mean_imp = SimpleImputer(strategy='mean')
df_mean = _fit_impute(mean_imp, Data, target_cols)
imputed_results["Mean"] = df_mean

# ---- 2. Median ----
median_imp = SimpleImputer(strategy='median')
df_median = _fit_impute(median_imp, Data, target_cols)
imputed_results["Median"] = df_median

# ---- 3. Mode ----
mode_imp = SimpleImputer(strategy='most_frequent')
df_mode = _fit_impute(mode_imp, Data, target_cols)
imputed_results["Mode"] = df_mode

# ---- 4. kNN (k=1) ----
knn_imp = KNNImputer(n_neighbors=1)
df_knn = _fit_impute(knn_imp, Data, target_cols)
imputed_results["KNN (k=1)"] = df_knn

# ---- 5. Forward-fill ----
# `.ffill()` replaces the deprecated `fillna(method='ffill')`. Leading NaNs
# (no earlier value to copy from) remain missing.
df_ffill = Data.copy()
df_ffill[target_cols] = df_ffill[target_cols].ffill()
imputed_results["Forward Fill"] = df_ffill

# ---- 6. Backward-fill ----
# `.bfill()` replaces the deprecated `fillna(method='bfill')`. Trailing NaNs
# (no later value to copy from) remain missing.
df_bfill = Data.copy()
df_bfill[target_cols] = df_bfill[target_cols].bfill()
imputed_results["Backward Fill"] = df_bfill

# ---- 7. MICE----
mice_imp = IterativeImputer(random_state=42, max_iter=5)
df_mice = _fit_impute(mice_imp, Data, target_cols)
imputed_results["MICE (Iterative)"] = df_mice

# ---- 8. Linear Regression----
# Each target column is regressed on the other target columns using the rows
# where it is observed; the fitted model then predicts the missing rows.
df_lr = Data.copy()
for col in target_cols:
    # Re-read the working frame every round, so columns that were filled in
    # earlier iterations serve as predictors for the later ones.
    temp = df_lr[target_cols].copy()
    missing_rows = temp[col].isna()
    train = temp[~missing_rows]
    test = temp[missing_rows]

    # Nothing to fill, or too few observed rows to fit on.
    if test.empty or len(train) <= 5:
        continue

    X_train = train.drop(columns=[col])
    y_train = train[col]
    X_test = test.drop(columns=[col])

    # Gaps in the predictors are filled with the training-set column means
    # (computed once and reused for both the training and prediction rows).
    feature_means = X_train.mean()
    X_train = X_train.fillna(feature_means)
    X_test = X_test.fillna(feature_means)

    # A predictor that is entirely NaN in the training rows has no mean and is
    # still NaN after the fill; such predictors are dropped.
    valid_cols = X_train.columns[X_train.notna().all()].tolist()
    if not valid_cols:
        print(f"Skip column '{col}' (all features missing)")
        continue

    X_train = X_train[valid_cols]
    X_test = X_test[valid_cols]

    # Best effort: a failure on one column is reported and the loop moves on.
    try:
        model = LinearRegression()
        model.fit(X_train, y_train)
        df_lr.loc[missing_rows, col] = model.predict(X_test)
    except Exception as e:
        print(f"Skip column '{col}' due to error: {e}")

imputed_results["Linear Regression"] = df_lr


print("\nAll imputation techniques completed!")


# Per-technique summary statistics of the imputed columns.
for name, df_imp in imputed_results.items():
    print(f"\n{name}")
    print(df_imp[target_cols].describe().T[['mean','std','min','max']].round(2))


Columns selected for imputation: Index(['Point(139.0960211 36.4047323)', 'Point(139.0428727 36.3816035)',
       'Point(138.9955116 36.33801589999999)', 'Point(139.342672 36.4105658)',
       'Point(139.3817322 36.2909131)', 'Point(139.0432674 36.64710669999999)',
       'Point(139.5317782 36.2499123)', 'Point(138.9940146 36.4990885)'],
      dtype='object')

All imputation techniques completed!

Mean
                                       mean   std  min    max
Point(139.0960211 36.4047323)          9.60  7.12  0.0  110.0
Point(139.0428727 36.3816035)          9.91  6.86  0.0   77.0
Point(138.9955116 36.33801589999999)  13.52  9.67  0.0  200.0
Point(139.342672 36.4105658)          11.40  7.21  0.0   72.0
Point(139.3817322 36.2909131)         10.66  7.73  0.0  200.0
Point(139.0432674 36.64710669999999)   9.81  8.12  0.0  145.0
Point(139.5317782 36.2499123)         11.69  7.20  0.0   70.0
Point(138.9940146 36.4990885)          7.89  1.67  0.0   35.0

Median
                            

# Step 3: Save Mean-Imputed Dataset

In [11]:
# ===============================================
# Save Mean Imputed Data
# ===============================================
# `imputedData` is the same object as the "Mean" entry (no copy is made);
# it is written out without the row index.
imputedData = imputed_results["Mean"]
imputedData.to_csv(path_or_buf="/content/imputedData.csv", index=False)
print("\nimputedData.csv saved successfully!")


imputedData.csv saved successfully!


# Step 4: Transformation – Task 1 (Binary Conversion)

Convert continuous values into binary form:
If a value ≥ threshold → 1, otherwise 0.
Default threshold = 35 µg/m³, but for national-average monthly data, 15–25 is often more informative.

In [12]:
# ===============================================
# TRANSFORMATION → Binary Conversion
# ===============================================
# Work on a copy so the imputed frame itself is left as it was.
df_binary = imputedData.copy()
for sensor in target_cols:
    # 1 when the reading is at or above 35, otherwise 0
    # (a NaN reading compares False and therefore maps to 0).
    at_or_above = df_binary[sensor] >= 35
    df_binary[sensor] = at_or_above.astype(int)

print("\nBinary transformation completed.")
display(df_binary.head())


Binary transformation completed.


Unnamed: 0,Point(139.0960211 36.4047323),Point(139.0428727 36.3816035),Point(138.9955116 36.33801589999999),Point(139.342672 36.4105658),Point(139.3817322 36.2909131),Point(139.0432674 36.64710669999999),Point(139.5317782 36.2499123),Point(138.9940146 36.4990885),Point(138.8939601 36.25898610000001),Point(138.8275195 36.5786787),...,Unnamed: 616,Unnamed: 625,Unnamed: 1023,Unnamed: 1414,Unnamed: 1512,Unnamed: 1562,Unnamed: 1573,Unnamed: 1697,Unnamed: 1709,Unnamed: 1712
0,0,0,0,0,0,0,0,0,4.0,3.0,...,0.0,2.0,,30.0,14.0,4.0,9.0,13.0,17.0,13.0
1,0,0,0,0,0,0,0,0,2.0,4.0,...,4.0,14.0,,12.0,14.0,5.0,7.0,24.0,12.0,5.0
2,0,0,0,0,0,0,0,0,2.0,1.0,...,5.0,7.0,,14.0,9.0,15.0,7.0,18.0,75.0,8.0
3,0,0,0,0,0,0,0,0,5.0,1.0,...,5.0,8.0,,17.0,9.0,3.0,8.0,14.0,10.0,13.0
4,0,0,0,0,0,0,0,0,8.0,9.0,...,4.0,7.0,,16.0,12.0,8.0,7.0,7.0,13.0,4.0


# Step 5: Transformation – Task 2 (Create Transactional Database)

Each row represents one observation.
We create a transaction list of all columns with value 1 in that row.

In [17]:
# ===============================================
# Create Transactional Database
# ===============================================

binary_cols = [c for c in target_cols if c in df_binary.columns]
df_binary[binary_cols] = df_binary[binary_cols].fillna(0).astype(int)


transactions_series = df_binary[binary_cols].apply(
    lambda row: ",".join(row.index[row.eq(1)]),
    axis=1
)


transactional_df = pd.DataFrame({
    "TransactionID": np.arange(len(transactions_series)),
    "Items": transactions_series
})


transactional_df.to_csv("/content/transactionalDatabase.csv", index=False)
print("transactionalDatabase.csv saved successfully!")


!head -n 10 /content/transactionalDatabase.csv

transactionalDatabase.csv saved successfully!
TransactionID,Items
0,
1,
2,
3,
4,
5,
6,
7,
8,
