<a href="https://colab.research.google.com/github/SatoJin02/BDA_course25/blob/main/Ex05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# ============================================
# Step 0. Download dataset directly in Colab
# ============================================
import os
import urllib.request
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

url = "https://www.dropbox.com/s/wa8d1sujzlx56hh/ETL_DATA_new.csv?dl=1"
save_path = "/content/ETL_DATA_new.csv"

if not os.path.exists(save_path):
    urllib.request.urlretrieve(url, save_path)
print("Downloaded to:", save_path)

Downloaded to: /content/ETL_DATA_new.csv


In [17]:
# ============================================
# Step 1. Read dataset and show basic info
# ============================================

Data = pd.read_csv(save_path)
print("Loaded CSV")
print("Shape:", Data.shape)
display(Data.head(5))
print("\nDtypes:")
print(Data.dtypes.head(20))

Loaded CSV
Shape: (46005, 1833)


Unnamed: 0,Unnamed: 1,TimeStamp,Point(139.0794379 36.3727776),Point(139.1051411 36.3963822),Point(139.0960211 36.4047323),Point(139.0428727 36.3816035),Point(138.9955116 36.33801589999999),Point(139.342672 36.4105658),Point(139.3526243 36.3695416),Point(139.1945766 36.31351160000001),...,Point(139.9418164 36.7656467),Point(140.0549894 36.9688923),Point(139.8775674 36.3847082),Point(139.9101767 36.4393022),Point(139.9074816 36.4445767),Point(140.0934838 36.4673588),Point(139.7422865 36.2305774),Point(139.7151723 36.822353),Point(140.1510903 36.6598314),Unnamed: 1832
0,0,2018-01-01 01:00:00,,,5.0,13.0,18.0,20.0,,,...,,,,,,6.0,,,4.0,
1,1,2018-01-01 02:00:00,,,11.0,12.0,22.0,15.0,,,...,,6.0,,,,9.0,,,5.0,
2,2,2018-01-01 03:00:00,,,7.0,12.0,19.0,16.0,,,...,,0.0,,,,10.0,,,6.0,
3,3,2018-01-01 04:00:00,,,5.0,11.0,16.0,11.0,,,...,,2.0,,,,11.0,,,11.0,
4,4,2018-01-01 05:00:00,,,6.0,11.0,10.0,8.0,,,...,,4.0,,,,8.0,,,6.0,



Dtypes:
                                          int64
TimeStamp                                object
Point(139.0794379 36.3727776)           float64
Point(139.1051411 36.3963822)           float64
Point(139.0960211 36.4047323)           float64
Point(139.0428727 36.3816035)           float64
Point(138.9955116 36.33801589999999)    float64
Point(139.342672 36.4105658)            float64
Point(139.3526243 36.3695416)           float64
Point(139.1945766 36.31351160000001)    float64
Point(139.2076974 36.3034767)           float64
Point(139.3817322 36.2909131)           float64
Point(139.3868953 36.2780216)           float64
Point(139.0432674 36.64710669999999)    float64
Point(139.5317782 36.2499123)           float64
Point(139.5202506 36.2351772)           float64
Point(138.9940146 36.4990885)           float64
Point(139.0120412 36.4921403)           float64
Point(138.8939601 36.25898610000001)    float64
Point(138.9138437 36.323256)            float64
dtype: object


In [18]:
# ============================================
# Step 2. Select target dataset
#  - Replace out-of-range (0..250) with NaN (numeric cols)
#  - Remove columns with no location information
#  - Drop columns with >=80% NaN
# ============================================

loc_keywords = ["point", "lat", "lon", "location", "station", "address", "geo"]
location_cols = [c for c in Data.columns if any(k in c.lower() for k in loc_keywords)]

numeric_cols_all = Data.select_dtypes(include=[np.number]).columns.tolist()

target_df = Data.copy()
for col in numeric_cols_all:
    target_df[col] = target_df[col].mask(~target_df[col].between(0, 250), np.nan)

keep_cols = list(set(location_cols) | set(numeric_cols_all))
target_df = target_df[keep_cols]

target_df = target_df.loc[:, target_df.isna().mean() < 0.8]

print("\nTarget dataset selected")
print("Shape:", target_df.shape)
print("Kept columns (sample):", list(target_df.columns)[:15])


Target dataset selected
Shape: (46005, 1121)
Kept columns (sample): ['Point(136.6008578 36.5464656)', 'Point(135.763971 34.745738)', 'Point(130.3227599 31.4168733)', 'Point(135.4523554 34.68306949999999)', 'Point(130.0889357 33.3001304)', 'Point(139.6985104 35.6641008)', 'Point(131.427765 31.9437073)', 'Point(141.1158996 39.3900124)', 'Point(138.3802631 34.9701157)', 'Point(140.8278262 38.2211062)', 'Point(139.7677638 36.1765147)', 'Point(136.6507388 36.5670356)', 'Point(139.4975649 35.5914051)', 'Point(139.7555691 35.7331806)', 'Point(139.5801042 35.58478059999999)']


In [19]:
# ============================================
# Step 3. Create preprocessed dataset via imputation
#  - Choose one technique (here: Median imputation: robust to outliers)
# ============================================

pre_df = target_df.copy()
numeric_cols = pre_df.select_dtypes(include=[np.number]).columns.tolist()

if len(numeric_cols) == 0:
    raise ValueError("No numeric columns left after selection. Please review the filters.")

imp = SimpleImputer(strategy="median")
pre_df[numeric_cols] = imp.fit_transform(pre_df[numeric_cols])

print("\nPreprocessed dataset (imputed by Median)")
print("Shape:", pre_df.shape)
display(pre_df[numeric_cols].head(5).round(3))


Preprocessed dataset (imputed by Median)
Shape: (46005, 1121)


Unnamed: 0,Point(136.6008578 36.5464656),Point(135.763971 34.745738),Point(130.3227599 31.4168733),Point(135.4523554 34.68306949999999),Point(130.0889357 33.3001304),Point(139.6985104 35.6641008),Point(131.427765 31.9437073),Point(141.1158996 39.3900124),Point(138.3802631 34.9701157),Point(140.8278262 38.2211062),...,Point(139.9226579 35.70954990000001),Point(136.7511741 35.385969),Point(135.4104432 34.7935224),Point(135.0828637 35.1294529),Point(130.5574211 31.5967656),Point(138.269266 34.848539),Point(135.960524 35.0122876),Point(136.5528094 34.87787540000001),Point(134.1624967 34.0655576),Point(136.6113011 35.3579319)
0,15.0,16.0,21.0,3.0,16.0,27.0,14.0,18.0,1.0,5.0,...,9.0,16.0,15.0,7.0,13.0,5.0,16.0,18.0,10.0,19.0
1,5.0,16.0,17.0,6.0,14.0,24.0,11.0,17.0,5.0,6.0,...,9.0,17.0,15.0,10.0,9.0,6.0,14.0,17.0,16.0,19.0
2,9.0,16.0,15.0,10.0,14.0,23.0,14.0,24.0,8.0,4.0,...,9.0,15.0,18.0,12.0,10.0,6.0,14.0,8.0,11.0,24.0
3,6.0,14.0,15.0,13.0,15.0,25.0,13.0,23.0,6.0,4.0,...,9.0,18.0,16.0,13.0,12.0,6.0,14.0,10.0,12.0,8.0
4,2.0,15.0,18.0,13.0,19.0,25.0,8.0,22.0,3.0,4.0,...,9.0,14.0,11.0,13.0,16.0,4.0,12.0,14.0,12.0,12.0


In [20]:
# ============================================
# Step 4. Create transformed dataset (transpose)
#   - Rows become columns, columns become rows
#   - Work on numeric-only for decomposition
# ============================================
num_only = pre_df[numeric_cols].copy()

transposedDF = num_only.T.copy()
print("\nTransposed dataset created as 'transposedDF'")
print("transposedDF shape:", transposedDF.shape)
display(transposedDF.iloc[:5, :5])


Transposed dataset created as 'transposedDF'
transposedDF shape: (1121, 46005)


Unnamed: 0,0,1,2,3,4
Point(136.6008578 36.5464656),15.0,5.0,9.0,6.0,2.0
Point(135.763971 34.745738),16.0,16.0,16.0,14.0,15.0
Point(130.3227599 31.4168733),21.0,17.0,15.0,15.0,18.0
Point(135.4523554 34.68306949999999),3.0,6.0,10.0,13.0,13.0
Point(130.0889357 33.3001304),16.0,14.0,14.0,15.0,19.0


In [21]:
# ============================================
# Step 5-a. PCA on transposedDF
#   - Retain components explaining 95% variance
#   - Show #components and explained variance ratio (sum)
# ============================================
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, svd_solver="full")
pca_scores = pca.fit_transform(transposedDF.fillna(0))
n_pcs = pca.n_components_
explained = pca.explained_variance_ratio_
print("\nPCA done")
print("Retained principal components:", n_pcs)
print("Explained variance (sum):", round(float(explained.sum()), 4))
print("Explained variance ratio (first 10):", np.round(explained[:10], 4))


PCA done
Retained principal components: 726
Explained variance (sum): 0.95
Explained variance ratio (first 10): [0.1615 0.0825 0.0527 0.0357 0.0284 0.0197 0.0174 0.0166 0.0141 0.0111]


In [22]:
# ============================================
# Step 5-b. SVD (TruncatedSVD) on transposedDF
# ============================================
from sklearn.decomposition import TruncatedSVD

# choose up to 10 components or smaller depending on matrix shape
svd_components = min(10, transposedDF.shape[1] - 1, transposedDF.shape[0] - 1)
svd_components = max(svd_components, 2)  # ensure at least 2
svd = TruncatedSVD(n_components=svd_components, random_state=42)
svd_scores = svd.fit_transform(transposedDF.fillna(0))

print("\nSVD (TruncatedSVD) done")
print("Chosen components:", svd_components)
print("Explained variance (sum):", round(float(svd.explained_variance_ratio_.sum()), 4))
print("Explained variance ratio (first 10):", np.round(svd.explained_variance_ratio_[:10], 4))


SVD (TruncatedSVD) done
Chosen components: 10
Explained variance (sum): 0.4366
Explained variance ratio (first 10): [0.1227 0.102  0.0528 0.038  0.0309 0.0281 0.0184 0.0172 0.0142 0.0123]


In [23]:
# ============================================
# Step 5-c. LDA on transposed dataset
#   - Create categorical target y by binning one numeric column into 3 classes
#   - Max retained components = (#classes - 1) = 2
# ============================================
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

ref_col = transposedDF.columns[0]
y = pd.qcut(transposedDF[ref_col], q=3, labels=[0,1,2])

lda = LDA(n_components=2)
X_lda = lda.fit_transform(transposedDF.fillna(0), y)
print("\nLDA done")
print("Classes:", np.unique(y).tolist())
print("Retained LDA components:", X_lda.shape[1])


LDA done
Classes: [0, 1, 2]
Retained LDA components: 2
