# Programming Assignment
> ## Project Description & Data Source
> Please download the dataset: `O-A0038-003.xml`. This dataset contains gridded data for "Hourly Temperature Distribution Analysis" from the meteorological observation platform.
> The data specifications are as follows:
> * Each grid point represents a temperature observation in Celsius (°C).
> * The invalid data value is -999.
> * The resolution for both longitude and latitude is 0.03 degrees.
> * The coordinates for the bottom-left grid point are: Longitude 120.00°, Latitude 21.88°.
> * The data increases first along the longitude (67 values per row) and then increases along the latitude (for a total of 120 rows), forming a 67 x 120 grid of numerical values.
> ## Part 1: Data Transformation
> Transform the original data into two supervised learning datasets:
> ### (a) Classification Dataset
> * Format: (Longitude, Latitude, Label)
> * Rules:
>     * If the temperature observation is the invalid value -999, then label = 0.
>     * If the temperature observation is a valid value, then label = 1.
> ### (b) Regression Dataset
> * Format: (Longitude, Latitude, Value)
> * Rules:
>     * Retain only the valid temperature observations (remove all -999 values).
>     * value is the corresponding temperature in Celsius.
> ## Part 2: Model Training
> Using the two datasets prepared in Part 1, train a simple machine learning model for each task:
> ### Classification Model:
> * Use (Longitude, Latitude) to predict whether the grid data represents a valid value (0 or 1).
> ### Regression Model:
> * Use (Longitude, Latitude) to predict the corresponding temperature observation value.

In [None]:
import xml.etree.ElementTree as ET
import numpy as np
from pathlib import Path
import csv

def convert_and_export_weather_data():
    """
    Convert the gridded temperature XML into two CSV datasets.

    Reads 'O-A0038-003.xml' from the script directory (or from the current
    working directory when ``__file__`` is undefined, e.g. in a notebook),
    reshapes the comma-separated values into a (latitude, longitude) grid and
    writes two files next to the XML:

    * 'classification_data.csv' -- (longitude, latitude, label) for every
      grid point, where label is 0 for the invalid value -999 and 1 otherwise.
    * 'regression_data.csv' -- (longitude, latitude, value) for the valid
      grid points only.

    Errors are reported with ``print``; nothing is raised to the caller for
    read/parse/write failures.

    Returns:
        None.
    """
    # --- 0. Resolve the working directory and the grid specification ---
    try:
        script_dir = Path(__file__).parent.resolve()
    except NameError:
        # __file__ does not exist in notebooks / interactive sessions.
        script_dir = Path.cwd()
    xml_file = script_dir / 'O-A0038-003.xml'

    start_lon = 120.00
    start_lat = 21.88
    lon_resolution = 0.03
    lat_resolution = 0.03
    lon_points = 67
    lat_points = 120
    invalid_value = -999.0

    # --- 1. Read and parse the source XML data ---
    try:
        root = ET.parse(xml_file).getroot()
        namespace = {'cwa': 'urn:cwa:gov:tw:cwacommon:0.1'}
        content = root.find('.//cwa:Content', namespace)
        if content is None or not content.text:
            raise ValueError("the <Content> element is missing or empty")
        all_floats = [
            float(val)
            for line in content.text.strip().split('\n')
            for val in line.split(',')
            if val.strip()
        ]
        # Values run along longitude first, then latitude: one row per latitude.
        temp_grid = np.array(all_floats).reshape(lat_points, lon_points)
    except FileNotFoundError:
        print(f"Error: File not found at '{xml_file}'. Please ensure it is in the same directory.")
        return
    except Exception as e:
        print(f"An error occurred while reading or parsing the XML file: {e}")
        return

    # --- 2. Create the latitude and longitude coordinate grid ---
    # Round to the grid's two decimals so binary floating-point noise from
    # start + k * 0.03 does not leak into the exported coordinates.
    longitudes = np.round(start_lon + np.arange(lon_points) * lon_resolution, 2)
    latitudes = np.round(start_lat + np.arange(lat_points) * lat_resolution, 2)

    # --- 3. Generate the classification and regression datasets ---
    classification_data = []
    regression_data = []
    for lat, temp_row in zip(latitudes, temp_grid):
        for lon, temp_value in zip(longitudes, temp_row):
            is_valid = temp_value != invalid_value
            classification_data.append(
                {'longitude': lon, 'latitude': lat, 'label': int(is_valid)}
            )
            if is_valid:
                regression_data.append(
                    {'longitude': lon, 'latitude': lat, 'value': temp_value}
                )

    print("Data conversion complete.")
    print(f"Total entries in classification dataset: {len(classification_data)}.")
    print(f"Total entries in regression dataset: {len(regression_data)}.")

    # --- 4. Write the data into two separate CSV files ---
    # Each entry: (dataset kind, CSV header, rows, prefix of the success message).
    exports = (
        ('classification', ['longitude', 'latitude', 'label'], classification_data, "\n"),
        ('regression', ['longitude', 'latitude', 'value'], regression_data, ""),
    )
    for kind, fieldnames, rows, lead in exports:
        csv_path = script_dir / f"{kind}_data.csv"
        try:
            with open(csv_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(rows)
            print(f"{lead}Successfully wrote {kind} data to: '{csv_path}'")
        except Exception as e:
            # A failure on one file must not prevent writing the other.
            print(f"An error occurred while writing the {kind} CSV: {e}")


if __name__ == '__main__':
    convert_and_export_weather_data()

## Data Sampling:
```python
from sklearn.model_selection import train_test_split
# 2. Data Splitting
    X_train, X_temp, y_train_ohe, y_temp_ohe = train_test_split(X, y_ohe, test_size=0.3, random_state=42, stratify=y_ohe)
    X_val, X_test, y_val_ohe, y_test_ohe = train_test_split(X_temp, y_temp_ohe, test_size=(1/3), random_state=42, stratify=y_temp_ohe)
```
* Training Data: 70 %
* Validation Data: 20 %
* Test Data: 10 %
## Model: Neural Network
```python
from tensorflow.keras.models import Sequential
# 4. Model
    model = Sequential([
        Dense(32, activation='relu', input_shape=(2,)), # Input => Hidden Layer 1 (32 neurons)
        Dense(32, activation='relu'),                   # Hidden Layer 1 (32 neurons) => Hidden Layer 2 (32 neurons)
        Dense(2, activation='softmax')                  # Output Layer (Softmax)
    ])
```
$$z^{[1]}=W^{[1]}X+b^{[1]}, \text{ where }W^{[1]}: 32\times 2, b^{[1]}: 32\times 1$$
$$a^{[1]}=ReLU(z^{[1]})$$
$$z^{[2]}=W^{[2]}a^{[1]}+b^{[2]}, \text{ where }W^{[2]}: 32\times 32, b^{[2]}: 32\times 1$$
$$a^{[2]}=ReLU(z^{[2]})$$
$$z^{[3]}=W^{[3]}a^{[2]}+b^{[3]}, \text{ where }W^{[3]}: 2\times 32, b^{[3]}: 2\times 1$$
$$\hat{y}=Softmax(z^{[3]})$$
```
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ dense (Dense)                        │ (None, 32)                  │              96 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_1 (Dense)                      │ (None, 32)                  │           1,056 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_2 (Dense)                      │ (None, 2)                   │              66 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 1,218 (4.76 KB)
 Trainable params: 1,218 (4.76 KB)
 Non-trainable params: 0 (0.00 B)
```
### Loss Function
$$y\text{: one-hot encoding vector of the true label}$$
$$\hat{y}\text{: the predicted probability vector (softmax output)}$$
$$y_i, \hat{y}_i\text{: the }i\text{-th component of the vector}$$

#### 1. Euclidean Distance
$$L_{MSE}(y,\hat{y})=\dfrac{1}{2}\displaystyle\sum_{i=1}^2 (y_i-\hat{y}_i)^2$$
#### 2. Cosine Similarity
$$Similarity(y,\hat{y})=\dfrac{y\cdot\hat{y}}{\|y\|\|\hat{y}\|}$$
$$L_{Similarity}(y,\hat{y})=-Similarity(y,\hat{y})$$
#### 3. Cross Entropy
$$L_{CE}(y,\hat{y})=-\displaystyle\sum_{i=1}^2 y_i\log{(\hat{y_i})}$$
### Optimizer: Adam
```python
# 5. Compile
    model.compile(optimizer='adam',
                  
                  # Option 1: Euclidean Distance
                  loss='mean_squared_error',
                  
                  # Option 2: Cosine Similarity
                  #loss='cosine_similarity',
                  
                  # Option 3: Categorical Crossentropy
                  #loss='categorical_crossentropy',
                  metrics=['accuracy'])
```
### Stop Criterion
Training stops when the validation loss has not improved for 10 consecutive epochs, or when the 100-epoch limit is reached.
```python
# 6. Train with Early Stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
```

## Analysis of Classification
### Metrics
TP: True Positive
FP: False Positive
TN: True Negative
FN: False Negative
$$\begin{array}{ccl}Accuracy&=&\dfrac{TP+TN}{TP+FP+TN+FN}\\[10pt]
Precision&=&\dfrac{TP}{TP+FP}\\[10pt]
Recall&=&\dfrac{TP}{TP+FN}\\[10pt]
Brier&=&\dfrac{1}{N}\displaystyle\sum_{i=1}^N(f_i-o_i)^2\end{array}$$
$$\begin{array}{ccl}N&:& \text{ Total number of Samples }\\ f_i&:& \text{ The predicted probability of Class 1 for the }i\text{-th sample }\\ o_i&:& \text{The observed outcome (0 or 1) for the }i\text{-th sample}\end{array}$$

Comparison of 3 Methods (on test set):
| Loss Function | Accuracy | Precision | Recall | Brier Score | Epochs to Stop |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Euclidean Distance | 0.9739 | 0.9532 | **0.9886** | 0.0164 | 100 |
| Cosine Similarity | 0.9739 | **0.9557** | 0.9857 | 0.0178 | 100 |
| Cross-Entropy | 0.9739 | 0.9532 | **0.9886** | **0.0157** | 100 |

For the initial training capped at 100 epochs, the accuracy of all three models on the test set was identical. This suggests that for this simple classification task with a clear topographical boundary, all three loss functions were capable of guiding the model to learn a similar decision boundary. Regarding other metrics, the performance of Euclidean Distance and Cross-Entropy was nearly identical, with Cross-Entropy holding only a slight advantage in its Brier Score. However, it was observed that all three models completed the full 100 epochs, indicating they might not have fully converged. Therefore, the epoch limit was increased to 1000 for a second round of experiments.

The results after extended training are as follows:

Comparison of 3 Methods with more epochs (on test set):
| Loss Function | Accuracy | Precision | Recall | Brier Score | Epochs to Stop |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Euclidean Distance | 0.9764 | 0.9559 | 0.9914 | 0.0162 | 192 |
| Cosine Similarity | **0.9776** | **0.9611** | 0.9886 | 0.0163 | 205 |
| Cross-Entropy | 0.9764 | 0.9534 | **0.9943** | **0.0149** | **169** |

Once again, we can observe that the accuracy of the three models is nearly identical. With extended training, **Cosine Similarity** demonstrated its potential, achieving the best performance in both **Accuracy** and **Precision**. This implies that it can be a strong choice in scenarios where the cost of a false positive (FP) is high.

On the other hand, **Cross-Entropy** was the **fastest** to converge, achieved the highest Recall, and produced the most **reliable** probability predictions (lowest Brier Score). This means that in situations where the cost of a false negative (FN) is high, this loss function would provide the greatest benefit.

### Euclidean Distance
#### First try
```
Epoch 100/100
176/176 - 0s - 435us/step - accuracy: 0.9819 - loss: 0.0143 - val_accuracy: 0.9789 - val_loss: 0.0157
Model training complete.
```
| Data Set | Confusion Matrix | Accuracy | Precision | Recall | Brier Score |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Training | $$\begin{pmatrix} 3083 & 99 \\ 33 & 2413 \end{pmatrix}$$| 0.9765 | 0.9606 | 0.9865 | 0.0168 |
| Validation | $$\begin{pmatrix} 884 & 25 \\ 9 & 690 \end{pmatrix}$$ | 0.9789 | 0.9650 | 0.9871 | 0.0157 |
| Test | $$\begin{pmatrix} 437 & 17 \\ 4 & 346 \end{pmatrix}$$ | 0.9739 | 0.9532 | 0.9886 | 0.0164 |

<table>
  <tr>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Correctness_Training_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Confidence_Training_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Correctness_Validation_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Confidence_Validation_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Correctness_Test_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Confidence_Test_Set.png" width="400"></td>
  </tr>
</table>

#### Second try

```
Epoch 192/1000

176/176 - 0s - 449us/step - accuracy: 0.9833 - loss: 0.0130 - val_accuracy: 0.9820 - val_loss: 0.0139

Model training complete.
```
| Data Set | Confusion Matrix | Accuracy | Precision | Recall | Brier Score |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Training | $$\begin{pmatrix} 3088 & 94 \\ 30 & 2416 \end{pmatrix}$$| 0.9780 | 0.9625 | 0.9877 | 0.0154 |
| Validation | $$\begin{pmatrix} 888 & 21 \\ 8 & 691 \end{pmatrix}$$ | 0.9820 | 0.9705 | 0.9886 | 0.0139 |
| Test | $$\begin{pmatrix} 438 & 16 \\ 3 & 347 \end{pmatrix}$$ | 0.9764 | 0.9559 | 0.9914 | 0.0162 |
<table>
  <tr>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Correctness_Training_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Confidence_Training_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Correctness_Validation_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Confidence_Validation_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Correctness_Test_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_MSE_Prediction_Confidence_Test_Set_1000.png" width="400"></td>
  </tr>
</table>


### Cosine Similarity
#### First try
```
Epoch 100/100
176/176 - 0s - 440us/step - accuracy: 0.9796 - loss: -9.8294e-01 - val_accuracy: 0.9776 - val_loss: -9.8135e-01
Model training complete.
```
| Data Set | Confusion Matrix | Accuracy | Precision | Recall | Brier Score |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Training | $$\begin{pmatrix} 3087 & 95 \\ 35 & 2411 \end{pmatrix}$$| 0.9769 | 0.9621 | 0.9857 | 0.0177 |
| Validation | $$\begin{pmatrix} 884 & 25 \\ 11 & 688 \end{pmatrix}$$ | 0.9776 | 0.9649 | 0.9843 | 0.0171 |
| Test | $$\begin{pmatrix} 438 & 16 \\ 5 & 345 \end{pmatrix}$$ | 0.9739 | 0.9557 | 0.9857 | 0.0178 |

<table>
  <tr>
    <td><img src="Figures of Classification/NN_CS_Prediction_Correctness_Training_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CS_Prediction_Confidence_Training_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CS_Prediction_Correctness_Validation_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CS_Prediction_Confidence_Validation_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CS_Prediction_Correctness_Test_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CS_Prediction_Confidence_Test_Set.png" width="400"></td>
  </tr>
</table>

#### Second try
```
Epoch 205/1000

176/176 - 0s - 448us/step - accuracy: 0.9819 - loss: -9.8478e-01 - val_accuracy: 0.9801 - val_loss: -9.8380e-01

Model training complete.
```
| Data Set | Confusion Matrix | Accuracy | Precision | Recall | Brier Score |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Training | $$\begin{pmatrix} 3088 & 94 \\ 32 & 2414 \end{pmatrix}$$| 0.9776 | 0.9625 | 0.9869 | 0.0157 |
| Validation | $$\begin{pmatrix} 889 & 20 \\ 10 & 689 \end{pmatrix}$$ | 0.9813 | 0.9718 | 0.9857 | 0.0146 |
| Test | $$\begin{pmatrix} 440 & 14 \\ 4 & 346 \end{pmatrix}$$ | 0.9776 | 0.9611 | 0.9886 | 0.0163 |
<table>
  <tr>
    <td><img src="Figures of Classification/NN_CS_Prediction_Correctness_Training_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CS_Prediction_Confidence_Training_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CS_Prediction_Correctness_Validation_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CS_Prediction_Confidence_Validation_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CS_Prediction_Correctness_Test_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CS_Prediction_Confidence_Test_Set_1000.png" width="400"></td>
  </tr>
</table>


### Cross Entropy
#### First Try
```
Epoch 100/100
176/176 - 0s - 438us/step - accuracy: 0.9799 - loss: 0.0520 - val_accuracy: 0.9813 - val_loss: 0.0498
Model training complete.
```
| Data Set | Confusion Matrix | Accuracy | Precision | Recall | Brier Score |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Training | $$\begin{pmatrix} 3091 & 91 \\ 24 & 2422 \end{pmatrix}$$| 0.9796 | 0.9638 | 0.9902 | 0.0160 |
| Validation | $$\begin{pmatrix} 885 & 24 \\ 6 & 693 \end{pmatrix}$$ | 0.9813 | 0.9665 | 0.9914 | 0.0142 |
| Test | $$\begin{pmatrix} 437 & 17 \\ 4 & 346 \end{pmatrix}$$ | 0.9739 | 0.9532 | 0.9886 | 0.0157 |

<table>
  <tr>
    <td><img src="Figures of Classification/NN_CE_Prediction_Correctness_Training_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CE_Prediction_Confidence_Training_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CE_Prediction_Correctness_Validation_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CE_Prediction_Confidence_Validation_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CE_Prediction_Correctness_Test_Set.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CE_Prediction_Confidence_Test_Set.png" width="400"></td>
  </tr>
</table>

#### Second Try
```
Epoch 169/1000

176/176 - 0s - 434us/step - accuracy: 0.9817 - loss: 0.0472 - val_accuracy: 0.9820 - val_loss: 0.0472

Model training complete.
```
| Data Set | Confusion Matrix | Accuracy | Precision | Recall | Brier Score |
| :--- | :--- | :--- | :--- | :--- | :--- |
| Training | $$\begin{pmatrix} 3097 & 85 \\ 22 & 2424 \end{pmatrix}$$| 0.9810 | 0.9661 | 0.9910 | 0.0147 |
| Validation | $$\begin{pmatrix} 887 & 22 \\ 6 & 693 \end{pmatrix}$$ | 0.9826 | 0.9692 | 0.9914 | 0.0133 |
| Test | $$\begin{pmatrix} 437 & 17 \\ 2 & 348 \end{pmatrix}$$ | 0.9764 | 0.9534 | 0.9943 | 0.0149 |
<table>
  <tr>
    <td><img src="Figures of Classification/NN_CE_Prediction_Correctness_Training_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CE_Prediction_Confidence_Training_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CE_Prediction_Correctness_Validation_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CE_Prediction_Confidence_Validation_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Classification/NN_CE_Prediction_Correctness_Test_Set_1000.png" width="400"></td>
    <td><img src="Figures of Classification/NN_CE_Prediction_Confidence_Test_Set_1000.png" width="400"></td>
  </tr>
</table>

## Analysis of Regression
```
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ dense (Dense)                        │ (None, 64)                  │             192 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_1 (Dense)                      │ (None, 64)                  │           4,160 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_2 (Dense)                      │ (None, 1)                   │              65 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 4,417 (17.25 KB)
 Trainable params: 4,417 (17.25 KB)
 Non-trainable params: 0 (0.00 B)
Epoch 100/100
77/77 - 0s - 619us/step - loss: 9.6671 - mae: 2.2426 - mse: 9.6671 - val_loss: 11.1105 - val_mae: 2.4173 - val_mse: 11.1105
Model training complete.
```
| Data Set | MSE | RMSE(°C) | MAE(°C) | Max Error(°C) |
| :--- | :--- | :--- | :--- | :--- |
| Training | 9.4897 | 3.0805 | 2.1981 | 14.6367 |
| Validation | 11.1105 | 3.3332 | 2.4173 | 13.2554 |
| Test | 9.1782 | 3.0296 | 2.2110 | 10.5454 |
<table>
  <tr>
    <td><img src="Figures of Regression/NN_Actual_Temperature_Training_Set.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Predicted_Temperature_Training_Set.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Temperature_Error_Training_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Regression/NN_Actual_Temperature_Validation_Set.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Predicted_Temperature_Validation_Set.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Temperature_Error_Validation_Set.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Regression/NN_Actual_Temperature_Test_Set.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Predicted_Temperature_Test_Set.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Temperature_Error_Test_Set.png" width="400"></td>
  </tr>
</table>

From the results, we can observe that the model has successfully learned the general temperature distribution on the training set; in other words, the model's complexity is sufficient for the data.

However, the error distribution plots clearly indicate that the model lacks a spatial concept. It is unable to comprehend the drastic topographical changes of the mountain ranges, only learning that temperatures in certain longitude/latitude zones might be lower, while ignoring the complex and sharp variations within those mountainous regions. This is particularly evident in the Max Error metric. In the initial 100-epoch run, the error on the test set reached 10.5°C. If this model were to be used in a practical application, it would likely lead to many complaints.

Similarly, the model reached the 100-epoch limit, so the training ceiling was raised to allow for full convergence. The results are as follows:

```
Epoch 695/1000
77/77 - 0s - 637us/step - loss: 7.0278 - mae: 1.8375 - mse: 7.0278 - val_loss: 8.2070 - val_mae: 1.9408 - val_mse: 8.2070
Model training complete.
```
| Data Set | MSE | RMSE(°C) | MAE(°C) | Max Error(°C) |
| :--- | :--- | :--- | :--- | :--- |
| Training | 6.8407 | 2.6086 | 1.7507 | 13.5421 |
| Validation | 8.0210 | 2.8637 | 1.9421 | 10.9425 |
| Test | 7.2721 | 2.6967 | 1.8694 | 8.3852 |
<table>
  <tr>
    <td><img src="Figures of Regression/NN_Actual_Temperature_Training_Set_1000.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Predicted_Temperature_Training_Set_1000.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Temperature_Error_Training_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Regression/NN_Actual_Temperature_Validation_Set_1000.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Predicted_Temperature_Validation_Set_1000.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Temperature_Error_Validation_Set_1000.png" width="400"></td>
  </tr>
  <tr>
    <td><img src="Figures of Regression/NN_Actual_Temperature_Test_Set_1000.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Predicted_Temperature_Test_Set_1000.png" width="400"></td>
    <td><img src="Figures of Regression/NN_Temperature_Error_Test_Set_1000.png" width="400"></td>
  </tr>
</table>

With extended training, we can see a clear improvement in the metrics, and the overall predicted trend appears smoother. The error in the mountainous regions has also been mitigated, especially concerning the extreme error points. Nevertheless, the primary source of error remains concentrated in the Central Mountain Range.

The maximum error is still significant. For an application like a weather forecast, where high precision is expected by the public, a maximum error of over 8°C on the test set is still unacceptable.

From this, we can deduce a key insight: longitude and latitude alone can only represent general temperature trends. To predict such "localized" temperatures accurately, other parameters like **altitude** and **humidity** must be considered. This demonstrates that prior domain knowledge and thorough data collection play a crucial role in the success of a machine learning project.

# Code

# Data Transformation

In [None]:
import xml.etree.ElementTree as ET
import numpy as np
from pathlib import Path
import csv

def convert_and_export_weather_data():
    """
    Convert the gridded temperature XML into two CSV datasets.

    Reads 'O-A0038-003.xml' from the script directory (or from the current
    working directory when ``__file__`` is undefined, e.g. in a notebook),
    reshapes the comma-separated values into a (latitude, longitude) grid and
    writes two files next to the XML:

    * 'classification_data.csv' -- (longitude, latitude, label) for every
      grid point, where label is 0 for the invalid value -999 and 1 otherwise.
    * 'regression_data.csv' -- (longitude, latitude, value) for the valid
      grid points only.

    Errors are reported with ``print``; nothing is raised to the caller for
    read/parse/write failures.

    Returns:
        None.
    """
    # --- 0. Resolve the working directory and the grid specification ---
    try:
        script_dir = Path(__file__).parent.resolve()
    except NameError:
        # __file__ does not exist in notebooks / interactive sessions.
        script_dir = Path.cwd()
    xml_file = script_dir / 'O-A0038-003.xml'

    start_lon = 120.00
    start_lat = 21.88
    lon_resolution = 0.03
    lat_resolution = 0.03
    lon_points = 67
    lat_points = 120
    invalid_value = -999.0

    # --- 1. Read and parse the source XML data ---
    try:
        root = ET.parse(xml_file).getroot()
        namespace = {'cwa': 'urn:cwa:gov:tw:cwacommon:0.1'}
        content = root.find('.//cwa:Content', namespace)
        if content is None or not content.text:
            raise ValueError("the <Content> element is missing or empty")
        all_floats = [
            float(val)
            for line in content.text.strip().split('\n')
            for val in line.split(',')
            if val.strip()
        ]
        # Values run along longitude first, then latitude: one row per latitude.
        temp_grid = np.array(all_floats).reshape(lat_points, lon_points)
    except FileNotFoundError:
        print(f"Error: File not found at '{xml_file}'. Please ensure it is in the same directory.")
        return
    except Exception as e:
        print(f"An error occurred while reading or parsing the XML file: {e}")
        return

    # --- 2. Create the latitude and longitude coordinate grid ---
    # Round to the grid's two decimals so binary floating-point noise from
    # start + k * 0.03 does not leak into the exported coordinates.
    longitudes = np.round(start_lon + np.arange(lon_points) * lon_resolution, 2)
    latitudes = np.round(start_lat + np.arange(lat_points) * lat_resolution, 2)

    # --- 3. Generate the classification and regression datasets ---
    classification_data = []
    regression_data = []
    for lat, temp_row in zip(latitudes, temp_grid):
        for lon, temp_value in zip(longitudes, temp_row):
            is_valid = temp_value != invalid_value
            classification_data.append(
                {'longitude': lon, 'latitude': lat, 'label': int(is_valid)}
            )
            if is_valid:
                regression_data.append(
                    {'longitude': lon, 'latitude': lat, 'value': temp_value}
                )

    print("Data conversion complete.")
    print(f"Total entries in classification dataset: {len(classification_data)}.")
    print(f"Total entries in regression dataset: {len(regression_data)}.")

    # --- 4. Write the data into two separate CSV files ---
    # Each entry: (dataset kind, CSV header, rows, prefix of the success message).
    exports = (
        ('classification', ['longitude', 'latitude', 'label'], classification_data, "\n"),
        ('regression', ['longitude', 'latitude', 'value'], regression_data, ""),
    )
    for kind, fieldnames, rows, lead in exports:
        csv_path = script_dir / f"{kind}_data.csv"
        try:
            with open(csv_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(rows)
            print(f"{lead}Successfully wrote {kind} data to: '{csv_path}'")
        except Exception as e:
            # A failure on one file must not prevent writing the other.
            print(f"An error occurred while writing the {kind} CSV: {e}")


if __name__ == '__main__':
    convert_and_export_weather_data()

## Classification (NN)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, brier_score_loss
from pathlib import Path

def evaluate_and_plot_nn(model, X_scaled, y_ohe, dataset_name, save_dir, feature_scaler=None):
    """
    Evaluate a trained classifier on one data split and save two scatter plots.

    Prints the confusion matrix, accuracy, precision, recall and Brier score,
    then writes a "prediction correctness" plot and a "prediction confidence"
    plot as PNG files into ``save_dir``.

    Args:
        model: Object whose ``predict(X_scaled)`` returns per-class scores;
            column 1 is used as the probability of label 1.
        X_scaled: Scaled features; after inverse scaling, column 0 is plotted
            as longitude and column 1 as latitude.
        y_ohe: One-hot encoded true labels (decoded with ``argmax``).
        dataset_name: Split name used in messages, plot titles and file names.
        save_dir: Directory for the PNG files; created if it does not exist.
        feature_scaler: Fitted scaler providing ``inverse_transform``. When
            omitted, the module-level global ``scaler`` is used, which keeps
            existing five-argument calls working.

    Returns:
        None.
    """
    if feature_scaler is None:
        # Backward-compatible fallback to the global scaler set up by the caller.
        feature_scaler = scaler

    # Make sure the output directory exists so savefig does not fail at the end.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    print(f"--- Evaluating on {dataset_name} Set ---")

    # Predictions
    y_prob = model.predict(X_scaled)
    y_pred = np.argmax(y_prob, axis=1)
    y_true = np.argmax(y_ohe, axis=1)

    # Metrics
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    brier = brier_score_loss(y_true, y_prob[:, 1])

    print("Confusion Matrix:")
    print(cm)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Confidence MSE (Brier Score): {brier:.4f}\n")

    # Plot in the original coordinate units rather than the scaled ones.
    X_unscaled = feature_scaler.inverse_transform(X_scaled)

    # Correctness plot: green = correctly classified, red = misclassified.
    plt.figure(figsize=(10, 8))
    title1 = f'NN_Prediction_Correctness_{dataset_name}_Set'
    correct_predictions = (y_true == y_pred)
    plt.scatter(X_unscaled[correct_predictions, 0], X_unscaled[correct_predictions, 1],
                c='green', label='Correct', alpha=0.6, s=10)
    plt.scatter(X_unscaled[~correct_predictions, 0], X_unscaled[~correct_predictions, 1],
                c='red', label='Incorrect', alpha=0.6, s=10)
    plt.title(f'NN Prediction Correctness ({dataset_name} Set)')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()
    plt.grid(True)
    save_path1 = save_dir / f"{title1}.png"
    plt.savefig(save_path1, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Plot saved to: {save_path1}")

    # Confidence plot: colour encodes the predicted probability of label 1.
    plt.figure(figsize=(10, 8))
    title2 = f'NN_Prediction_Confidence_{dataset_name}_Set'
    scatter = plt.scatter(X_unscaled[:, 0], X_unscaled[:, 1], c=y_prob[:, 1],
                          cmap='coolwarm', vmin=0, vmax=1, s=10)
    plt.colorbar(scatter, label='Probability of being Valid (Label=1)')
    plt.title(f'NN Prediction Confidence ({dataset_name} Set)')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True)
    save_path2 = save_dir / f"{title2}.png"
    plt.savefig(save_path2, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Plot saved to: {save_path2}\n")

def main():
    """
    Trains and evaluates the neural-network classifier on 'classification_data.csv'.

    Steps: seed the Python, NumPy and TensorFlow random generators, load the
    CSV, one-hot encode the label, split the rows 70/20/10 into
    train/validation/test sets, standardize the two coordinate features, fit a
    small softmax network with early stopping, then call evaluate_and_plot_nn
    on each of the three splits.

    The CSV is read from (and the plots are written to) the directory of this
    script. When `__file__` is not defined, e.g. when the code runs as a
    notebook cell, the current working directory is used instead.

    Returns:
        None. Returns early, after printing an error, if the CSV is missing.
    """
    seed_value = 42
    import os
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences child processes, not the hashing of this process.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    import random
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    # `__file__` does not exist inside a notebook / interactive session;
    # fall back to the working directory instead of crashing with NameError.
    try:
        script_dir = Path(__file__).parent.resolve()
    except NameError:
        script_dir = Path.cwd()
    data_file_path = script_dir / 'classification_data.csv'
    
    try:
        data = pd.read_csv(data_file_path)
    except FileNotFoundError:
        print(f"Error: '{data_file_path}' not found.")
        return

    X = data[['longitude', 'latitude']]
    y = data['label']

    # 1. One-hot Encode (two classes: 0 = invalid, 1 = valid)
    y_ohe = to_categorical(y, num_classes=2)

    # 2. Data Splitting
    # First hold out 30% of the rows, then give one third of that hold-out to
    # the test set: 70% train / 20% validation / 10% test overall.
    X_train, X_temp, y_train_ohe, y_temp_ohe = train_test_split(
        X, y_ohe, test_size=0.3, random_state=42, stratify=y_ohe)
    X_val, X_test, y_val_ohe, y_test_ohe = train_test_split(
        X_temp, y_temp_ohe, test_size=(1/3), random_state=42, stratify=y_temp_ohe)

    # 3. Data Scaling
    # The scaler is published as a module-level global because
    # evaluate_and_plot_nn reads it to map features back to raw coordinates.
    global scaler 
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)  # fit on training data only
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # 4. Model
    model = Sequential([
        Dense(32, activation='relu', input_shape=(2,)), # Input => Hidden Layer 1 
        Dense(32, activation='relu'),                   # Hidden Layer 1 => Hidden Layer 2
        Dense(2, activation='softmax')                  # Output Layer (Softmax)
    ])

    # 5. Compile
    # Alternative losses that can be swapped in for comparison:
    #   'mean_squared_error' (Euclidean distance) or 'cosine_similarity'.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.summary()

    # 6. Train with Early Stopping
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen so far.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    print("\nTraining the Neural Network model...")
    model.fit(X_train_scaled, y_train_ohe,
              epochs=100,
              batch_size=32,
              validation_data=(X_val_scaled, y_val_ohe),
              callbacks=[early_stopping],
              verbose=2)
    print("Model training complete.\n")

    # 7. Evaluate and Plot
    evaluate_and_plot_nn(model, X_train_scaled, y_train_ohe, 'Training', script_dir)
    evaluate_and_plot_nn(model, X_val_scaled, y_val_ohe, 'Validation', script_dir)
    evaluate_and_plot_nn(model, X_test_scaled, y_test_ohe, 'Test', script_dir)

if __name__ == '__main__':
    main()

## Regression (NN)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, max_error
from pathlib import Path

def evaluate_and_plot_regression_nn(model, X_scaled, y, dataset_name, save_dir):
    """
    Evaluates the regression neural network and saves the resulting plots.

    Prints MSE, RMSE, MAE and the maximum error of the model on the given
    split, then writes three PNG scatter maps into `save_dir`: the actual
    temperatures, the predicted temperatures and the signed prediction error
    (prediction minus actual).

    Args:
        model: Fitted model; `model.predict(X_scaled)` is flattened to one
            prediction per sample.
        X_scaled: Scaled feature matrix. Column 0 is plotted as longitude and
            column 1 as latitude after undoing the scaling.
        y: True target values, one per row of `X_scaled` (array-like).
        dataset_name: Name of the split; used in the printed header, the plot
            titles and the output file names.
        save_dir: Directory path object (must support the `/` operator) that
            receives the PNG files.

    Note:
        Reads the module-level `scaler`, which must have been fitted (by
        `main`) before this function is called.
    """
    print(f"--- Evaluating on {dataset_name} Set ---")

    # Work on flat NumPy arrays so that arithmetic between targets and
    # predictions is purely positional and can never broadcast to (n, n).
    y_true = np.asarray(y).ravel()
    y_pred = model.predict(X_scaled).flatten() # Flatten to make it a 1D array

    # Metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    max_err = max_error(y_true, y_pred)
    
    print(f"Temperature MSE: {mse:.4f}")
    print(f"Temperature RMSE: {rmse:.4f} (°C)")
    print(f"Temperature MAE: {mae:.4f} (°C)")
    print(f"Max Temperature Error: {max_err:.4f} (°C)\n")

    # For plotting, unscale the features
    X_unscaled = scaler.inverse_transform(X_scaled)
    
    # Shared colour range so the actual and predicted maps are comparable.
    vmin = min(y_true.min(), y_pred.min())
    vmax = max(y_true.max(), y_pred.max())
    
    # Plot 1: Actual Temperature Distribution
    save_path1 = save_dir / f"NN_Actual_Temperature_{dataset_name}_Set.png"
    _save_temperature_scatter(
        X_unscaled, y_true, 'viridis', vmin, vmax,
        'Actual Temperature (°C)',
        f'NN Actual Temperature Distribution ({dataset_name} Set)',
        save_path1)
    print(f"Plot saved to: {save_path1}")

    # Plot 2: Predicted Temperature Distribution
    save_path2 = save_dir / f"NN_Predicted_Temperature_{dataset_name}_Set.png"
    _save_temperature_scatter(
        X_unscaled, y_pred, 'viridis', vmin, vmax,
        'Predicted Temperature (°C)',
        f'NN Predicted Temperature Distribution ({dataset_name} Set)',
        save_path2)
    print(f"Plot saved to: {save_path2}")

    # Plot 3: Temperature Prediction Error
    # Symmetric colour limits keep zero error at the centre of the colormap.
    errors = y_pred - y_true
    error_max_abs = np.abs(errors).max()
    save_path3 = save_dir / f"NN_Temperature_Error_{dataset_name}_Set.png"
    _save_temperature_scatter(
        X_unscaled, errors, 'coolwarm', -error_max_abs, error_max_abs,
        'Prediction Error (°C)',
        f'NN Temperature Prediction Error ({dataset_name} Set)',
        save_path3)
    print(f"Plot saved to: {save_path3}\n")


def _save_temperature_scatter(coords, values, cmap, vmin, vmax,
                              colorbar_label, title, save_path):
    """Draws one longitude/latitude scatter map coloured by `values` and saves it to `save_path`."""
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(coords[:, 0], coords[:, 1], c=values, cmap=cmap,
                          vmin=vmin, vmax=vmax, s=15)
    plt.colorbar(scatter, label=colorbar_label)
    plt.title(title)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()  # release the figure so repeated calls do not accumulate memory


def main():
    """
    Trains and evaluates the neural-network regressor on 'regression_data.csv'.

    Steps: seed the Python, NumPy and TensorFlow random generators, load the
    CSV, split the rows 70/20/10 into train/validation/test sets, standardize
    the two coordinate features, fit a small fully connected network with
    early stopping, then call evaluate_and_plot_regression_nn on each of the
    three splits.

    The CSV is read from (and the plots are written to) the directory of this
    script. When `__file__` is not defined, e.g. when the code runs as a
    notebook cell, the current working directory is used instead.

    Returns:
        None. Returns early, after printing an error, if the CSV is missing.
    """
    seed_value = 42
    import os
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences child processes, not the hashing of this process.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    import random
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    # `__file__` does not exist inside a notebook / interactive session;
    # fall back to the working directory instead of crashing with NameError.
    try:
        script_dir = Path(__file__).parent.resolve()
    except NameError:
        script_dir = Path.cwd()
    data_file_path = script_dir / 'regression_data.csv'
    
    try:
        data = pd.read_csv(data_file_path)
    except FileNotFoundError:
        print(f"Error: '{data_file_path}' not found.")
        return

    X = data[['longitude', 'latitude']]
    y = data['value']

    # 1. Data Splitting
    # First hold out 30% of the rows, then give one third of that hold-out to
    # the test set: 70% train / 20% validation / 10% test overall.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=(1/3), random_state=42)

    # 2. Data Scaling
    # The scaler is published as a module-level global because
    # evaluate_and_plot_regression_nn reads it to recover raw coordinates.
    global scaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)  # fit on training data only
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # 3. Model Architecture
    model = Sequential([
        Dense(64, activation='relu', input_shape=(2,)), # Input => Hidden Layer 1
        Dense(64, activation='relu'),                   # Hidden Layer 1 => Hidden Layer 2
        Dense(1)                                        # Output Layer (1 neuron, linear activation)
    ])

    # 4. Compile Model
    model.compile(optimizer='adam',
                  loss='mean_squared_error', # Standard loss for regression
                  metrics=['mae', 'mse']) # Track Mean Absolute Error and Mean Squared Error
    
    model.summary()

    # 5. Train with Early Stopping
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen so far.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    print("\nTraining the Neural Network model for regression...")
    model.fit(X_train_scaled, y_train,
              epochs=100,
              batch_size=32,
              validation_data=(X_val_scaled, y_val),
              callbacks=[early_stopping],
              verbose=2)
    print("Model training complete.\n")

    # 6. Evaluate and Plot
    evaluate_and_plot_regression_nn(model, X_train_scaled, y_train, 'Training', script_dir)
    evaluate_and_plot_regression_nn(model, X_val_scaled, y_val, 'Validation', script_dir)
    evaluate_and_plot_regression_nn(model, X_test_scaled, y_test, 'Test', script_dir)

if __name__ == '__main__':
    main()