In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Confirm the import cell finished, stamped with the current local time.
loaded_at = datetime.now()
print(f'Libraries loaded successfully at {loaded_at}')

Libraries loaded successfully at 2025-11-28 23:17:04.022101


## Step 1: Load Data

In [16]:
# CSV file read into `df` below.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
file_path = 'd:/S2/prediksi - hujan/merged_all_data_complete.csv'

# Read the file. A failure is reported and then re-raised: if it were only
# printed, `df` would stay undefined (or keep a value from an earlier run of
# this kernel) and every later cell would fail or silently use stale data.
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f'Error loading file: {e}')
    raise
else:
    print('Data loaded successfully')
    print(f'File size: {len(df):,} rows')

Data loaded successfully
File size: 365,400 rows


## Step 2: Basic Information

In [17]:
# Overall dimensions of the frame as a (rows, columns) tuple.
frame_shape = df.shape
print(f'Shape: {frame_shape}')

# dtype of each column as reported by pandas.
print('\nColumn Names and Types:')
column_types = df.dtypes
print(column_types)

# Numbered listing of the column names, counting from 1.
print('\nColumn List:')
for position, column_name in enumerate(df.columns, start=1):
    numbered_entry = f'{position:2d}. {column_name}'
    print(numbered_entry)

Shape: (365400, 18)

Column Names and Types:
valid_time     object
latitude      float64
longitude     float64
tp            float64
ro            float64
number          int64
expver          int64
u10           float64
v10           float64
t2m           float64
swvl1         float64
year            int64
month           int64
day             int64
hour            int64
dayofweek       int64
dayofyear       int64
wind_speed    float64
dtype: object

Column List:
 1. valid_time
 2. latitude
 3. longitude
 4. tp
 5. ro
 6. number
 7. expver
 8. u10
 9. v10
10. t2m
11. swvl1
12. year
13. month
14. day
15. hour
16. dayofweek
17. dayofyear
18. wind_speed


## Step 3: First and Last Rows

In [18]:
# Show both ends of the frame (head()/tail() default to five rows each).
leading_rows = df.head()
trailing_rows = df.tail()
print('First 5 rows:', leading_rows, '\nLast 5 rows:', trailing_rows, sep='\n')

First 5 rows:
            valid_time  latitude  longitude        tp   ro  number  expver  \
0  2020-01-01 00:00:00       6.0      95.00  0.000343  0.0       0       1   
1  2020-01-01 00:00:00       6.0      95.25  0.000315  0.0       0       1   
2  2020-01-01 00:00:00       6.0      95.50  0.000282  0.0       0       1   
3  2020-01-01 00:00:00       6.0      95.75  0.000219  0.0       0       1   
4  2020-01-01 00:00:00       6.0      96.00  0.000190  0.0       0       1   

        u10       v10        t2m         swvl1  year  month  day  hour  \
0 -9.005981 -0.869095  299.17688  3.054738e-07  2020      1    1     0   
1 -9.242310 -0.299759  299.19250  3.054738e-07  2020      1    1     0   
2 -9.186645  0.308640  299.37024  3.054738e-07  2020      1    1     0   
3 -8.957153  0.703171  299.77454  3.054738e-07  2020      1    1     0   
4 -9.144653  0.884811  300.13196  3.054738e-07  2020      1    1     0   

   dayofweek  dayofyear  wind_speed  
0          2          1    9.04781

## Step 4: Missing Values Check

In [19]:
# Null count per column, and the same figure as a percentage of all rows.
missing_count = df.isnull().sum()
missing_pct = (missing_count / len(df) * 100).round(2)

# One row per column: name, absolute null count, percentage of nulls.
missing_df = pd.DataFrame(
    {
        'Column': df.columns,
        'Missing_Count': missing_count.to_numpy(),
        'Missing_Percentage': missing_pct.to_numpy(),
    }
)

print(missing_df.to_string(index=False))

# Grand totals across the whole frame.
total_missing = missing_count.sum()
print(f'\nTotal missing values: {total_missing}')
total_cells = df.size
print(f'Total cells: {total_cells:,}')

    Column  Missing_Count  Missing_Percentage
valid_time              0                 0.0
  latitude              0                 0.0
 longitude              0                 0.0
        tp              0                 0.0
        ro              0                 0.0
    number              0                 0.0
    expver              0                 0.0
       u10              0                 0.0
       v10              0                 0.0
       t2m              0                 0.0
     swvl1              0                 0.0
      year              0                 0.0
     month              0                 0.0
       day              0                 0.0
      hour              0                 0.0
 dayofweek              0                 0.0
 dayofyear              0                 0.0
wind_speed              0                 0.0

Total missing values: 0
Total cells: 6,577,200

Total missing values: 0
Total cells: 6,577,200


## Step 5: Descriptive Statistics

In [20]:
# describe() summary of the frame, rounded to six decimals for display.
summary_stats = df.describe()
print(summary_stats.round(6))

            latitude      longitude             tp             ro    number  \
count  365400.000000  365400.000000  365400.000000  365400.000000  365400.0   
mean        5.500000      95.500000       0.000281       0.000062       0.0   
std         0.353554       0.353554       0.000840       0.000312       0.0   
min         5.000000      95.000000       0.000000       0.000000       0.0   
25%         5.250000      95.250000       0.000000       0.000000       0.0   
50%         5.500000      95.500000       0.000022       0.000000       0.0   
75%         5.750000      95.750000       0.000184       0.000033       0.0   
max         6.000000      96.000000       0.036000       0.025565       0.0   

         expver            u10            v10            t2m          swvl1  \
count  365400.0  365400.000000  365400.000000  365400.000000  365400.000000   
mean        1.0      -0.206522       0.846650     299.801580       0.120828   
std         0.0       2.943038       2.275245      

## Step 6: Data Type Validation

In [21]:
# Report the dtype of valid_time and list the numeric columns before converting.
print(f'valid_time type: {df["valid_time"].dtype}')
print(f'Numeric columns: {df.select_dtypes(include=["number"]).columns.tolist()}')

# Parse valid_time into datetimes. A failure is reported and then re-raised:
# if it were only printed, the column would stay unconverted while later
# cells still compute a "Date range" from it.
try:
    df['valid_time'] = pd.to_datetime(df['valid_time'])
except Exception as e:
    print(f'Error converting valid_time: {e}')
    raise
else:
    print('\nvalid_time converted to datetime successfully')
    print(f'Date range: {df["valid_time"].min()} to {df["valid_time"].max()}')

valid_time type: object
Numeric columns: ['latitude', 'longitude', 'tp', 'ro', 'number', 'expver', 'u10', 'v10', 't2m', 'swvl1', 'year', 'month', 'day', 'hour', 'dayofweek', 'dayofyear', 'wind_speed']

valid_time converted to datetime successfully
Date range: 2020-01-01 00:00:00 to 2024-12-31 21:00:00


In [22]:
# Section banner for the printed report.
print('='*70)
print('DATA TYPE VALIDATION')
print('='*70)

# Report the dtype of valid_time and list the numeric columns.
print(f'valid_time type: {df["valid_time"].dtype}')
print(f'Numeric columns: {df.select_dtypes(include=["number"]).columns.tolist()}')

# NOTE(review): this repeats the valid_time conversion performed by the
# preceding cell; consider keeping only one of the two.
# A failure is reported and then re-raised so the notebook does not carry on
# with an unconverted column.
try:
    df['valid_time'] = pd.to_datetime(df['valid_time'])
except Exception as e:
    print(f'Error converting valid_time: {e}')
    raise
else:
    print('\nvalid_time converted to datetime successfully')
    print(f'Date range: {df["valid_time"].min()} to {df["valid_time"].max()}')

DATA TYPE VALIDATION
valid_time type: datetime64[ns]
Numeric columns: ['latitude', 'longitude', 'tp', 'ro', 'number', 'expver', 'u10', 'v10', 't2m', 'swvl1', 'year', 'month', 'day', 'hour', 'dayofweek', 'dayofyear', 'wind_speed']

valid_time converted to datetime successfully
Date range: 2020-01-01 00:00:00 to 2024-12-31 21:00:00

valid_time converted to datetime successfully
Date range: 2020-01-01 00:00:00 to 2024-12-31 21:00:00


## Step 7: Unique Values Check

In [23]:
# Number of distinct values in each column, one row per column.
distinct_counts = df.nunique()
unique_df = pd.DataFrame(
    {
        'Column': distinct_counts.index,
        'Unique_Count': distinct_counts.to_numpy(),
    }
)

print(unique_df.to_string(index=False))

    Column  Unique_Count
valid_time         14616
  latitude             5
 longitude             5
        tp         10217
        ro          7100
    number             1
    expver             1
       u10        271041
       v10        254166
       t2m        107755
     swvl1        167660
      year             5
     month            12
       day            31
      hour             8
 dayofweek             7
 dayofyear           366
wind_speed        365398


## Step 8: Memory Usage

In [24]:
# Bytes used by the index and by each column (deep=True requested).
bytes_per_column = df.memory_usage(deep=True)
print(bytes_per_column)

# Same figures summed and converted from bytes to MB (1024**2 bytes each).
total_megabytes = bytes_per_column.sum() / 1024**2
print(f'\nTotal memory: {total_megabytes:.2f} MB')

Index             132
valid_time    2923200
latitude      2923200
longitude     2923200
tp            2923200
ro            2923200
number        2923200
expver        2923200
u10           2923200
v10           2923200
t2m           2923200
swvl1         2923200
year          2923200
month         2923200
day           2923200
hour          2923200
dayofweek     2923200
dayofyear     2923200
wind_speed    2923200
dtype: int64

Total memory: 50.18 MB


## Summary

In [26]:
# Headline figures for the loaded frame; each value is computed right
# before it is printed.
row_total, column_total = df.shape
print(f'Rows: {row_total:,}')
print(f'Columns: {column_total}')

null_total = df.isnull().sum().sum()
print(f'Missing values: {null_total}')

timestamps = df["valid_time"]
print(f'Date range: {timestamps.min()} to {timestamps.max()}')

footprint_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f'Memory: {footprint_mb:.2f} MB')


Rows: 365,400
Columns: 18
Missing values: 0
Date range: 2020-01-01 00:00:00 to 2024-12-31 21:00:00
Memory: 50.18 MB
