In [1]:
import pandas as pd
from scipy.spatial import cKDTree

# Read the input CSV into a DataFrame
file_path = r"C:\Users\Dell\Desktop\data_1997_1998.csv"
df = pd.read_csv(file_path)

# Assemble one datetime 'date' column from the separate year/month/day parts
df['date'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']])

# Keep only the date, the coordinates and the four measured variables;
# .copy() makes df_subset independent of df so later in-place fills
# do not touch the originally loaded frame
columns = [
    'date',
    'latitude',
    'longitude',
    'zon.winds',
    'mer.winds',
    'humidity',
    'air temp.',
]
df_subset = df.loc[:, columns].copy()

def fill_na_with_nearest(df, target_column):
    """Fill NaNs in ``target_column`` from the nearest same-date neighbour, in place.

    For every distinct value in ``df['date']``, each row whose
    ``target_column`` is missing receives the value of the row on that same
    date whose (latitude, longitude) point is closest and whose
    ``target_column`` is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'date'``, ``'latitude'``, ``'longitude'``
        and ``target_column``. The frame is modified in place.
    target_column : str
        Name of the column whose missing values are to be filled.

    Returns
    -------
    None

    Notes
    -----
    * Only values that were present when the function was called are used as
      sources; a value filled during this call is never propagated further.
    * Missing values on a date that has no observed value at all are left
      untouched.
    * Distance is the KD-tree default (Euclidean) computed directly on the
      latitude/longitude numbers, so it is neither a great-circle distance
      nor aware of longitude wrap-around.
    * Rows are addressed by integer position rather than by index label, so
      the result is correct even when ``df`` has a non-unique or non-default
      index.
    """
    is_missing = df[target_column].isna().to_numpy()
    if not is_missing.any():
        return

    target_pos = df.columns.get_loc(target_column)
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)
    # Snapshot taken before any write: only originally observed entries are
    # ever read from it, so later fills cannot leak into other rows.
    observed = df[target_column].to_numpy()

    # GroupBy.indices maps each date to the integer positions of its rows,
    # which avoids rescanning the whole frame once per date. Rows whose date
    # is missing are not grouped and are therefore never filled.
    for positions in df.groupby('date', sort=False).indices.values():
        missing_here = is_missing[positions]
        na_pos = positions[missing_here]
        known_pos = positions[~missing_here]
        if na_pos.size == 0 or known_pos.size == 0:
            continue

        kdtree = cKDTree(coords[known_pos])
        # One batched query for all missing points on this date
        _, nn_idx = kdtree.query(coords[na_pos])
        df.iloc[na_pos, target_pos] = observed[known_pos[nn_idx]]

# Run the nearest-neighbour fill once per measured variable
# (fill_na_with_nearest writes into df_subset directly and returns nothing)
for column in ('zon.winds', 'mer.winds', 'humidity', 'air temp.'):
    fill_na_with_nearest(df_subset, column)

# Write the filled table to a new CSV file, leaving the index out
output_path = r"C:\Users\Dell\Desktop\python_project\data_1997_1998_filled.csv"
df_subset.to_csv(path_or_buf=output_path, index=False)

print("NaN values filled and new file saved.")


NaN values filled and new file saved.


In [2]:
# Reload the saved file and count the missing values remaining in each column
n = pd.read_csv(filepath_or_buffer=r"C:\Users\Dell\Desktop\python_project\data_1997_1998_filled.csv")
n.isna().sum()

date         0
latitude     0
longitude    0
zon.winds    0
mer.winds    0
humidity     0
air temp.    0
dtype: int64