In [1]:
import pandas as pd
from scipy.spatial import cKDTree

# Read the raw training data from disk
file_path = r"C:\Users\Dell\Downloads\train (1).csv"
df = pd.read_csv(file_path)

# Assemble one datetime 'date' column out of the year/month/day parts
df['date'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']])

# Work on an independent copy holding only the date, the coordinates and
# the columns that are going to be filled
columns = [
    'date', 'latitude', 'longitude',
    'zon.winds', 'mer.winds', 'humidity', 'air temp.',
]
df_subset = df.loc[:, columns].copy()

# Function to fill NaN values with the nearest neighbor's value on the same date
def fill_na_with_nearest(df, target_column):
    """Fill NaNs in ``target_column`` from the nearest same-date row, in place.

    Within each distinct value of ``df['date']``, rows that hold a value in
    ``target_column`` act as donors; every NaN row receives the value of the
    donor closest to it in (latitude, longitude) space, using the Euclidean
    distance computed by ``cKDTree``. Dates that have no donor row or no NaN
    row are left untouched, so NaNs may remain after the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'latitude', 'longitude' and
        ``target_column``. Modified in place.
        NOTE(review): coordinates are assumed to be finite numbers; they are
        handed to ``cKDTree`` unchecked.
    target_column : str
        Name of the column whose NaN values should be filled.

    Returns
    -------
    None
    """
    is_missing = df[target_column].isna().to_numpy()
    if not is_missing.any():
        return

    # Snapshot coordinates and values once; donors are always taken from this
    # snapshot, never from values written during the loop below.
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)
    observed = df[target_column].to_numpy(copy=True)
    col_pos = df.columns.get_loc(target_column)

    # ``indices`` maps each date to the positional row numbers carrying it, so
    # the frame is partitioned once instead of being masked once per date.
    # groupby skips rows whose date is missing (its default ``dropna=True``).
    for positions in df.groupby('date').indices.values():
        group_missing = is_missing[positions]
        na_pos = positions[group_missing]
        donor_pos = positions[~group_missing]
        if na_pos.size == 0 or donor_pos.size == 0:
            continue

        # One tree per date, queried for all NaN rows of that date at once.
        tree = cKDTree(coords[donor_pos])
        _, nearest = tree.query(coords[na_pos])

        # Positional write: does not depend on the index labels being unique.
        df.iloc[na_pos, col_pos] = observed[donor_pos[nearest]]

# Impute each column in turn; the helper edits df_subset in place
for column in ('zon.winds', 'mer.winds', 'humidity', 'air temp.'):
    fill_na_with_nearest(df_subset, column)

# Write the result to a new CSV file, leaving the row index out
output_path = r"C:\Users\Dell\Desktop\python_project\train_fill.csv"
df_subset.to_csv(output_path, index=False)

print("NaN values filled and new file saved.")

NaN values filled and new file saved.


In [2]:
# Reload the saved file and count the NaN values left in each column
data = pd.read_csv(r"C:\Users\Dell\Desktop\python_project\train_fill.csv")
data.isna().sum()

date             0
latitude         0
longitude        0
zon.winds       77
mer.winds       77
humidity     15971
air temp.       33
dtype: int64

In [1]:
import pandas as pd
from scipy.spatial import cKDTree

# Read back the CSV produced by the first fill pass
file_path = r"C:\Users\Dell\Desktop\python_project\train_fill.csv"
data = pd.read_csv(file_path)

# Parse the 'date' column into datetimes
data['date'] = pd.to_datetime(data.loc[:, 'date'])

# Columns that are passed to the fill functions below
nan_cols = ['zon.winds', 'mer.winds', 'humidity', 'air temp.']

# Function to fill NaN values using nearest neighbor method
def fill_na_using_nearest(df, target_cols):
    """Fill NaNs in each of ``target_cols`` from the nearest same-date row.

    For every column and every distinct ``df['date']``, rows holding an
    observed value act as donors and each NaN row receives the value of the
    donor closest to it in (latitude, longitude) space, using the Euclidean
    distance computed by ``cKDTree``. Dates with no donor for a column are
    left untouched, so NaNs may remain after the call.

    Donors are taken from a snapshot of the column made before any value is
    written. Previously the donor set was recomputed from the frame while it
    was being filled, so already-imputed rows were used as donors and the
    result depended on row order instead of on the nearest observed value.
    A KD-tree is now built once per (column, date), not once per NaN row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'latitude', 'longitude' and every
        name in ``target_cols``. Modified in place.
        NOTE(review): coordinates are assumed to be finite numbers; they are
        handed to ``cKDTree`` unchecked.
    target_cols : iterable of str
        Names of the columns whose NaN values should be filled.

    Returns
    -------
    None
    """
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)

    # Positional row numbers for each date, computed once and shared by all
    # columns. groupby skips rows whose date is missing (``dropna=True``).
    date_groups = list(df.groupby('date').indices.values())

    for col in target_cols:
        is_missing = df[col].isna().to_numpy()
        if not is_missing.any():
            continue

        # Explicit copy: the donor values must not change while df is edited.
        observed = df[col].to_numpy(copy=True)
        col_pos = df.columns.get_loc(col)

        for positions in date_groups:
            group_missing = is_missing[positions]
            na_pos = positions[group_missing]
            donor_pos = positions[~group_missing]
            if na_pos.size == 0 or donor_pos.size == 0:
                continue

            # One tree per date, queried for all of its NaN rows at once.
            tree = cKDTree(coords[donor_pos])
            _, nearest = tree.query(coords[na_pos])

            # Positional write: independent of index-label uniqueness.
            df.iloc[na_pos, col_pos] = observed[donor_pos[nearest]]

# First pass: fill each NaN from rows sharing the same date
fill_na_using_nearest(df=data, target_cols=nan_cols)

# Function to fill NaN values using nearest neighbor method regardless of date
def fill_na_using_nearest_any_date(df, target_cols):
    """Fill NaNs in each of ``target_cols`` from the nearest row of any date.

    For every column, rows holding an observed value act as donors and each
    NaN row receives the value of the donor closest to it in
    (latitude, longitude) space, using the Euclidean distance computed by
    ``cKDTree``; the 'date' column is ignored. A column with no observed
    value at all is left untouched.

    Donors are taken from a snapshot of the column made before any value is
    written. Previously the donor set was recomputed from the frame while it
    was being filled, so already-imputed rows were used as donors and the
    result depended on row order. A KD-tree is now built once per column,
    not once per NaN row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'latitude', 'longitude' and every name in
        ``target_cols``. Modified in place.
        NOTE(review): coordinates are assumed to be finite numbers; they are
        handed to ``cKDTree`` unchecked.
    target_cols : iterable of str
        Names of the columns whose NaN values should be filled.

    Returns
    -------
    None
    """
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)

    for col in target_cols:
        is_missing = df[col].isna().to_numpy()
        has_value = ~is_missing
        # Nothing to fill, or nothing to fill from.
        if not is_missing.any() or not has_value.any():
            continue

        # Explicit copy: the donor values must not change while df is edited.
        donor_values = df[col].to_numpy(copy=True)[has_value]

        # A single tree over all observed rows serves every NaN row.
        tree = cKDTree(coords[has_value])
        _, nearest = tree.query(coords[is_missing])

        # A boolean ndarray mask selects rows by position, so the write does
        # not depend on the index labels being unique.
        df.loc[is_missing, col] = donor_values[nearest]

# Second pass: fill what is left from the nearest row regardless of date
fill_na_using_nearest_any_date(df=data, target_cols=nan_cols)

# Last resort: linear interpolation down the rows for anything still missing
data[nan_cols] = data[nan_cols].interpolate(method='linear', axis=0)

# Per-column count of NaN values remaining after all passes
nan_counts = data.isna().sum()

# Write the fully processed frame to a new CSV file
output_path = r"C:\Users\Dell\Desktop\train_fill_new.csv"
data.to_csv(output_path, index=False)

(nan_counts, output_path)


(date         0
 latitude     0
 longitude    0
 zon.winds    0
 mer.winds    0
 humidity     0
 air temp.    0
 dtype: int64,
 'C:\\Users\\Dell\\Desktop\\train_fill_new.csv')