In [4]:
import pandas as pd

# Clean the raw USA rain-prediction CSV and write a tidied copy to disk.
# `df` and `numerics` are deliberately left as module-level names.

# 1. Load dataset
df = pd.read_csv("usa_rain_prediction_dataset_2024_2025.csv")

# 2. Standardize column names: trim surrounding whitespace, lower-case,
#    and turn spaces into underscores. regex=False makes the space a plain
#    literal match regardless of the installed pandas version's default.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

# 3. Basic overview. This is printed BEFORE duplicates are dropped, so the
#    shape shown here describes the raw file, not necessarily the saved one.
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print("Missing values:\n", df.isnull().sum())

# 4. Data type conversion: parse the 'date' column into datetime values.
df['date'] = pd.to_datetime(df['date'])

# 5. Drop fully duplicated rows, if any (modifies df in place).
df.drop_duplicates(inplace=True)

# 6. Summary statistics as a quick outlier check (optional: could add
#    visual EDA). 'number' selects every numeric dtype, so columns stored
#    as e.g. int32/float32 are not silently left out of the report.
numerics = df.select_dtypes(include='number').columns
print("\nBasic stats:\n", df[numerics].describe())

# 7. Save cleaned file (index=False: do not write the row index as a column)
df.to_csv("cleaned_usa_rainfall.csv", index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_usa_rainfall.csv'")


Columns: ['date', 'location', 'temperature', 'humidity', 'wind_speed', 'precipitation', 'cloud_cover', 'pressure', 'rain_tomorrow']
Shape: (73100, 9)
Missing values:
 date             0
location         0
temperature      0
humidity         0
wind_speed       0
precipitation    0
cloud_cover      0
pressure         0
rain_tomorrow    0
dtype: int64

Basic stats:
         temperature      humidity    wind_speed  precipitation   cloud_cover  \
count  73100.000000  73100.000000  73100.000000   73100.000000  73100.000000   
mean      65.182270     59.875041     15.017946       0.390635     54.942807   
std       20.205793     23.066115      8.668729       0.474833     25.982487   
min       30.000766     20.000272      0.000712       0.000000     10.000856   
25%       47.678968     39.800732      7.485182       0.000000     32.318668   
50%       65.294949     59.887840     15.102495       0.196909     55.011121   
75%       82.636570     79.835990     22.551794       0.673177     77.4124