In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
data = pd.read_csv(r"C:\Users\91984\OneDrive\Desktop\weather_data.csv")

In [5]:
print("Initial data shape:", data.shape)
print("Columns:", data.columns)
print(data.head())

Initial data shape: (5, 1)
Columns: Index(['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow'], dtype='object')
  Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0  2025-05-01,Sydney,14.0,22.0,0.0,4.0,9.0,N,35,N...                                                                                                                                                                                            
1  2025-05-02,Sydney,13.5,23.1,0.2,3.8,8.5,NE,40,...                                                                                                                                                                    

In [8]:
if 'RainTomorrow' not in data.columns:
    print("Available columns:", data.columns)
    print("Column 'RainTomorrow' is missing! Please check dataset.")
else:
    print("All good! 'RainTomorrow' is present.")

Available columns: Index(['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow'], dtype='object')
Column 'RainTomorrow' is missing! Please check dataset.


In [12]:
# Print available columns to debug
print("Available columns:", data.columns)

# Check if 'RainTomorrow' column exists
if 'RainTomorrow' in data.columns:
    # Convert target to binary (if column exists)
    data['RainTomorrow'] = data['RainTomorrow'].map({'Yes': 1, 'No': 0})

    # Drop rows with missing values in the target column
    data = data.dropna(subset=['RainTomorrow'])
else:
    print("Column 'RainTomorrow' is missing. Please check the dataset.")

Available columns: Index(['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow'], dtype='object')
Column 'RainTomorrow' is missing. Please check the dataset.


In [13]:
data = data.dropna()


In [44]:
# Print available columns for debugging
print("Available columns:", data.columns)

# Check if 'RainTomorrow' exists
if 'RainTomorrow' in data.columns:
    X = data.drop('RainTomorrow', axis=1)  # Drop target column from features
    y = data['RainTomorrow']  # Define target variable
else:
    print("Column 'RainTomorrow' is missing! Please check your dataset.")
    # Optional: Create the column if missing (assuming 'Rainfall' data exists)
    # data['RainTomorrow'] = np.nan  # Placeholder for missing values



Available columns: Index(['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow'], dtype='object')
Column 'RainTomorrow' is missing! Please check your dataset.


In [22]:

# Print all available columns in the dataset
print("Available columns in the dataset:", list(data.columns))

# Check if 'RainTomorrow' exists directly
if 'RainTomorrow' in data.columns:
    X = data.drop('RainTomorrow', axis=1)
    y = data['RainTomorrow']
else:
    # Try to identify similar column names
    potential_matches = [col for col in data.columns if 'rain' in col.lower()]

    if potential_matches:
        print(f"Possible matching column(s): {potential_matches}")
        # You may need to manually rename or adjust the column name
    else:
        raise ValueError("The dataset does not contain 'RainTomorrow'. Check your dataset for missing or misnamed columns.")

Available columns in the dataset: ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']
Possible matching column(s): ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']


In [30]:
# Print available columns
print("Available columns:", list(data.columns))

# Attempt to find a column similar to 'RainTomorrow'
expected_column = 'RainTomorrow'
matching_columns = [col for col in data.columns if 'rain' in col.lower()]

if matching_columns:
    print(f"Possible matching column(s): {matching_columns}")
    correct_column_name = matching_columns[0]  # Choose the closest match
    X = data.drop(correct_column_name, axis=1)  # Define features
    y = data[correct_column_name]  # Define target variable
else:
    raise ValueError("The dataset does not contain 'RainTomorrow'. Please check the column names.")

Available columns: ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']
Possible matching column(s): ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']


In [34]:
# Print available columns
print("Available columns:", list(data.columns))

# Attempt to find a column similar to 'RainTomorrow'
expected_column = 'RainTomorrow'
matching_columns = [col for col in data.columns if 'rain' in col.lower()]

if matching_columns:
    correct_column_name = matching_columns[0]  # Choose the closest match
    print(f"Using column '{correct_column_name}' as 'RainTomorrow'")
    X = data.drop(correct_column_name, axis=1)  # Define features
    y = data[correct_column_name]  # Define target variable
else:
    raise ValueError("The dataset does not contain 'RainTomorrow'. Please check the column names.")

Available columns: ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']
Using column 'Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow' as 'RainTomorrow'


In [39]:
# Print available columns
print("Available columns:", list(data.columns))

# Attempt to find a column similar to 'RainTomorrow'
expected_column = 'RainTomorrow'
matching_columns = [col for col in data.columns if 'rain' in col.lower()]

if matching_columns:
    correct_column_name = matching_columns[0]  # Choose the closest match
    print(f"Using column '{correct_column_name}' as 'RainTomorrow'")
    X = data.drop(correct_column_name, axis=1)  # Define features
    y = data[correct_column_name]  # Define target variable
else:
    raise ValueError("The dataset does not contain 'RainTomorrow'. Please check the column names.")

Available columns: ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']
Using column 'Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow' as 'RainTomorrow'


In [43]:
import matplotlib.pyplot as plt

# Check available columns
print("Available columns:", list(data.columns))

# Find a column similar to 'RainTomorrow'
expected_column = 'RainTomorrow'
matching_columns = [col for col in data.columns if expected_column.lower() in col.lower()]

if matching_columns:
    correct_column_name = matching_columns[0]  # Choose the closest match
    print(f"Using column '{correct_column_name}' as 'RainTomorrow'")
    X = data.drop(correct_column_name, axis=1)  # Features
    y = data[correct_column_name]  # Target variable
else:
    raise ValueError("The dataset does not contain 'RainTomorrow'. Please check the column names.")

# Select numerical feature(s) for plotting
numerical_features = X.select_dtypes(include=['number']).columns

if len(numerical_features) > 0:
    # Plot first numerical feature against the target variable
    plt.figure(figsize=(6, 4))
    plt.scatter(X[numerical_features[0]], y, alpha=0.5)
    plt.xlabel(numerical_features[0])
    plt.ylabel('RainTomorrow')
    plt.title(f'Scatter Plot of {numerical_features[0]} vs RainTomorrow')
    plt.show()
else:
    print("No numerical features available for plotting.")

Available columns: ['Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow']
Using column 'Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow' as 'RainTomorrow'
No numerical features available for plotting.
