In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np


In [2]:
# Step 2: Loading and Preprocessing the Dataset

data_url = 'https://raw.githubusercontent.com/dsrscientist/dataset3/main/weatherAUS.csv'
df = pd.read_csv(data_url)
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8420,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
8421,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
8422,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
8423,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [3]:
 #Drop unnecessary columns
df = df.drop(['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], axis=1)


In [4]:
# Drop rows with missing values
df = df.dropna()

# Encode categorical variables
encoder = LabelEncoder()
df['RainToday'] = encoder.fit_transform(df['RainToday'])
df['RainTomorrow'] = encoder.fit_transform(df['RainTomorrow'])


In [5]:
# Convert columns to float data type
numeric_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed',
                'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
                'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
df[numeric_cols] = df[numeric_cols].astype(float)


In [6]:
df[numeric_cols]=df[numeric_cols].fillna(df[numeric_cols].mean())

In [7]:
# Handling missing values in categorical columns
categorical_cols = ['RainToday', 'RainTomorrow']
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])


In [8]:
# Encode categorical variables
encoder = LabelEncoder()
df['RainToday'] = encoder.fit_transform(df['RainToday'])
df['RainTomorrow'] = encoder.fit_transform(df['RainTomorrow'])


In [9]:
# Step 3: Splitting the Data into Training and Testing Sets

X = df.drop('RainTomorrow', axis=1)
y = df['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Step 4: Training a Classification Model for Rain Tomorrow

classification_model = RandomForestClassifier()
classification_model.fit(X_train, y_train)

y_pred = classification_model.predict(X_test)

classification_accuracy = accuracy_score(y_test, y_pred)


In [11]:
# Step 5: Training a Regression Model for Rainfall Amount

X_rainfall = df.drop('Rainfall', axis=1)
y_rainfall = df['Rainfall']

X_rainfall_train, X_rainfall_test, y_rainfall_train, y_rainfall_test = train_test_split(X_rainfall, y_rainfall, test_size=0.2, random_state=42)

regression_model = LinearRegression()
regression_model.fit(X_rainfall_train, y_rainfall_train)

y_rainfall_pred = regression_model.predict(X_rainfall_test)

regression_error = mean_squared_error(y_rainfall_test, y_rainfall_pred)


In [12]:
# Step 6: Evaluating the Models

print("Classification Accuracy:", classification_accuracy)
print("Regression Error:", regression_error)


Classification Accuracy: 0.890038809831824
Regression Error: 55.329135739725785


In [13]:
# Step 7: Making Predictions

# You can use the trained models to make predictions on new/unseen data.
# For example, to predict whether it will rain tomorrow:

new_data = pd.DataFrame({
    'MinTemp': [10.0],
    'MaxTemp': [25.0],
    'Rainfall': [5.0],
    'Evaporation': [7.0],
    'Sunshine': [8.0],
    'WindGustSpeed': [40],
    'WindSpeed9am': [20],
    'WindSpeed3pm': [25],
    'Humidity9am': [60],
    'Humidity3pm': [40],
    'Pressure9am': [1015.0],
    'Pressure3pm': [1012.0],
    'Cloud9am': [4.0],
    'Cloud3pm': [5.0],
    'Temp9am': [15.0],
    'Temp3pm': [20.0],
    'RainToday': [1]
})



In [14]:
new_data['RainToday'] = encoder.transform(new_data['RainToday'])

rain_prediction = classification_model.predict(new_data)
print("Rain Tomorrow Prediction:", rain_prediction)


Rain Tomorrow Prediction: [0]


In [15]:
# To predict the amount of rainfall:

rainfall_prediction = regression_model.predict(new_data)
print("Rainfall Amount Prediction:", rainfall_prediction)


Rainfall Amount Prediction: [262.1232196]


Feature names unseen at fit time:
- Rainfall
Feature names seen at fit time, yet now missing:
- RainTomorrow

