In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor

In [2]:
# Load the dataset
df = pd.read_csv('/content/weatherAUS.csv')

# Drop rows with missing values
df = df.dropna()


In [3]:
# Convert categorical columns to numerical using Label Encoding
le = LabelEncoder()
df['Location'] = le.fit_transform(df['Location'])
df['WindGustDir'] = le.fit_transform(df['WindGustDir'])
df['WindDir9am'] = le.fit_transform(df['WindDir9am'])
df['WindDir3pm'] = le.fit_transform(df['WindDir3pm'])
df['RainToday'] = le.fit_transform(df['RainToday'])
df['RainTomorrow'] = le.fit_transform(df['RainTomorrow'])


In [4]:
# Features and target for classification (Predict RainTomorrow)
X_class = df.drop(['RainTomorrow', 'Date'], axis=1)
y_class = df['RainTomorrow']

# Split the data into training and test sets for classification
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.3, random_state=42)


In [5]:
# Features and target for regression (Predict Rainfall)
X_reg = df.drop(['Rainfall', 'Date'], axis=1)
y_reg = df['Rainfall']

# Split the data into training and test sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)


# Logistic Regression for classification

In [6]:
# Logistic Regression for classification
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_class, y_train_class)
y_pred_class = logreg.predict(X_test_class)

In [7]:
# Evaluate Logistic Regression
logreg_accuracy = accuracy_score(y_test_class, y_pred_class)
logreg_conf_matrix = confusion_matrix(y_test_class, y_pred_class)
logreg_class_report = classification_report(y_test_class, y_pred_class)

# Linear Regression for regression


In [8]:
# Linear Regression for regression
linreg = LinearRegression()
linreg.fit(X_train_reg, y_train_reg)
y_pred_reg = linreg.predict(X_test_reg)

In [9]:
# Evaluate Linear Regression
linreg_mse = mean_squared_error(y_test_reg, y_pred_reg)
linreg_r2 = r2_score(y_test_reg, y_pred_reg)

# Random Forest Classifier



In [10]:
# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, n_jobs=-1, verbose=1)
rf_grid_search.fit(X_train_class, y_train_class)
best_rf_classifier = rf_grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [11]:
# Evaluate Random Forest Classifier
y_pred_rf_class = best_rf_classifier.predict(X_test_class)
rf_accuracy = accuracy_score(y_test_class, y_pred_rf_class)
rf_conf_matrix = confusion_matrix(y_test_class, y_pred_rf_class)
rf_class_report = classification_report(y_test_class, y_pred_rf_class)

# XGBoost Classifier


In [12]:
# XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, n_jobs=-1, verbose=1)
xgb_grid_search.fit(X_train_class, y_train_class)
best_xgb_classifier = xgb_grid_search.best_estimator_


Fitting 5 folds for each of 16 candidates, totalling 80 fits


Parameters: { "use_label_encoder" } are not used.



In [13]:
# Evaluate XGBoost Classifier
y_pred_xgb_class = best_xgb_classifier.predict(X_test_class)
xgb_accuracy = accuracy_score(y_test_class, y_pred_xgb_class)
xgb_conf_matrix = confusion_matrix(y_test_class, y_pred_xgb_class)
xgb_class_report = classification_report(y_test_class, y_pred_xgb_class)

# Random Forest Regressor


In [14]:
# Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)
rf_reg_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_reg_grid_search = GridSearchCV(rf_regressor, rf_reg_param_grid, cv=5, n_jobs=-1, verbose=1)
rf_reg_grid_search.fit(X_train_reg, y_train_reg)
best_rf_regressor = rf_reg_grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [15]:
# Evaluate Random Forest Regressor
y_pred_rf_reg = best_rf_regressor.predict(X_test_reg)
rf_mse = mean_squared_error(y_test_reg, y_pred_rf_reg)
rf_r2 = r2_score(y_test_reg, y_pred_rf_reg)

# XGBoost Regressor


In [16]:
# XGBoost Regressor
xgb_regressor = XGBRegressor(random_state=42)
xgb_reg_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}
xgb_reg_grid_search = GridSearchCV(xgb_regressor, xgb_reg_param_grid, cv=5, n_jobs=-1, verbose=1)
xgb_reg_grid_search.fit(X_train_reg, y_train_reg)
best_xgb_regressor = xgb_reg_grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [17]:
# Evaluate XGBoost Regressor
y_pred_xgb_reg = best_xgb_regressor.predict(X_test_reg)
xgb_mse = mean_squared_error(y_test_reg, y_pred_xgb_reg)
xgb_r2 = r2_score(y_test_reg, y_pred_xgb_reg)

# Results


In [19]:
print("Logistic Regression:")
print(f"Accuracy: {logreg_accuracy}")
print(f"Confusion Matrix:\n{logreg_conf_matrix}")
print(f"Classification Report:\n{logreg_class_report}\n")


Logistic Regression:
Accuracy: 0.8434476693051891
Confusion Matrix:
[[798  60]
 [118 161]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       858
           1       0.73      0.58      0.64       279

    accuracy                           0.84      1137
   macro avg       0.80      0.75      0.77      1137
weighted avg       0.84      0.84      0.84      1137




In [20]:
print("Linear Regression:")
print(f"MSE: {linreg_mse}")
print(f"R2 Score: {linreg_r2}\n")

Linear Regression:
MSE: 50.96537993256489
R2 Score: 0.30038217321858685



In [21]:
print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(f"Confusion Matrix:\n{rf_conf_matrix}")
print(f"Classification Report:\n{rf_class_report}\n")

Random Forest Classifier:
Accuracy: 0.8865435356200527
Confusion Matrix:
[[825  33]
 [ 96 183]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       858
           1       0.85      0.66      0.74       279

    accuracy                           0.89      1137
   macro avg       0.87      0.81      0.83      1137
weighted avg       0.88      0.89      0.88      1137




In [22]:
print("XGBoost Classifier:")
print(f"Accuracy: {xgb_accuracy}")
print(f"Confusion Matrix:\n{xgb_conf_matrix}")
print(f"Classification Report:\n{xgb_class_report}\n")

XGBoost Classifier:
Accuracy: 0.8953386103781882
Confusion Matrix:
[[820  38]
 [ 81 198]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       858
           1       0.84      0.71      0.77       279

    accuracy                           0.90      1137
   macro avg       0.87      0.83      0.85      1137
weighted avg       0.89      0.90      0.89      1137




In [23]:
print("Random Forest Regressor:")
print(f"MSE: {rf_mse}")
print(f"R2 Score: {rf_r2}\n")

Random Forest Regressor:
MSE: 42.56149918849253
R2 Score: 0.4157448918028017



In [24]:
print("XGBoost Regressor:")
print(f"MSE: {xgb_mse}")
print(f"R2 Score: {xgb_r2}\n")

XGBoost Regressor:
MSE: 49.44685806623999
R2 Score: 0.32122740128210736

