In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib


# 1. –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–Ω—è –¥–∞–Ω–∏—Ö

data = pd.read_csv("weatherAUS.csv")

data = data.dropna(subset=["RainTomorrow"])


# 2. –í–∏–∑–Ω–∞—á–µ–Ω–Ω—è –∫–æ–ª–æ–Ω–æ–∫

target_col = "RainTomorrow"
categorical_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
numeric_cols = [
    'MinTemp','MaxTemp','Rainfall','Evaporation','Sunshine',
    'WindGustSpeed','WindSpeed9am','WindSpeed3pm',
    'Humidity9am','Humidity3pm',
    'Pressure9am','Pressure3pm',
    'Cloud9am','Cloud3pm',
    'Temp9am','Temp3pm'
]
input_cols = categorical_cols + numeric_cols

X = data[input_cols]
y = (data[target_col] == "Yes").astype(int)


# 3. –ü—Ä–µ–ø—Ä–æ—Ü–µ—Å–∏–Ω–≥

imputer = SimpleImputer()
scaler = MinMaxScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# –Ü–º–ø—É—Ç–∞—Ü—ñ—è
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

# –ú–∞—Å—à—Ç–∞–±—É–≤–∞–Ω–Ω—è —á–∏—Å–ª–æ–≤–∏—Ö
X_scaled = scaler.fit_transform(X[numeric_cols])
X_scaled = pd.DataFrame(X_scaled, columns=numeric_cols)

# –ö–æ–¥—É–≤–∞–Ω–Ω—è –∫–∞—Ç–µ–≥–æ—Ä—ñ–∞–ª—å–Ω–∏—Ö
X_encoded = encoder.fit_transform(X[categorical_cols])
encoded_cols = encoder.get_feature_names_out(categorical_cols)
X_encoded = pd.DataFrame(X_encoded, columns=encoded_cols)

# –û–±‚Äô—î–¥–Ω–∞–Ω–Ω—è
X_ready = pd.concat([X_scaled, X_encoded], axis=1)


# 4. –¢—Ä–µ–Ω—É–≤–∞–Ω–Ω—è –º–æ–¥–µ–ª–µ–π

X_train, X_test, y_train, y_test = train_test_split(X_ready, y, test_size=0.2, random_state=42)

# --- Logistic Regression
log_model = LogisticRegression(max_iter=500, solver='liblinear')
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
y_prob_log = log_model.predict_proba(X_test)[:, 1]

# --- Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]


# 5. –ü–æ—Ä—ñ–≤–Ω—è–Ω–Ω—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ñ–≤

print("üìà Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("ROC AUC:", roc_auc_score(y_test, y_prob_log))
print(classification_report(y_test, y_pred_log))

print("üå≤ Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))
print(classification_report(y_test, y_pred_rf))


# 6. –ó–±–µ—Ä–µ–∂–µ–Ω–Ω—è –∫—Ä–∞—â–æ—ó –º–æ–¥–µ–ª—ñ 

# –Ø–∫—â–æ Random Forest –º–∞—î –≤–∏—â–∏–π ROC AUC:
if roc_auc_score(y_test, y_prob_rf) > roc_auc_score(y_test, y_prob_log):
    best_model = rf_model
    model_name = "aussie_rain_rf.joblib"
    print("\n‚úÖ Random Forest –æ–±—Ä–∞–Ω–æ —è–∫ –∫—Ä–∞—â—É –º–æ–¥–µ–ª—å.")
else:
    best_model = log_model
    model_name = "aussie_rain_log.joblib"
    print("\n‚úÖ Logistic Regression –∑–∞–ª–∏—à–∏–ª–∞—Å—å –∫—Ä–∞—â–æ—é.")

joblib.dump({
    'model': best_model,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols
}, model_name)
print(f"üíæ –ú–æ–¥–µ–ª—å –∑–±–µ—Ä–µ–∂–µ–Ω–æ —è–∫ {model_name}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = imputer.fit_transform(X[numeric_cols])


üìà Logistic Regression
Accuracy: 0.846337775589859
ROC AUC: 0.8681568772465998
              precision    recall  f1-score   support

           0       0.87      0.94      0.90     22098
           1       0.72      0.51      0.60      6341

    accuracy                           0.85     28439
   macro avg       0.79      0.73      0.75     28439
weighted avg       0.84      0.85      0.84     28439

üå≤ Random Forest
Accuracy: 0.8530187418685607
ROC AUC: 0.883297686900558
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     22098
           1       0.76      0.50      0.60      6341

    accuracy                           0.85     28439
   macro avg       0.81      0.73      0.76     28439
weighted avg       0.84      0.85      0.84     28439


‚úÖ Random Forest –æ–±—Ä–∞–Ω–æ —è–∫ –∫—Ä–∞—â—É –º–æ–¥–µ–ª—å.
üíæ –ú–æ–¥–µ–ª—å –∑–±–µ—Ä–µ–∂–µ–Ω–æ —è–∫ aussie_rain_rf.joblib
