In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the raw weather CSV once and keep it untouched in `df`; all later
# transformations operate on the independent copy `data`.
# NOTE(review): the path is absolute and machine-specific, so the notebook will
# only run as-is on the original author's machine.
df = pd.read_csv(
    filepath_or_buffer='C:/Projects/MasterSc/ML/HW/HW_2/Rain in Australia/weatherAUS.csv'
)
data = df.copy(deep=True)

In [4]:
# Print column names, non-null counts and dtypes of the working copy.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,143975.0,144199.0,142199.0,82670.0,75625.0,135197.0,143693.0,142398.0,142806.0,140953.0,130395.0,130432.0,89572.0,86102.0,143693.0,141851.0
mean,12.194034,23.221348,2.360918,5.468232,7.611178,40.03523,14.043426,18.662657,68.880831,51.539116,1017.64994,1015.255889,4.447461,4.50993,16.990631,21.68339
std,6.398495,7.119049,8.47806,4.193704,3.785483,13.607062,8.915375,8.8098,19.029164,20.795902,7.10653,7.037414,2.887159,2.720357,6.488753,6.93665
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.9,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7


In [6]:
# Turn the textual Date column into two calendar features (Year, Month) and
# then discard the Date column itself.
data = (
    data
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
    )
    .drop(columns=['Date'])
)

In [7]:
# Number of missing values per column, largest first (preceded by a blank line).
print('', data.isna().sum().sort_values(ascending=False), sep='\n')


Sunshine         69835
Evaporation      62790
Cloud3pm         59358
Cloud9am         55888
Pressure9am      15065
Pressure3pm      15028
WindDir9am       10566
WindGustDir      10326
WindGustSpeed    10263
Humidity3pm       4507
WindDir3pm        4228
Temp3pm           3609
RainTomorrow      3267
RainToday         3261
Rainfall          3261
WindSpeed3pm      3062
Humidity9am       2654
WindSpeed9am      1767
Temp9am           1767
MinTemp           1485
MaxTemp           1261
Location             0
Year                 0
Month                0
dtype: int64


In [8]:
# Share of missing values per column, in percent, for columns that have any.
# The denominator is len(data), not len(df): the counts are taken from `data`,
# so the ratio stays correct even if rows are ever filtered out of `data`
# while the raw frame `df` keeps its original length.
missing = data.isnull().sum()
missing_percent = (missing / len(data)) * 100
print()
print(missing_percent[missing_percent > 0].sort_values(ascending=False))


Sunshine         48.009762
Evaporation      43.166506
Cloud3pm         40.807095
Cloud9am         38.421559
Pressure9am      10.356799
Pressure3pm      10.331363
WindDir9am        7.263853
WindGustDir       7.098859
WindGustSpeed     7.055548
Humidity3pm       3.098446
WindDir3pm        2.906641
Temp3pm           2.481094
RainTomorrow      2.245978
Rainfall          2.241853
RainToday         2.241853
WindSpeed3pm      2.105046
Humidity9am       1.824557
WindSpeed9am      1.214767
Temp9am           1.214767
MinTemp           1.020899
MaxTemp           0.866905
dtype: float64


In [9]:
# Remove the four measurements that are missing in roughly 38-48% of the rows
# (see the percentages printed above), together with the Location column.
data = data.drop(
    ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Location'],
    axis='columns',
)

In [10]:
# Split the column names by dtype: float64/int64 columns are treated as
# numeric, object columns as categorical.
numerical_cols = list(data.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(data.select_dtypes(include=['object']).columns)

# RainTomorrow is the prediction target, so it must not be used as a feature.
categorical_cols.remove('RainTomorrow')

In [11]:
# Year is modelled as a numeric feature and Month as a categorical one.
#
# Membership is checked explicitly instead of appending unconditionally:
#   * re-running this cell must not add the same name twice;
#   * depending on the pandas version, .dt.year / .dt.month are int64
#     (already picked up by the dtype selection) or int32 (not picked up),
#     so Year/Month may or may not be in numerical_cols at this point.
# A duplicated or doubly-assigned column would break the ColumnTransformer.
if 'Year' not in numerical_cols:
    numerical_cols.append('Year')
if 'Month' in numerical_cols:
    numerical_cols.remove('Month')
if 'Month' not in categorical_cols:
    categorical_cols.append('Month')

# Full ordered list of model inputs: numeric columns first, then categorical.
features = numerical_cols + categorical_cols

In [12]:
# Time-based hold-out: the most recent year in the data becomes the test set,
# every earlier year is used for training.
max_year = data['Year'].max()
data_train = data.loc[data['Year'] < max_year]
data_test = data.loc[data['Year'].eq(max_year)]

In [13]:
# Separate the model inputs (the selected feature columns) from the target
# column RainTomorrow, for the training and the test period respectively.
X_train, y_train = data_train[features], data_train['RainTomorrow']
X_test, y_test = data_test[features], data_test['RainTomorrow']

In [14]:
# Rows without a known target cannot be used for fitting or scoring, so keep
# only the rows where RainTomorrow is present (features and target together).
train_mask = y_train.notna()
X_train, y_train = X_train[train_mask], y_train[train_mask]

test_mask = y_test.notna()
X_test, y_test = X_test[test_mask], y_test[test_mask]

In [15]:
# Sanity check: after filtering, neither target series should contain nulls.
print()
for _label, _target in (
    ('Пропуски у y_train:', y_train),
    ('Пропуски у y_test:', y_test),
):
    print(_label, _target.isna().sum())


Пропуски у y_train: 0
Пропуски у y_test: 0


In [16]:
# Experiment 1 (baseline): preprocessing + logistic regression, no resampling.

# Numeric features: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps predict() from failing on categories
# that were not seen during fit.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own preprocessing branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

# random_state is pinned to the same seed as the SMOTE experiment further down,
# so the two runs are reproducible and differ only by the resampling step.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=37))
])

# Fit on the training years only; the preprocessing statistics are learned
# inside the pipeline, so nothing from the test year leaks into them.
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print()
print('First step')
print(classification_report(y_test, y_pred))


First step
              precision    recall  f1-score   support

          No       0.87      0.96      0.91      6703
         Yes       0.72      0.44      0.54      1763

    accuracy                           0.85      8466
   macro avg       0.79      0.70      0.73      8466
weighted avg       0.84      0.85      0.83      8466



In [18]:
# Experiment 2: same preprocessing and classifier, with SMOTE oversampling
# inserted between them.

# Categorical branch: most-frequent imputation followed by one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: mean imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols),
])

pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    # SMOTE is applied after preprocessing, on the already-encoded features.
    ('smote', SMOTE(random_state=37)),
    ('classifier', LogisticRegression(solver='liblinear', random_state=37)),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print()
print('Second step')
print(classification_report(y_test, y_pred))




Second step
              precision    recall  f1-score   support

          No       0.92      0.82      0.86      6703
         Yes       0.51      0.71      0.59      1763

    accuracy                           0.80      8466
   macro avg       0.71      0.77      0.73      8466
weighted avg       0.83      0.80      0.81      8466



Conclusions:

* SMOTE helps address class imbalance, significantly improving the recall for the minority class Yes (from 0.44 to 0.71).

* This is crucial in tasks where it is better to predict rain, even if occasionally incorrect, than to miss it altogether.

* The reduction in overall accuracy (0.85 → 0.80) and in precision for the Yes class (0.72 → 0.51) is an acceptable trade-off when the goal is to increase the model’s sensitivity to rain.

* In practice, this is much more valuable when the model is used in real-world forecasting, where the error of “not predicting rain when it actually rains” is more critical than “predicting rain when it doesn't occur.”