In [20]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import hamming_loss, accuracy_score, jaccard_score

from sklearn.svm import SVC

In [21]:
dataset = pd.read_csv('/content/weatherAUS.csv')

In [22]:
df= pd.DataFrame(dataset)

In [29]:
df_main = df.copy()

# Handling Missing values

In [30]:
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [31]:
df.duplicated().sum()

0

In [32]:
df.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [34]:
df.duplicated().sum()

0

In [35]:
df = df.drop_duplicates()

In [37]:
df = df.dropna()

In [38]:
df.isna().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [39]:
df.isnull().mean() * 100

Date             0.0
Location         0.0
MinTemp          0.0
MaxTemp          0.0
Rainfall         0.0
Evaporation      0.0
Sunshine         0.0
WindGustDir      0.0
WindGustSpeed    0.0
WindDir9am       0.0
WindDir3pm       0.0
WindSpeed9am     0.0
WindSpeed3pm     0.0
Humidity9am      0.0
Humidity3pm      0.0
Pressure9am      0.0
Pressure3pm      0.0
Cloud9am         0.0
Cloud3pm         0.0
Temp9am          0.0
Temp3pm          0.0
RainToday        0.0
RainTomorrow     0.0
dtype: float64

In [41]:
#final shape after duplicate and nan values
df.shape


(56420, 23)

In [42]:
df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0,56420.0
mean,13.46477,24.219206,2.130397,5.503135,7.735626,40.877366,15.667228,19.786778,65.874123,49.601985,1017.239505,1014.79558,4.241705,4.326515,18.204961,22.710333
std,6.416689,6.970676,7.014822,3.696282,3.758153,13.335232,8.317005,8.51018,18.513289,20.19704,6.909357,6.870892,2.797162,2.647251,6.567991,6.836543
min,-6.7,4.1,0.0,0.0,0.0,9.0,2.0,2.0,0.0,0.0,980.5,977.1,0.0,0.0,-0.7,3.7
25%,8.6,18.7,0.0,2.8,5.0,31.0,9.0,13.0,55.0,35.0,1012.7,1010.1,1.0,2.0,13.1,17.4
50%,13.2,23.9,0.0,5.0,8.6,39.0,15.0,19.0,67.0,50.0,1017.2,1014.7,5.0,5.0,17.8,22.4
75%,18.4,29.7,0.6,7.4,10.7,48.0,20.0,26.0,79.0,63.0,1021.8,1019.4,7.0,7.0,23.3,27.9
max,31.4,48.1,206.2,81.2,14.5,124.0,67.0,76.0,100.0,100.0,1040.4,1038.9,8.0,9.0,39.4,46.1


In [45]:
df['Date']  #we need to convert it to datetime

df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

In [47]:
df.dtypes                                 #new created columns would be int32

Date             datetime64[ns]
Location                 object
MinTemp                 float64
MaxTemp                 float64
Rainfall                float64
Evaporation             float64
Sunshine                float64
WindGustDir              object
WindGustSpeed           float64
WindDir9am               object
WindDir3pm               object
WindSpeed9am            float64
WindSpeed3pm            float64
Humidity9am             float64
Humidity3pm             float64
Pressure9am             float64
Pressure3pm             float64
Cloud9am                float64
Cloud3pm                float64
Temp9am                 float64
Temp3pm                 float64
RainToday                object
RainTomorrow             object
Year                      int32
Month                     int32
Day                       int32
dtype: object

In [65]:
#final shape after adding new features
df.shape

(56420, 30)

# Outliers

In [66]:


# numerical columns for outlier detection
numerical_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                     'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                     'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
                     'Temp9am', 'Temp3pm']

# Calculate Z-scores for numerical columns
z_scores = df[numerical_columns].apply(lambda x: np.abs((x - x.mean()) / x.std()))

#  threshold for outliers
threshold = 3

# outliers for each column
outliers_by_column = z_scores > threshold

# count outliers in each column
num_outliers_by_column = outliers_by_column.sum()

# Print columns with outliers and the number of outliers in each column
print("Columns with outliers:")
for col, num_outliers in num_outliers_by_column.items():
    if num_outliers > 0:
        print(f"{col}: {num_outliers} outliers")


Columns with outliers:
MinTemp: 2 outliers
MaxTemp: 17 outliers
Rainfall: 1162 outliers
Evaporation: 495 outliers
WindGustSpeed: 636 outliers
WindSpeed9am: 629 outliers
WindSpeed3pm: 377 outliers
Humidity9am: 217 outliers
Pressure9am: 234 outliers
Pressure3pm: 213 outliers
Temp9am: 5 outliers
Temp3pm: 24 outliers


In [49]:
 from scipy.stats.mstats import winsorize

# Apply winsorization to 'Rainfall' column
df['Rainfall_winsorized'] = winsorize(df['Rainfall'], limits=[0.05, 0.05])

 # Compare the number of outliers before and after winsorization
outliers_before = df['Rainfall'].size - df['Rainfall'].between(df['Rainfall'].quantile(0.05), df['Rainfall'].quantile(0.95)).sum()
outliers_after = df['Rainfall_winsorized'].size - df['Rainfall_winsorized'].between(df['Rainfall_winsorized'].quantile(0.05), df['Rainfall_winsorized'].quantile(0.95)).sum()

print("Number of outliers before winsorization:", outliers_before)
print("Number of outliers after winsorization:", outliers_after)


Number of outliers before winsorization: 2798
Number of outliers after winsorization: 0


  arr.partition(
  arr.partition(


# Feature engineering

In [51]:
#creating new columns to determine temperature, humidity and pressure changes
df['TempChange'] = df['Temp3pm'] - df['Temp9am']
df['HumidityChange'] = df['Humidity3pm'] - df['Humidity9am']
df['PressureChange'] = df['Pressure3pm'] - df['Pressure9am']

In [52]:
X = df.drop(columns=['RainToday', 'RainTomorrow','Date','Temp3pm','Temp9am','Humidity3pm','Humidity9am','Pressure3pm','Pressure9am']).values
Y = df[['RainToday', 'RainTomorrow']].values

In [53]:
print(X[:, [0, 6, 8, 9]])
print(Y)

[['Cobar' 'SSW' 'ENE' 'SW']
 ['Cobar' 'S' 'SSE' 'SSE']
 ['Cobar' 'NNE' 'NNE' 'NNW']
 ...
 ['Darwin' 'E' 'E' 'W']
 ['Darwin' 'ESE' 'SE' 'NNW']
 ['Darwin' 'ENE' 'ENE' 'NNW']]
[['No' 'No']
 ['No' 'No']
 ['No' 'No']
 ...
 ['No' 'No']
 ['No' 'No']
 ['No' 'No']]


In [54]:
# Apply label encoding to the selected columns
label_encoder = LabelEncoder()
X[:, 0] = label_encoder.fit_transform(X[:, 0])
X[:, 6] = label_encoder.fit_transform(X[:, 6])
X[:, 8] = label_encoder.fit_transform(X[:, 8])
X[:, 9] = label_encoder.fit_transform(X[:, 9])
print(X[:, [0, 6, 8, 9]])
# Apply one-hot encoding to the label-encoded columns
onehot_encoder = OneHotEncoder(categories='auto', sparse_output=False)
X_encoded = onehot_encoder.fit_transform(X[:, [0, 6, 8, 9]])
print(X_encoded)
# Replace the original columns with the one-hot encoded columns
X = np.delete(X, [0, 6, 8, 9], axis=1)
X = np.concatenate((X, X_encoded), axis=1)
print(X)

[[4 11 1 12]
 [4 8 10 10]
 [4 5 5 6]
 ...
 [6 0 0 13]
 [6 2 9 6]
 [6 1 1 6]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[17.9 35.2 0.0 ... 0.0 0.0 0.0]
 [18.4 28.9 0.0 ... 0.0 0.0 0.0]
 [19.4 37.6 0.0 ... 0.0 0.0 0.0]
 ...
 [20.7 32.8 0.0 ... 1.0 0.0 0.0]
 [19.5 31.8 0.0 ... 0.0 0.0 0.0]
 [20.2 31.7 0.0 ... 0.0 0.0 0.0]]


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 42, test_size = 0.3)

In [56]:
clf_entropy = DecisionTreeClassifier(criterion='entropy', random_state=100,
                                   max_depth=3, min_samples_leaf=5)

In [57]:
clf_entropy.fit(X_train, y_train)

In [58]:
y_pred_en = clf_entropy.predict(X_test)

In [59]:
print(y_pred_en)
print(y_test)

[['No' 'No']
 ['No' 'No']
 ['Yes' 'No']
 ...
 ['No' 'No']
 ['Yes' 'No']
 ['Yes' 'No']]
[['No' 'Yes']
 ['No' 'No']
 ['Yes' 'No']
 ...
 ['No' 'No']
 ['Yes' 'No']
 ['Yes' 'No']]


This code calculates and prints the Hamming Loss and Jaccard Score for each output column (label) in a multi-label classification problem, providing insights into the performance of the classifier for each individual label.

In [64]:
# Calculate Hamming Loss for each output column
hamming_loss_values = [round(hamming_loss(y_test[:, i], y_pred_en[:, i]) * 100, 2) for i in range(y_test.shape[1])]
print("Hamming Loss:", hamming_loss_values)

# Calculate Jaccard Score for each output column
jaccard_score_values = [round(jaccard_score(y_test[:, i], y_pred_en[:, i], average='micro') * 100, 2) for i in range(y_test.shape[1])]
print("Jaccard Score:", jaccard_score_values)


Hamming Loss: [0.01, 17.8]
Jaccard Score: [99.99, 69.78]
