In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import warnings
import random
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df=pd.read_csv('weatherAUS.csv')

In [3]:
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [5]:
df.shape

(145460, 23)

In [6]:
df.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
y = df["RainTomorrow"].value_counts()
y

RainTomorrow
No     110316
Yes     31877
Name: count, dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
df.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [11]:
y = df["RainTomorrow"].value_counts()
y

RainTomorrow
No     43993
Yes    12427
Name: count, dtype: int64

In [12]:
df.shape

(56420, 23)

In [13]:
print(df.dtypes)

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object


In [14]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
6049,2009-01-01,Cobar,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,...,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,No
6050,2009-01-02,Cobar,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,...,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,No
6052,2009-01-04,Cobar,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,...,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,No
6053,2009-01-05,Cobar,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,...,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,No
6054,2009-01-06,Cobar,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,...,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,No


In [15]:
df['RainTomorrow'] = np.where(df.RainTomorrow == 'Yes', 1, 0)

In [16]:
#df

In [17]:
y = df["RainTomorrow"].value_counts()
y

RainTomorrow
0    43993
1    12427
Name: count, dtype: int64

In [18]:
# Converting all categorical variables into dummy variables
df_dummies = pd.get_dummies(df)
df_dummies.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
6049,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,...,False,False,False,False,True,False,False,False,True,False
6050,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,...,False,False,True,False,False,False,False,False,True,False
6052,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,...,False,False,False,False,False,False,False,False,True,False
6053,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,...,False,False,False,False,False,False,False,True,True,False
6054,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,...,False,False,False,False,False,False,True,False,True,False


In [19]:
y = df_dummies["RainTomorrow"].value_counts()
y

RainTomorrow
0    43993
1    12427
Name: count, dtype: int64

In [20]:
# Split the data into features (X) and target variable (y)
X = df_dummies.drop("RainTomorrow", axis = 1)
y = df_dummies["RainTomorrow"]

# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation


In [21]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

In [23]:
y_pred


array([0, 0, 0, ..., 0, 0, 0])

In [24]:
y_test

103197    0
33303     0
136838    1
80964     0
34347     0
         ..
10233     1
37551     1
38646     0
42043     0
79115     0
Name: RainTomorrow, Length: 11284, dtype: int64

In [25]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.6214401481824496


In [26]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.8551045728465083


In [27]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)


0.7452416940193965


In [28]:
df_dummies['RainTomorrow'].count()

np.int64(56420)

In [29]:
df_dummies['RainTomorrow'].value_counts()

RainTomorrow
0    43993
1    12427
Name: count, dtype: int64

In [30]:
# Get the minimum class count
min_count = df_dummies['RainTomorrow'].value_counts().min()
print(min_count)

12427


In [31]:
# Randomly sample the dataset to balance it (undersampling)
balanced_df = df.groupby('RainTomorrow').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

# Checking the size of each class after applying undersampling
class_sizes = balanced_df['RainTomorrow'].value_counts()
print(class_sizes)

RainTomorrow
0    12427
1    12427
Name: count, dtype: int64


  balanced_df = df.groupby('RainTomorrow').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [32]:
 balanced_df1 = pd.get_dummies( balanced_df)

In [33]:
# Split the data into features (X) and target variable (y)
a= balanced_df1.drop("RainTomorrow", axis = 1)
b = balanced_df1["RainTomorrow"]

# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(a, b, test_size=0.2, random_state=0) # seed in random generation

In [34]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

In [36]:
y_pred

array([0, 1, 0, ..., 1, 0, 0])

In [37]:
y_test

11792    0
24592    1
22744    1
6569     0
16318    1
        ..
23134    1
11598    0
18773    1
9769     0
7307     0
Name: RainTomorrow, Length: 4971, dtype: int64

In [38]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.787952787952788


In [39]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.7903842285254475


In [40]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.7904019426006257


In [41]:
# Random oversampling
# Count the number of observations in each class
class_counts = df_dummies['RainTomorrow'].value_counts()
print(class_counts)

# Get the maximum class count (majority class count)
max_count = class_counts.max()
print(max_count)

# Perform oversampling
oversampled_df = (df_dummies.groupby('RainTomorrow')
                  .apply(lambda x: x.sample(max_count, replace=True))
                  .reset_index(drop=True))

# Checking the size of each class after applying oversampling
class_sizes = oversampled_df['RainTomorrow'].value_counts()
print(class_sizes)

RainTomorrow
0    43993
1    12427
Name: count, dtype: int64
43993


  .apply(lambda x: x.sample(max_count, replace=True))


RainTomorrow
0    43993
1    43993
Name: count, dtype: int64


In [42]:
# Split the data into features (X) and target variable (y)
X = oversampled_df.drop("RainTomorrow", axis = 1)
y = oversampled_df["RainTomorrow"]

# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation

In [43]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

In [45]:
y_pred

array([1, 1, 0, ..., 1, 0, 0])

In [46]:
y_test

50999    1
76295    1
33111    0
56876    1
85492    1
        ..
17170    0
40115    0
42265    0
6425     0
41753    0
Name: RainTomorrow, Length: 17598, dtype: int64

In [47]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.7964249117613572


In [48]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.7967950903511762


In [49]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.7968459937575789


In [50]:
# Split the data into features (X) and target variable (y)
X = df_dummies.drop("RainTomorrow", axis = 1)
y = df_dummies["RainTomorrow"]

# Apply SMOTE to balance the dataset
smote = SMOTE()
X, y = smote.fit_resample(X, y)

# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # seed in random generation


In [51]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:

# Predicting 
y_pred = logistic_regression.predict(X_test)

In [53]:
y_pred

array([0, 1, 0, ..., 0, 0, 0])

In [54]:
y_test

50999    0
76295    1
33111    0
56876    1
85492    1
        ..
17170    0
40115    0
42265    0
6425     0
41753    0
Name: RainTomorrow, Length: 17598, dtype: int64

In [55]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.7944374499256037


In [56]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.7958858961245596


In [57]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.7958883910484191


In [58]:
 #Split the data into features (X) and target variable (y)
X = df_dummies.drop("RainTomorrow", axis = 1)
y = df_dummies["RainTomorrow"]



In [59]:
# Random Under Sampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X,y)# Random Under Sampling


In [60]:
# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.2, random_state=0) # seed in random generation


In [61]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [62]:

# Predicting 
y_pred = logistic_regression.predict(X_test)

In [63]:
y_pred

array([0, 0, 1, ..., 1, 0, 0])

In [64]:
y_test

14642     0
141125    1
120323    1
101646    0
62747     1
         ..
122421    1
84779     0
79011     1
64256     0
13544     0
Name: RainTomorrow, Length: 4971, dtype: int64

In [65]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

0.7795926598104457


In [66]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

0.780124723395695


In [67]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)

0.7801296696601226


In [68]:
 #Split the data into features (X) and target variable (y)
X = df_dummies.drop("RainTomorrow", axis = 1)
y = df_dummies["RainTomorrow"]

In [None]:
# Tomek Links
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X, y) 

In [None]:
# Splot the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl ,test_size=0.2, random_state=0) # seed in random generation

In [None]:
# Train the logistic regression model
logistic_regression = LogisticRegression()

logistic_regression.fit(X_train, y_train)

In [None]:
# Predicting 
y_pred = logistic_regression.predict(X_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
f1_lr = f1_score(y_test, y_pred, average='binary')
print(f1_lr)

In [None]:
accuracy_mlp = accuracy_score(y_test, y_pred, normalize=True)
print(accuracy_mlp)

In [None]:
roc_mlp = roc_auc_score(y_test, y_pred, average='macro')
print(roc_mlp)