In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Logistic Regression with weatherAUS.csv

In [2]:
weather = pd.read_csv('weatherAUS.csv')
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
weather.drop('Date', axis = 1, inplace = True)  # removing date column as it may not be useful 

In [4]:
weather.shape

(145460, 22)

In [5]:
weather.isna().sum()

Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [6]:
weather.dropna(inplace = True)  # dropping all instances with a null value for any attribute 

In [7]:
weather.shape

(56420, 22)

We have dropped all data instances with null values

In [8]:
target = weather.pop('RainTomorrow')  # removing target variable before preprocessing 

In [9]:
weather.dtypes

Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
dtype: object

In [10]:
numerical_cols = weather.select_dtypes(include = ['float64', 'int64']).columns  # find numerical data columns
nominal_cols = weather.select_dtypes(include = ['object']).columns  # find nominal data columns

In [11]:
weather_num = weather[numerical_cols]
weather_nom = weather[nominal_cols]

## Calculating IVF and dropping columns

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [13]:
# calculating IVF by first building a Regression Model using one attribute ad target and all the others as 
# independent variables. Then we obtain r2 score which will help us find the IVF.
lr = LinearRegression()
vif = {}
for col in numerical_cols:
    other_cols = list(numerical_cols)
    other_cols.remove(col)
    X = weather_num[other_cols]
    y = weather_num[col]
    lr.fit(X, y)
    y_pred = lr.predict(X)
    score = r2_score(y, y_pred)
    vif[col] = 1 / (1 - score)
vif

{'MinTemp': 10.770219691409892,
 'MaxTemp': 46.78406795924843,
 'Rainfall': 1.1779953082896788,
 'Evaporation': 2.224815286348291,
 'Sunshine': 3.307730755606259,
 'WindGustSpeed': 2.8788643484723133,
 'WindSpeed9am': 1.856489347720246,
 'WindSpeed3pm': 2.1463942950501784,
 'Humidity9am': 4.417547175413456,
 'Humidity3pm': 6.803333309913033,
 'Pressure9am': 19.876241743158516,
 'Pressure3pm': 19.86913652734502,
 'Cloud9am': 2.238263376838908,
 'Cloud3pm': 2.2774283640218864,
 'Temp9am': 24.36887019910182,
 'Temp3pm': 56.08268800345098}

In [14]:
drop_cols = [key for key in vif.keys() if vif[key] > 5]  # find columns to be dropped due to VIF value
drop_cols

['MinTemp',
 'MaxTemp',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Temp9am',
 'Temp3pm']

In [15]:
weather_num.drop(drop_cols, axis = 1, inplace = True)

## Find closely related attributes and modify attributes

In [16]:
weather_num.corr()  # find correlations

Unnamed: 0,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Cloud9am,Cloud3pm
Rainfall,1.0,-0.077239,-0.246379,0.106308,0.050584,0.044112,0.263625,0.217169,0.191433
Evaporation,-0.077239,1.0,0.36925,0.209566,0.193154,0.124345,-0.554232,-0.199809,-0.202366
Sunshine,-0.246379,0.36925,1.0,-0.052422,-0.013842,0.0292,-0.500343,-0.677939,-0.702022
WindGustSpeed,0.106308,0.209566,-0.052422,1.0,0.608852,0.685236,-0.19341,0.088129,0.13159
WindSpeed9am,0.050584,0.193154,-0.013842,0.608852,1.0,0.502226,-0.236795,0.034908,0.062507
WindSpeed3pm,0.044112,0.124345,0.0292,0.685236,0.502226,1.0,-0.100626,0.068224,0.041475
Humidity9am,0.263625,-0.554232,-0.500343,-0.19341,-0.236795,-0.100626,1.0,0.438962,0.348707
Cloud9am,0.217169,-0.199809,-0.677939,0.088129,0.034908,0.068224,0.438962,1.0,0.61438
Cloud3pm,0.191433,-0.202366,-0.702022,0.13159,0.062507,0.041475,0.348707,0.61438,1.0


We will cosider variables that have a absolute correlation higher than 0.6.  

Cloud9am, Cloud3pm  
WindGustSpeed, WindSpeed3pm  
WindGustSpeed, WindSpeed9am  
Sunshine, Cloud9am  

We will create features with their differences instead

In [17]:
numerical_cols = list(numerical_cols)
numerical_cols.remove('Cloud9am')
numerical_cols.remove('Cloud3pm')
numerical_cols.remove('WindGustSpeed')
numerical_cols.remove('WindSpeed3pm')
numerical_cols.remove('WindSpeed9am')
numerical_cols.remove('Sunshine')

In [18]:
weather_num = weather[numerical_cols]
weather_num['Cloud9am - Cloud3pm'] = weather['Cloud9am'] - weather['Cloud3pm']
weather_num['WindGustSpeed - WindSpeed3pm'] = weather['WindGustSpeed'] - weather['WindSpeed3pm']
weather_num['WindGustSpeed - WindSpeed9am'] = weather['WindGustSpeed'] - weather['WindSpeed9am']
weather_num['Sunshine - Cloud9am'] = weather['Sunshine'] - weather['Cloud9am']
weather_num.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,Cloud9am - Cloud3pm,WindGustSpeed - WindSpeed3pm,WindGustSpeed - WindSpeed9am,Sunshine - Cloud9am
6049,17.9,35.2,0.0,12.0,20.0,13.0,1006.3,1004.4,26.6,33.4,-3.0,28.0,42.0,10.3
6050,18.4,28.9,0.0,14.8,30.0,8.0,1012.9,1012.1,20.3,27.0,0.0,18.0,18.0,12.0
6052,19.4,37.6,0.0,10.8,42.0,22.0,1012.3,1009.2,28.7,34.9,-5.0,31.0,16.0,9.6
6053,21.9,38.4,0.0,11.4,37.0,22.0,1012.7,1009.1,29.1,35.6,-4.0,25.0,25.0,11.2
6054,24.2,41.0,0.0,11.2,19.0,15.0,1010.7,1007.4,33.6,37.6,-5.0,22.0,18.0,7.4


In [19]:
temp_df = pd.merge(weather_num, weather_nom, left_index = True, right_index = True)
temp_df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,Cloud9am - Cloud3pm,WindGustSpeed - WindSpeed3pm,WindGustSpeed - WindSpeed9am,Sunshine - Cloud9am,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday
6049,17.9,35.2,0.0,12.0,20.0,13.0,1006.3,1004.4,26.6,33.4,-3.0,28.0,42.0,10.3,Cobar,SSW,ENE,SW,No
6050,18.4,28.9,0.0,14.8,30.0,8.0,1012.9,1012.1,20.3,27.0,0.0,18.0,18.0,12.0,Cobar,S,SSE,SSE,No
6052,19.4,37.6,0.0,10.8,42.0,22.0,1012.3,1009.2,28.7,34.9,-5.0,31.0,16.0,9.6,Cobar,NNE,NNE,NNW,No
6053,21.9,38.4,0.0,11.4,37.0,22.0,1012.7,1009.1,29.1,35.6,-4.0,25.0,25.0,11.2,Cobar,WNW,WNW,WSW,No
6054,24.2,41.0,0.0,11.2,19.0,15.0,1010.7,1007.4,33.6,37.6,-5.0,22.0,18.0,7.4,Cobar,WNW,NW,WNW,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142298,19.3,33.4,0.0,6.0,63.0,32.0,1013.9,1010.5,24.5,32.3,-1.0,15.0,26.0,11.0,Darwin,ENE,SE,NE,No
142299,21.2,32.6,0.0,7.6,56.0,28.0,1014.6,1011.2,24.8,32.0,7.0,26.0,24.0,1.6,Darwin,E,SE,SE,No
142300,20.7,32.8,0.0,5.6,46.0,23.0,1015.3,1011.8,24.8,32.1,0.0,22.0,16.0,11.0,Darwin,E,E,W,No
142301,19.5,31.8,0.0,6.2,62.0,58.0,1014.9,1010.7,24.8,29.2,0.0,9.0,17.0,9.6,Darwin,ESE,SE,NNW,No


In [20]:
final_df = pd.get_dummies(temp_df)  # final_df ready with all features preprocessed
final_df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
6049,17.9,35.2,0.0,12.0,20.0,13.0,1006.3,1004.4,26.6,33.4,...,0,0,0,0,1,0,0,0,1,0
6050,18.4,28.9,0.0,14.8,30.0,8.0,1012.9,1012.1,20.3,27.0,...,0,0,1,0,0,0,0,0,1,0
6052,19.4,37.6,0.0,10.8,42.0,22.0,1012.3,1009.2,28.7,34.9,...,0,0,0,0,0,0,0,0,1,0
6053,21.9,38.4,0.0,11.4,37.0,22.0,1012.7,1009.1,29.1,35.6,...,0,0,0,0,0,0,0,1,1,0
6054,24.2,41.0,0.0,11.2,19.0,15.0,1010.7,1007.4,33.6,37.6,...,0,0,0,0,0,0,1,0,1,0


## Classification using processed dataset

In [21]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [22]:
X_train, X_test, y_train, y_test = train_test_split(final_df, target, test_size = 0.2, random_state = 42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [23]:
accuracy_score(y_test, y_pred)

0.8543069833392414

After all the preprocessing and cleaning, we managed to provide a dataset that could train a Logistic Regression model to 84% accuracy.