# Week 5

In [1]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.tools import add_constant

## Exercise 1

#### Consider the dataset 'weatherAUS' called Rain in Australia. It describes the weather characteristics on different dates and locations. This dataset contains about 10 years of daily weather observations from many locations across Australia. 
#### 1. Preprocess the data, remove the attributes which were are not useful to predict rain. Also, remove rows with at least one missing value for each of them.  
#### 2. Calculate the Variance Inflation Factor (VIF) value. VIF is a number that determines whether a variable has multicollinearity or not (starts from 1, and it has no upper limit. If the number gets larger, it means the variable has huge multicollinearity on it.).  
#### 3. Remove multicollinearities by creating new features. Find the features that have paired values and create the new feature which is the difference value between those pairs. 
#### 4. Remove features that have a VIF value above 5. 
#### 5. Build a regression model to perform the Rain prediction. Also, tabulate accuracy of the prediction models, before and VIF computation.

In [2]:
weather_data = pd.read_csv("weatherAUS.csv")
weather_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [3]:
# Question 1
drop_columns = ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RISK_MM']
weather_data = weather_data.drop(columns=drop_columns)
weather_data = weather_data.dropna()

In [4]:
# Question 2
X = weather_data.drop('RainTomorrow', axis=1)
y = weather_data['RainTomorrow']

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y, test_size=0.2, shuffle=True)

scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_full_scaled = scaler.transform(X_test_full)

model_full = LogisticRegression(max_iter=1000)
model_full.fit(X_train_full_scaled, y_train_full)
y_pred_full = model_full.predict(X_test_full_scaled)
accuracy_full = accuracy_score(y_test_full, y_pred_full)
print(f'Accuracy before VIF computation: {accuracy_full:.2f}')

Accuracy before VIF computation: 0.86


In [5]:
# Question 3
X_const = add_constant(X)
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X_const.values, i) for i in range(1, X_const.shape[1])]
print("VIF values:\n", vif_data)


VIF values:
           Feature        VIF
0         MinTemp  11.009623
1         MaxTemp  47.132304
2        Rainfall   1.176579
3     Evaporation   2.233761
4        Sunshine   3.337606
5   WindGustSpeed   2.902648
6    WindSpeed9am   1.880932
7    WindSpeed3pm   2.164593
8     Humidity9am   4.460957
9     Humidity3pm   6.808408
10    Pressure9am  20.155036
11    Pressure3pm  20.129619
12       Cloud9am   2.253303
13       Cloud3pm   2.292493
14        Temp9am  24.653650
15        Temp3pm  56.346220


In [6]:
# Question 4
high_vif_features = vif_data[vif_data['VIF'] > 5]['Feature'].tolist()
print(f"High VIF features: {high_vif_features}")

# Question 3 continuation
if 'Humidity9am' in high_vif_features and 'Humidity3pm' in high_vif_features:
    weather_data['Humidity_Diff'] = weather_data['Humidity3pm'] - weather_data['Humidity9am']

weather_data.drop(columns=high_vif_features, inplace=True)

High VIF features: ['MinTemp', 'MaxTemp', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']


In [7]:
# Question 5
X = weather_data.drop('RainTomorrow', axis=1)
y = weather_data['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)


X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model_vif = LogisticRegression(max_iter=1000)
model_vif.fit(X_train_scaled, y_train)
y_pred_vif = model_vif.predict(X_test_scaled)
accuracy_vif = accuracy_score(y_test, y_pred_vif)
print(f'Accuracy after VIF removal: {accuracy_vif:.2f}')

Accuracy after VIF removal: 0.83
