In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv('weatherAUS.csv')

In [3]:
# removing unnecessary features 

data.dropna(inplace=True)
data.drop(['RISK_MM', 'Date'], 1,inplace=True)
data.reset_index(inplace=True, drop=True)

# Getting numerical columns 

num_identifier = (data.dtypes == np.float64)
numerical_columns = num_identifier[num_identifier==True].index

# one hot encoding categorical features, except already binary ones

dummy_cols = list(set(data.columns) - set(numerical_columns) - set(['RainToday', 'RainTomorrow']))
data_one_hot = pd.get_dummies(data, columns=dummy_cols)

# Mapping yes and no to 1 and 0

data_one_hot['RainToday'] = data_one_hot['RainToday'].map({'Yes': 1, 'No': 0})
data_one_hot['RainTomorrow'] = data_one_hot['RainTomorrow'].map({'Yes': 1, 'No': 0})

In [4]:
data_one_hot['RainTomorrowNOT'] = (data_one_hot['RainTomorrow'] + 1) % 2

df1 = data_one_hot.sample(500, random_state=13, weights='RainTomorrow')
df2 = data_one_hot.sample(500, random_state=17, weights='RainTomorrowNOT')

In [5]:
new_data = pd.concat((df1, df2))
new_data = new_data.sample(frac=1).reset_index(drop=True)
new_data.drop('RainTomorrowNOT', axis=1, inplace=True)
new_data

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindGustDir_NNW,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW
0,12.9,27.3,0.0,13.6,12.5,46.0,24.0,17.0,50.0,20.0,...,0,0,0,0,1,0,0,0,0,0
1,16.7,30.6,0.0,10.6,10.3,44.0,22.0,24.0,58.0,34.0,...,0,0,0,0,0,0,0,0,0,0
2,11.8,18.2,1.4,2.0,8.2,44.0,15.0,22.0,79.0,48.0,...,0,1,0,0,0,0,0,0,0,0
3,18.4,25.3,0.0,7.0,0.5,48.0,26.0,20.0,42.0,36.0,...,0,0,0,0,0,0,0,0,0,0
4,22.2,31.4,0.0,10.8,10.2,35.0,11.0,22.0,55.0,51.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,19.5,23.8,4.6,5.0,5.6,44.0,7.0,20.0,93.0,51.0,...,0,0,0,0,0,0,1,0,0,0
996,6.5,27.9,0.0,6.4,10.8,35.0,17.0,20.0,76.0,40.0,...,0,0,1,0,0,0,0,0,0,0
997,8.7,14.9,1.4,2.8,3.6,57.0,20.0,24.0,81.0,67.0,...,1,0,0,0,0,0,0,0,0,0
998,14.3,21.2,0.2,8.4,9.4,59.0,22.0,24.0,39.0,29.0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(new_data.drop('RainTomorrow', axis=1), 
                                                    new_data['RainTomorrow'], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

### Logistic Regression

In [15]:
lr = LogisticRegression(random_state=0).fit(X_train, y_train)



In [14]:
print("accuracy score: ", accuracy_score(np.array(y_test), lr.predict(X_test)))
print("precision_score: ", precision_score(np.array(y_test), lr.predict(X_test)))
print("recall score: ", recall_score(np.array(y_test), lr.predict(X_test)))
print("f1 score: ", f1_score(np.array(y_test), lr.predict(X_test)))

accuracy score:  0.725
precision_score:  0.7425742574257426
recall score:  0.7211538461538461
f1 score:  0.7317073170731707


### SVM Regression

In [17]:
svm = SVC(gamma='auto').fit(X_train, y_train)

In [16]:
print("accuracy score: ", accuracy_score(np.array(y_test), svm.predict(X_test)))
print("precision_score: ", precision_score(np.array(y_test), svm.predict(X_test)))
print("recall score: ", recall_score(np.array(y_test), svm.predict(X_test)))
print("f1 score: ", f1_score(np.array(y_test), svm.predict(X_test)))

accuracy score:  0.72
precision_score:  0.6935483870967742
recall score:  0.8269230769230769
f1 score:  0.7543859649122807


### Naive Bayes

In [18]:
gnb = GaussianNB().fit(X_train, y_train)

In [19]:
print("accuracy score: ", accuracy_score(np.array(y_test), gnb.predict(X_test)))
print("precision_score: ", precision_score(np.array(y_test), gnb.predict(X_test)))
print("recall score: ", recall_score(np.array(y_test), gnb.predict(X_test)))
print("f1 score: ", f1_score(np.array(y_test), gnb.predict(X_test)))

accuracy score:  0.67
precision_score:  0.6727272727272727
recall score:  0.7115384615384616
f1 score:  0.6915887850467289
