# Importing Libraries

In [33]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.metrics import r2_score

os.chdir("C:/Users/user/Desktop/PYTHON/Machine Leraning Models")

# Importing Dataset

In [2]:
data = pd.read_csv("weatherAUS.csv")
x  = data.iloc[:, [1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]].values
y = data.iloc[:,-1].values

In [3]:
print(x)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Hobart' 7.6 16.4 ... 10.2 15.5 'Yes']
 ['Hobart' 9.7 16.9 ... 10.9 15.7 'Yes']
 [nan nan nan ... nan nan nan]]


In [4]:
print(y)

['No' 'No' 'No' ... 'Yes' 'No' nan]


In [5]:
y = y.reshape(-1,1)
print(y)

[['No']
 ['No']
 ['No']
 ...
 ['Yes']
 ['No']
 [nan]]


# Cleaning Dataset 

In [6]:
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
x = imputer.fit_transform(x)
y = imputer.fit_transform(y)

In [7]:
print(x)
print(y)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['Hobart' 7.6 16.4 ... 10.2 15.5 'Yes']
 ['Hobart' 9.7 16.9 ... 10.9 15.7 'Yes']
 ['Canberra' 9.6 20.0 ... 17.0 20.0 'No']]
[['No']
 ['No']
 ['No']
 ...
 ['Yes']
 ['No']
 ['No']]


In [8]:
# Replacing strings with numerical data(Encoding Dataset)
le1 = LabelEncoder()
x [:,0]= le1.fit_transform(x[:,0])
le2 = LabelEncoder()
x[:,4] = le2.fit_transform(x[:,4]) 
le3 = LabelEncoder()
x[:,6] = le3.fit_transform(x[:,6])
le4 = LabelEncoder()
x[:,7] = le4.fit_transform(x[:,7])
le5 = LabelEncoder()
x[:,-1] = le5.fit_transform(x[:,-1])
le6 = LabelEncoder()
y = le6.fit_transform(y)

  return f(*args, **kwargs)


In [9]:
print(x)

[[2 13.4 22.9 ... 16.9 21.8 0]
 [2 7.4 25.1 ... 17.2 24.3 0]
 [2 12.9 25.7 ... 21.0 23.2 0]
 ...
 [13 7.6 16.4 ... 10.2 15.5 1]
 [13 9.7 16.9 ... 10.9 15.7 1]
 [8 9.6 20.0 ... 17.0 20.0 0]]


In [10]:
print(y)

[0 0 0 ... 1 0 0]


In [11]:
# Feature Scaling
sc = StandardScaler()
x = sc.fit_transform(x)

In [12]:
print(x)

[[-1.52042673  0.25211794  0.02588312 ...  0.04659483  0.08779409
  -0.5360392 ]
 [-1.52042673 -0.73425123  0.34840009 ...  0.09552006  0.46631749
  -0.5360392 ]
 [-1.52042673  0.16992051  0.43635926 ...  0.71523969  0.2997672
  -0.5360392 ]
 ...
 [-0.65998973 -0.70137226 -0.92700792 ... -1.04606873 -0.86608487
   1.8655352 ]
 [-0.65998973 -0.35614305 -0.85370861 ... -0.93190985 -0.835803
   1.8655352 ]
 [-1.05109746 -0.37258253 -0.39925288 ...  0.06290324 -0.18474275
  -0.5360392 ]]


# Splitting Dataset into Training and Test set

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.6, test_size=0.3,random_state=100)

In [14]:
print(x_train)

[[ 1.29554893 -1.04660147 -0.92700792 ... -0.78513415 -0.95693049
   1.8655352 ]
 [ 1.60843512  1.17272917  0.12850216 ...  0.91094063  0.25434439
  -0.5360392 ]
 [ 0.82621965  0.4165128   2.12224342 ...  1.40019297  2.05611577
  -0.5360392 ]
 ...
 [ 1.13910584 -1.47402811 -1.95319828 ... -1.76363883 -2.13792349
  -0.5360392 ]
 [-0.73821127  0.97545533  0.52431844 ...  0.97617427  0.48145843
  -0.5360392 ]
 [-0.81643282 -0.7671302  -1.45476296 ... -1.30700331 -1.35059482
  -0.5360392 ]]


In [15]:
print(y_train)

[1 0 0 ... 1 0 0]


# Training and Testing Model 

In [16]:
# LogisticRegession
rain_prob = LogisticRegression()
rain_prob.fit(x_train, y_train)
rain_prob.score(x_test, y_test)

0.8388750659730076

In [24]:
y_pred = rain_prob.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [26]:
y_pred = le6.inverse_transform(y_pred)

In [20]:
y_pred

array(['No', 'No', 'Yes', ..., 'No', 'No', 'No'], dtype=object)

In [21]:
print(y_test)

[1 0 1 ... 0 1 0]


In [22]:
y_test = le6.inverse_transform(y_test)

In [23]:
print(y_test)

['Yes' 'No' 'Yes' ... 'No' 'Yes' 'No']


In [27]:
y_test = y_test.reshape(-1,1)
y_pred = y_pred.reshape(-1,1)

In [29]:
# Concatenating Predictions into a Dataframe
df = np.concatenate((y_test, y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=["Rain on Tommorrow", "Prediction of Rain"])

In [30]:
print(dataframe)

      Rain on Tommorrow Prediction of Rain
0                   Yes                 No
1                    No                 No
2                   Yes                Yes
3                    No                 No
4                    No                 No
...                 ...                ...
39784                No                Yes
39785                No                 No
39786                No                 No
39787               Yes                 No
39788                No                 No

[39789 rows x 2 columns]


In [31]:
accuracy_score(y_test, y_pred)

0.8388750659730076

In [38]:
# Linear Regession 
X_train = sm.add_constant(x_train)
rain = sm.OLS(y_train, X_train).fit()

In [39]:
rain.params

array([ 0.22368381, -0.0059986 ,  0.0104078 ,  0.00057661,  0.02026052,
        0.00399082,  0.09892713, -0.01324335,  0.00570815, -0.00032962,
       -0.04390546, -0.01220735,  0.17423573,  0.11977177, -0.17616457,
       -0.00439151,  0.03128373, -0.03433529,  0.01966012,  0.04649193])

In [40]:
rain.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.293
Model:,OLS,Adj. R-squared:,0.292
Method:,Least Squares,F-statistic:,1731.0
Date:,"Fri, 09 Sep 2022",Prob (F-statistic):,0.0
Time:,18:26:53,Log-Likelihood:,-29426.0
No. Observations:,79576,AIC:,58890.0
Df Residuals:,79556,BIC:,59080.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2237,0.001,180.131,0.000,0.221,0.226
x1,-0.0060,0.001,-4.791,0.000,-0.008,-0.004
x2,0.0104,0.003,3.140,0.002,0.004,0.017
x3,0.0006,0.006,0.093,0.926,-0.012,0.013
x4,0.0203,0.001,13.730,0.000,0.017,0.023
x5,0.0040,0.002,2.532,0.011,0.001,0.007
x6,0.0989,0.002,50.152,0.000,0.095,0.103
x7,-0.0132,0.001,-9.474,0.000,-0.016,-0.011
x8,0.0057,0.002,3.601,0.000,0.003,0.009

0,1,2,3
Omnibus:,8179.533,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10927.237
Skew:,0.892,Prob(JB):,0.0
Kurtosis:,3.336,Cond. No.,15.6


In [42]:
yhat = rain.predict(X_train)
err = (y_train - yhat)
err

array([ 0.42082356, -0.36863956, -0.06550085, ...,  0.63519842,
       -0.31116568, -0.30189274])

In [43]:
X_test = sm.add_constant(x_test)
Y_test_pred = rain.predict(X_test)
Y_test_pred

array([0.42843464, 0.27647977, 0.7828861 , ..., 0.39336703, 0.26887029,
       0.22992819])