In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
df_data=pd.read_csv('./csv_files/weatherAUS.csv')

In [19]:
df_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [13]:
# Finding the shape of a dataset

df_data.shape

(145460, 23)

In [9]:
# describing the attributes 

df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [12]:
# finding categorical columns and continous column from the dataset file

def different_types():
    categorical_col=[]
    continous_col=[]
    for col in df_data.columns:
        if df_data[col].dtypes=="object":
            categorical_col.append(col)
        else:
            continous_col.append(col)
    print(f'Categorical Columns are: \n{categorical_col}')
    print(f'Continous Columns are: \n{continous_col}')

different_types()

Categorical Columns are: 
['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
Continous Columns are: 
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']


In [14]:
# finding the missing values

df_data.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [20]:
# transforing 'yes' to 1 and 'no' to 0 in some columns

df_data['RainToday']=df_data['RainToday'].map({'Yes':1, 'No':0})
df_data['RainTomorrow']=df_data['RainTomorrow'].map({'Yes':1, 'No':0})

In [24]:
print(f'{df_data.RainToday}\n{df_data.RainTomorrow}')

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
145455    0.0
145456    0.0
145457    0.0
145458    0.0
145459    0.0
Name: RainToday, Length: 145460, dtype: float64
0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
145455    0.0
145456    0.0
145457    0.0
145458    0.0
145459    NaN
Name: RainTomorrow, Length: 145460, dtype: float64


In [35]:
# finding the percentage of missing value in each columns

percentage_per_column=(df_data.isnull().sum()/df_data.shape[0])*100

In [32]:
percentage_per_column

Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         2.241853
RainTomorrow      2.245978
dtype: float64

## Note: Handling the missing value by mean() in Continous features

In [37]:
df_data['MinTemp'] = df_data['MinTemp'].fillna(df_data['MinTemp'].mean())
df_data['MaxTemp'] = df_data['MaxTemp'].fillna(df_data['MaxTemp'].mean())
df_data['Rainfall'] = df_data['Rainfall'].fillna(df_data['Rainfall'].mean())
df_data['Evaporation'] = df_data['Evaporation'].fillna(df_data['Evaporation'].mean())
df_data['Sunshine'] = df_data['Sunshine'].fillna(df_data['Sunshine'].mean())
df_data['WindGustSpeed'] = df_data['WindGustSpeed'].fillna(df_data['WindGustSpeed'].mean())
df_data['WindSpeed9am'] = df_data['WindSpeed9am'].fillna(df_data['WindSpeed9am'].mean())
df_data['WindSpeed3pm'] = df_data['WindSpeed3pm'].fillna(df_data['WindSpeed3pm'].mean())
df_data['Humidity9am'] = df_data['Humidity9am'].fillna(df_data['Humidity9am'].mean())
df_data['Humidity3pm'] = df_data['Humidity3pm'].fillna(df_data['Humidity3pm'].mean())
df_data['Pressure9am'] = df_data['Pressure9am'].fillna(df_data['Pressure9am'].mean())
df_data['Pressure3pm'] = df_data['Pressure3pm'].fillna(df_data['Pressure3pm'].mean())
df_data['Cloud9am'] = df_data['Cloud9am'].fillna(df_data['Cloud9am'].mean())
df_data['Cloud3pm'] = df_data['Cloud3pm'].fillna(df_data['Cloud3pm'].mean())
df_data['Temp9am'] = df_data['Temp9am'].fillna(df_data['Temp9am'].mean())
df_data['Temp3pm'] = df_data['Temp3pm'].fillna(df_data['Temp3pm'].mean())

## Note: Handling the missing value by mode() in categorical features

In [40]:
df_data['WindGustDir']=df_data['WindGustDir'].fillna(df_data['WindGustDir'].mode()[0])
df_data['WindDir9am']=df_data['WindDir9am'].fillna(df_data['WindDir9am'].mode()[0])
df_data['WindDir3pm']=df_data['WindDir3pm'].fillna(df_data['WindDir3pm'].mode()[0])
df_data['RainToday']=df_data['RainToday'].fillna(df_data['RainToday'].mode()[0])
df_data['RainTomorrow']=df_data['RainTomorrow'].fillna(df_data['RainTomorrow'].mode()[0])

In [41]:
# now checking the percentage of missing records

percentage_per_column_updated=(df_data.isnull().sum()/df_data.shape[0])*100
percentage_per_column_updated

Date             0.0
Location         0.0
MinTemp          0.0
MaxTemp          0.0
Rainfall         0.0
Evaporation      0.0
Sunshine         0.0
WindGustDir      0.0
WindGustSpeed    0.0
WindDir9am       0.0
WindDir3pm       0.0
WindSpeed9am     0.0
WindSpeed3pm     0.0
Humidity9am      0.0
Humidity3pm      0.0
Pressure9am      0.0
Pressure3pm      0.0
Cloud9am         0.0
Cloud3pm         0.0
Temp9am          0.0
Temp3pm          0.0
RainToday        0.0
RainTomorrow     0.0
dtype: float64

In [44]:
# dropint the date column

df_data=df_data.drop(columns=['Date'])

In [53]:
# applying encoding for the categorical columns


le = preprocessing.LabelEncoder()
df_data['Location'] = le.fit_transform(df_data['Location'])
df_data['WindDir9am'] = le.fit_transform(df_data['WindDir9am'])
df_data['WindDir3pm'] = le.fit_transform(df_data['WindDir3pm'])
df_data['WindGustDir'] = le.fit_transform(df_data['WindGustDir'])

In [58]:
# droping high correlated columns

df_data=df_data.drop(['Temp3pm','Temp9am','Humidity9am'],axis=1)

In [59]:
# now, we will split the data into training and testing data

x_train, x_test, y_train, y_test = train_test_split(df_data.iloc[:,:-1], df_data.iloc[:,-1], test_size=0.2, random_state=42)

In [60]:
x_train, y_train

(        Location  MinTemp  MaxTemp  Rainfall  Evaporation   Sunshine  \
 111548         1     14.5     17.8      14.2     5.468232   7.611178   
 142528        16     24.6     39.5       0.0    12.000000   7.611178   
 65304         19     14.7     26.6       0.2     3.200000   8.000000   
 40651         45     18.3     26.6       0.0     5.468232   7.611178   
 97487          0     11.4     19.3       0.6     2.400000   8.900000   
 ...          ...      ...      ...       ...          ...        ...   
 110268         1     10.0     17.2       0.0     9.600000   8.800000   
 119879        32      7.7     21.1       0.0     1.400000   8.900000   
 103694        28      7.6     21.7       0.0     6.000000  11.200000   
 131932        15      8.8     14.8       0.0     1.200000   2.000000   
 121958        31     15.8     40.6       0.0     9.200000  11.300000   
 
         WindGustDir  WindGustSpeed  WindDir9am  WindDir3pm  WindSpeed9am  \
 111548           13       40.03523          

In [61]:
# Selecting the model that is: ""linearRegression"" model

model=LogisticRegression()

In [64]:
# the "fit" function is used to train the model on the training dataset

model.fit(x_train, y_train)
# our model is fitted.

In [65]:
# the "score" function evaluates the model's performance based on the data passed to it

model_score=model.score(x_train, y_train)
print(model_score)
if model_score>0.5:
    print("Our Trained model is Perfect.")
elif model_score<0.5:
    print("Our Trained model is not Perfect.")
else:
    print("Model is not Trained.")

0.8401880241990926
Our Trained model is Perfect.


In [68]:
# the "predict" function is used to predicts the values point on unseen data points

y_predict=model.predict(x_test)
y_predict

array([0., 0., 0., ..., 0., 0., 0.])

In [95]:
conf = confusion_matrix(y_test, y_predict)
print(f'The confusion matrix of this Logistic Regression is: \n{conf}')

The confusion matrix of this Logistic Regression is: 
[[21516  1156]
 [ 3525  2895]]


In [88]:
tn, fp, fn, tp = conf.ravel()
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("True Positives:", tp)
specificity = tn / (tn + fp)
print("Specificity:", specificity)

True Negatives: 21516
False Positives: 1156
False Negatives: 3525
True Positives: 2895
Specificity: 0.9490119971771348


In [94]:
print ("The accuracy of Logistic Regression is : ", accuracy_score(y_test, y_predict))
print("F1 score for logistic regression is :",f1_score(y_test, y_predict,))

The accuracy of Logistic Regression is :  0.8390966588752922
F1 score for logistic regression is : 0.5529557826377615


In [84]:
precision = precision_score(y_test, y_predict)
print("Precision Score of logistic Regression is:", precision)

Precision Score of logistic Regression is: 0.7146383608985436


In [87]:
recall = recall_score(y_test, y_predict)
print("Recall Score of logistic Regression is:", recall)

Recall Score of logistic Regression is: 0.45093457943925236


In [89]:
log_loss_value = log_loss(y_test, y_predict)
print("Log Loss for this logistics Regression is:", log_loss_value)

Log Loss for this logistics Regression is: 5.799544256649849
