In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/weather-dataset/weatherHistory.csv


# Load Dataset

In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('../input/weather-dataset/weatherHistory.csv')
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [3]:
df.shape

(96453, 12)

# Exploratory Data Analysis

Remove variables which do not have much impact on the prediction. 
Calculate the value of loud cover: it is same for all rows, so I can remove it.


In [4]:
df['Loud Cover'].value_counts()

0.0    96453
Name: Loud Cover, dtype: int64

I also remove Daily Summary, since it is text

In [5]:
df['Daily Summary'].value_counts()

Mostly cloudy throughout the day.                                                                       20085
Partly cloudy throughout the day.                                                                        9981
Partly cloudy until night.                                                                               6169
Partly cloudy starting in the morning.                                                                   5184
Foggy in the morning.                                                                                    4201
                                                                                                        ...  
Breezy starting overnight continuing until morning and foggy overnight.                                    24
Mostly cloudy throughout the day and breezy starting overnight continuing until afternoon.                 24
Partly cloudy starting in the morning and breezy starting in the afternoon continuing until evening.       24
Rain until

In [6]:
df.drop(['Daily Summary','Loud Cover'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,1015.13
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,1015.63
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,1015.94
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,1016.41
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,1016.51


Check Null values

In [8]:
df.isnull().sum()

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Pressure (millibars)          0
dtype: int64

I will use the precip type column as output of the dataset. 

In [9]:
df['Precip Type'].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

Drop Null values

In [10]:
df.dropna(inplace=True)
df.shape

(95936, 10)

# Data Cleaning

Convert categorical data into numerical data

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Precip Type']=le.fit_transform(df['Precip Type'])
df['Summary']=le.fit_transform(df['Summary'])
df.head()


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,2006-04-01 00:00:00.000 +0200,19,0,9.472222,7.388889,0.89,14.1197,251.0,15.8263,1015.13
1,2006-04-01 01:00:00.000 +0200,19,0,9.355556,7.227778,0.86,14.2646,259.0,15.8263,1015.63
2,2006-04-01 02:00:00.000 +0200,17,0,9.377778,9.377778,0.89,3.9284,204.0,14.9569,1015.94
3,2006-04-01 03:00:00.000 +0200,19,0,8.288889,5.944444,0.83,14.1036,269.0,15.8263,1016.41
4,2006-04-01 04:00:00.000 +0200,17,0,8.755556,6.977778,0.83,11.0446,259.0,15.8263,1016.51


Standardize input features

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[df.columns[2:]] = scaler.fit_transform(df[df.columns[2:]])
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,2006-04-01 00:00:00.000 +0200,19,-0.354531,-0.257951,-0.324102,0.792748,0.478964,0.591157,1.309107,0.102152
1,2006-04-01 01:00:00.000 +0200,19,-0.354531,-0.270141,-0.339134,0.63947,0.499902,0.665655,1.309107,0.106415
2,2006-04-01 02:00:00.000 +0200,17,-0.354531,-0.267819,-0.138532,0.792748,-0.99362,0.153478,1.100806,0.109058
3,2006-04-01 03:00:00.000 +0200,19,-0.354531,-0.381594,-0.458873,0.486192,0.476638,0.758778,1.309107,0.113066
4,2006-04-01 04:00:00.000 +0200,17,-0.354531,-0.332833,-0.36246,0.486192,0.03463,0.665655,1.309107,0.113919


# Feature Engineering

Standardize Data. Extract day of year and hour of day

In [13]:
import numpy as np
from datetime import datetime

def discretize_date(current_date, t):
    current_date = current_date[:-10]
    cdate = datetime.strptime(current_date, '%Y-%m-%d %H:%M:%S')

    if t == 'hour_sin':
        return np.sin(2 * np.pi * cdate.hour/24.0)
    if t == 'hour_cos':
        return np.cos(2 * np.pi * cdate.hour/24.0)
    if t == 'day_sin':
        return np.sin(2 * np.pi * cdate.timetuple().tm_yday/365.0)
    if t == 'day_cos':
        return np.cos(2 * np.pi * cdate.timetuple().tm_yday/365.0)

In [14]:
date_types = ['hour_sin', 'hour_cos', 'day_sin', 'day_cos']
for dt in date_types:
    df[dt] = df['Formatted Date'].apply(lambda x : discretize_date(x, dt))
df.drop(['Formatted Date'],axis=1,inplace=True)

In [15]:
df.corr()    # checking correlation to drop unnecessary variable

Unnamed: 0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),hour_sin,hour_cos,day_sin,day_cos
Summary,1.0,-0.116821,0.145821,0.141956,-0.112823,0.003068,0.023057,0.135659,0.144449,-0.115476,-0.126093,0.042354,-0.097738
Precip Type,-0.116821,1.0,-0.563503,-0.566058,0.232622,-0.067771,-0.042142,-0.316483,0.009271,0.080189,0.077198,0.115773,0.414884
Temperature (C),0.145821,-0.563503,1.0,0.992637,-0.632148,0.008304,0.029937,0.393719,-0.005253,-0.213442,-0.22488,-0.210074,-0.807055
Apparent Temperature (C),0.141956,-0.566058,0.992637,1.0,-0.60239,-0.057387,0.028951,0.382589,-3.7e-05,-0.201702,-0.206186,-0.21574,-0.81585
Humidity,-0.112823,0.232622,-0.632148,-0.60239,1.0,-0.224569,0.000814,-0.370511,0.005375,0.36661,0.39505,-0.046786,0.392201
Wind Speed (km/h),0.003068,-0.067771,0.008304,-0.057387,-0.224569,1.0,0.104366,0.100801,-0.049453,-0.076933,-0.232838,0.134139,0.104584
Wind Bearing (degrees),0.023057,-0.042142,0.029937,0.028951,0.000814,0.104366,1.0,0.0489,-0.011629,-0.010008,0.006792,0.01082,-0.069343
Visibility (km),0.135659,-0.316483,0.393719,0.382589,-0.370511,0.100801,0.0489,1.0,0.061011,-0.059377,0.153076,0.035113,-0.433034
Pressure (millibars),0.144449,0.009271,-0.005253,-3.7e-05,0.005375,-0.049453,-0.011629,0.061011,1.0,0.003404,0.013111,0.006703,-0.013925
hour_sin,-0.115476,0.080189,-0.213442,-0.201702,0.36661,-0.076933,-0.010008,-0.059377,0.003404,1.0,-1e-05,-0.000182,9.6e-05


Apparent Temperature and Temperature are highly correlated (correlation almost equal to 1). So, we can drop one of them.

In [16]:
df.drop(['Apparent Temperature (C)'],axis=1,inplace=True)

# Train Test Splitting

In [17]:
y=df.iloc[:,0] 
X = df.iloc[:,1:]  

In [18]:
X

Unnamed: 0,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),hour_sin,hour_cos,day_sin,day_cos
0,-0.354531,-0.257951,0.792748,0.478964,0.591157,1.309107,0.102152,0.000000,1.000000,0.999991,0.004304
1,-0.354531,-0.270141,0.639470,0.499902,0.665655,1.309107,0.106415,0.258819,0.965926,0.999991,0.004304
2,-0.354531,-0.267819,0.792748,-0.993620,0.153478,1.100806,0.109058,0.500000,0.866025,0.999991,0.004304
3,-0.354531,-0.381594,0.486192,0.476638,0.758778,1.309107,0.113066,0.707107,0.707107,0.999991,0.004304
4,-0.354531,-0.332833,0.486192,0.034630,0.665655,1.309107,0.113919,0.866025,0.500000,0.999991,0.004304
...,...,...,...,...,...,...,...,...,...,...,...
96448,-0.354531,1.470719,-1.557511,0.027651,-1.457551,1.374684,0.095586,-0.965926,0.258819,-0.936881,-0.349647
96449,-0.354531,1.320955,-1.302048,-0.102625,-1.559986,1.243531,0.102407,-0.866025,0.500000,-0.936881,-0.349647
96450,-0.354531,1.055095,-0.893308,-0.263144,-1.466863,1.374684,0.106671,-0.707107,0.707107,-0.936881,-0.349647
96451,-0.354531,1.001110,-0.688937,-0.039813,-1.559986,1.374684,0.109144,-0.500000,0.866025,-0.936881,-0.349647


In [19]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [20]:
X_train.shape

(67155, 11)

In [21]:
X_test.shape

(28781, 11)

# Training the Model

In [22]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=32,n_estimators=120,random_state=1)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# Measuring Accuracy

In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6695389319342622

Train the same model without the datetime feature

In [24]:
model = RandomForestClassifier(max_depth=32,n_estimators=120,random_state=1)
model.fit(X_train[X_train.columns[:-4]],y_train)
y_pred = model.predict(X_test[X_test.columns[:-4]])

In [25]:
accuracy_score(y_test, y_pred)

0.5827108161634411