In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("weather_classification_data.csv")

In [3]:
df.shape

(13200, 11)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [5]:
df.head(10)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy
5,32.0,55,3.5,26.0,overcast,1010.03,2,Summer,5.0,inland,Cloudy
6,-2.0,97,8.0,86.0,overcast,990.87,1,Winter,4.0,inland,Snowy
7,3.0,85,6.0,96.0,partly cloudy,984.46,1,Winter,3.5,inland,Snowy
8,3.0,83,6.0,66.0,overcast,999.44,0,Winter,1.0,mountain,Snowy
9,28.0,74,8.5,107.0,clear,1012.13,8,Winter,7.5,coastal,Sunny


In [6]:
df['Cloud Cover'].unique()

array(['partly cloudy', 'clear', 'overcast', 'cloudy'], dtype=object)

In [7]:
df['Season'].unique()

array(['Winter', 'Spring', 'Summer', 'Autumn'], dtype=object)

In [8]:
df['Location'].unique()

array(['inland', 'mountain', 'coastal'], dtype=object)

In [9]:
df['Weather Type'].unique()

array(['Rainy', 'Cloudy', 'Sunny', 'Snowy'], dtype=object)

In [10]:
df.isnull().sum()

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

In [11]:
df.duplicated().sum()

0

In [12]:
df.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [13]:
num_col = df.select_dtypes(['int64','float64'])
cat_col = df.select_dtypes('object')

In [14]:
num_col

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
0,14.0,73,9.5,82.0,1010.82,2,3.5
1,39.0,96,8.5,71.0,1011.43,7,10.0
2,30.0,64,7.0,16.0,1018.72,5,5.5
3,38.0,83,1.5,82.0,1026.25,7,1.0
4,27.0,74,17.0,66.0,990.67,1,2.5
...,...,...,...,...,...,...,...
13195,10.0,74,14.5,71.0,1003.15,1,1.0
13196,-1.0,76,3.5,23.0,1067.23,1,6.0
13197,30.0,77,5.5,28.0,1012.69,3,9.0
13198,3.0,76,10.0,94.0,984.27,0,2.0


In [15]:
cat_col

Unnamed: 0,Cloud Cover,Season,Location,Weather Type
0,partly cloudy,Winter,inland,Rainy
1,partly cloudy,Spring,inland,Cloudy
2,clear,Spring,mountain,Sunny
3,clear,Spring,coastal,Sunny
4,overcast,Winter,mountain,Rainy
...,...,...,...,...
13195,overcast,Summer,mountain,Rainy
13196,cloudy,Winter,coastal,Snowy
13197,overcast,Autumn,coastal,Cloudy
13198,overcast,Winter,inland,Snowy


In [16]:
for col in cat_col:
    print(col,":",df[col].unique())

Cloud Cover : ['partly cloudy' 'clear' 'overcast' 'cloudy']
Season : ['Winter' 'Spring' 'Summer' 'Autumn']
Location : ['inland' 'mountain' 'coastal']
Weather Type : ['Rainy' 'Cloudy' 'Sunny' 'Snowy']


In [17]:
df['Location'] = df['Location'].map({'inland':0, 'mountain':1, 'coastal':2})

In [18]:
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,0,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,0,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,1,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,2,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,1,Rainy


In [19]:
df['Cloud Cover'] = df['Cloud Cover'].map({'partly cloudy':0, 'clear':1, 'overcast':2, 'cloudy':3})

In [20]:
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,0,1010.82,2,Winter,3.5,0,Rainy
1,39.0,96,8.5,71.0,0,1011.43,7,Spring,10.0,0,Cloudy
2,30.0,64,7.0,16.0,1,1018.72,5,Spring,5.5,1,Sunny
3,38.0,83,1.5,82.0,1,1026.25,7,Spring,1.0,2,Sunny
4,27.0,74,17.0,66.0,2,990.67,1,Winter,2.5,1,Rainy


In [21]:
df['Season'] = df['Season'].map({'Winter':0, 'Spring':1, 'Summer':2, 'Autumn':3 })

In [22]:
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,0,1010.82,2,0,3.5,0,Rainy
1,39.0,96,8.5,71.0,0,1011.43,7,1,10.0,0,Cloudy
2,30.0,64,7.0,16.0,1,1018.72,5,1,5.5,1,Sunny
3,38.0,83,1.5,82.0,1,1026.25,7,1,1.0,2,Sunny
4,27.0,74,17.0,66.0,2,990.67,1,0,2.5,1,Rainy


In [23]:
df['Weather Type'] = df['Weather Type'].map({'Rainy':0, 'Cloudy':1, 'Sunny':2, 'Snowy':3})

In [24]:
df.tail(20)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
13180,10.0,62,17.0,83.0,0,1013.09,3,2,3.0,1,0
13181,-7.0,97,0.5,99.0,2,983.91,0,0,3.0,1,3
13182,-7.0,67,11.5,54.0,2,980.31,0,0,2.0,0,3
13183,28.0,26,6.5,10.0,1,1016.53,10,1,7.5,2,2
13184,3.0,62,7.5,14.0,2,1128.35,3,2,7.5,1,0
13185,37.0,106,23.5,104.0,0,998.71,14,2,3.0,2,0
13186,25.0,66,5.5,39.0,2,1018.86,4,1,8.0,2,1
13187,4.0,71,18.5,88.0,2,992.81,1,0,4.5,1,3
13188,25.0,34,3.5,16.0,1,1022.64,10,0,9.5,0,2
13189,24.0,49,7.5,11.0,1,1022.86,7,2,9.5,0,2


In [25]:
X = df.drop('Weather Type',axis=1)
Y = df['Weather Type']

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
x_train, x_test, y_train, y_test = train_test_split (X, Y, test_size=0.20, random_state=42)

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBRFClassifier
from sklearn.neighbors import KNeighborsClassifier

In [29]:
decision_model = DecisionTreeClassifier()
random_model = RandomForestClassifier()
adaboost_model = AdaBoostClassifier()
gradient_model = GradientBoostingClassifier()
xgboost_model = XGBRFClassifier()
knn_model = KNeighborsClassifier()

In [30]:
decision_model.fit(x_train, y_train)

In [31]:
random_model.fit(x_train, y_train)

In [32]:
adaboost_model.fit(x_train, y_train)

In [33]:
gradient_model.fit(x_train, y_train)

In [34]:
xgboost_model.fit(x_train, y_train)

In [35]:
knn_model.fit(x_train, y_train)

In [36]:
y_pred_decision = decision_model.predict(x_test)

In [37]:
y_pred_ran = random_model.predict(x_test)

In [38]:
y_pred_ada = adaboost_model.predict(x_test)

In [39]:
y_pred_gra = gradient_model.predict(x_test)

In [40]:
y_pred_xg = xgboost_model.predict(x_test)

In [41]:
y_pred_knn = knn_model.predict(x_test)

Decision Tree Score

In [42]:
decision_model.score(x_train, y_train)

1.0

In [43]:
decision_model.score(x_test,y_test)

0.9053030303030303

Random score

In [44]:
random_model.score(x_train, y_train)

1.0

In [45]:
random_model.score(x_test,y_test)

0.9143939393939394

Adaboost score

In [46]:
adaboost_model.score(x_train, y_train)

0.8758522727272727

In [47]:
adaboost_model.score(x_test, y_test)

0.8700757575757576

Gradient Score

In [48]:
gradient_model.score(x_train, y_train)

0.9493371212121212

In [49]:
gradient_model.score(x_test, y_test)

0.9102272727272728

XGBoost Score 

In [50]:
xgboost_model.score(x_train, y_train)

0.9650568181818182

In [51]:
xgboost_model.score(x_test, y_test)

0.9106060606060606

KNN Score

In [52]:
knn_model.score(x_train, y_train)

0.9232954545454546

In [53]:
knn_model.score(x_test, y_test)

0.884090909090909

In [57]:
import pickle
import warnings
warnings.filterwarnings("ignore")

In [58]:
with open('weather.pkl','wb') as file:
    pickle.dump(random_model,file)

In [59]:
L=[[17.0,75,10.5,82,0,1020.82,3,0,3.7,0]]
random_model.predict(L)

array([2], dtype=int64)