# Weather Prediction

## Import library

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import math

## Data Exploration

In [36]:
df = pd.read_csv('testset.csv')

## Handling Missing Values

In [44]:
mean_dewptm = round(df[' _dewptm'].mean(),2)
df[' _dewptm'] = df[' _dewptm'].fillna(mean_dewptm)
mean_dewptm

15.74

In [45]:
mean_heatindexm = round(df[' _heatindexm'].mean(),2)
df[' _heatindexm'] = df[' _heatindexm'].fillna(mean_heatindexm)
mean_heatindexm

35.65

In [46]:
mean_hum = round(df[' _hum'].mean(),2)
df[' _hum'] = df[' _hum'].fillna(mean_hum)
mean_hum

57.91

In [47]:
mean_tempm = round(df[' _tempm'].mean(),2)
df[' _tempm'] = df[' _tempm'].fillna(mean_tempm)
mean_tempm

25.45

In [48]:
mean_pressurem= round(df[' _pressurem'].mean(),2)
df[' _pressurem'] = df[' _pressurem'].fillna(mean_pressurem)
mean_pressurem

1932.33

In [49]:
mean_precipm = df[' _precipm'].mean()
mean_precipm

nan

In [50]:
df = df.drop([' _precipm'],axis='columns')

In [51]:
mean_vism = round(df[' _vism'].mean(),2)
df[' _vism'] = df[' _vism'].fillna(mean_vism )
mean_vism

2.4

In [52]:
mean_wdird = round(df[' _wdird'].mean(),2)
df[' _wdird'] = df[' _wdird'].fillna(mean_wdird)
mean_wdird

163.55

In [53]:
mean_wgustm = round(df[' _wgustm'].mean(),2)
df[' _wgustm'] = df[' _wgustm'].fillna(mean_wgustm)
mean_wgustm

37.67

In [54]:
mean_windchillm = round(df[' _windchillm'].mean(),2)
df[' _windchillm'] = df[' _windchillm'].fillna(mean_windchillm)
mean_windchillm

5.71

In [55]:
mean_wspdm = round(df[' _wspdm'].mean(),2)
df[' _wspdm'] = df[' _wspdm'].fillna(mean_wspdm)
mean_wspdm

7.68

In [56]:
most_freq = df[' _conds'].value_counts().idxmax()
df[' _conds'] = df[' _conds'].fillna(most_freq)
most_freq

'Haze'

In [57]:
most_freq = df[' _wdire'].value_counts().idxmax()
df[' _wdire'] = df[' _wdire'].fillna(most_freq)
most_freq

'North'

In [58]:
df.isnull().sum()

datetime_utc    0
 _conds         0
 _dewptm        0
 _fog           0
 _hail          0
 _heatindexm    0
 _hum           0
 _pressurem     0
 _rain          0
 _snow          0
 _tempm         0
 _thunder       0
 _tornado       0
 _vism          0
 _wdird         0
 _wdire         0
 _wgustm        0
 _windchillm    0
 _wspdm         0
dtype: int64

In [59]:
def extract_year(value):
    return (value[0:4])
def extract_month(value):
    return (value[4:6])
df["year"] = df["datetime_utc"].apply(lambda x:extract_year(x))
df["month"] = df["datetime_utc"].apply(lambda x:extract_month(x))

## Building the model 

In [68]:
dummy = pd.get_dummies(df[' _wdire'])
dummy

Unnamed: 0,ENE,ESE,East,NE,NNE,NNW,NW,North,SE,SSE,SSW,SW,South,Variable,WNW,WSW,West
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100985,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
100986,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
100987,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
100988,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False


In [69]:
le = LabelEncoder()

In [70]:
merge = pd.concat([dummy,df],axis='columns')
final = merge.drop([' _wdire','datetime_utc'],axis='columns')
final.head()

Unnamed: 0,ENE,ESE,East,NE,NNE,NNW,NW,North,SE,SSE,...,_tempm,_thunder,_tornado,_vism,_wdird,_wgustm,_windchillm,_wspdm,year,month
0,False,False,False,False,False,False,False,False,False,False,...,30.0,0,0,5.0,280.0,37.67,5.71,7.4,1996,11
1,False,False,False,False,False,False,False,True,False,False,...,28.0,0,0,2.4,0.0,37.67,5.71,7.68,1996,11
2,False,False,False,False,False,False,False,True,False,False,...,24.0,0,0,2.4,0.0,37.67,5.71,7.68,1996,11
3,False,False,False,False,False,False,False,True,False,False,...,24.0,0,0,2.0,0.0,37.67,5.71,7.68,1996,11
4,False,False,False,False,False,False,False,True,False,False,...,23.0,0,0,1.2,0.0,37.67,5.71,0.0,1996,11


In [71]:
final[' _conds_n']=le.fit_transform(final[' _conds'])
final = final.drop([' _conds'],axis ='columns')
final.head()

Unnamed: 0,ENE,ESE,East,NE,NNE,NNW,NW,North,SE,SSE,...,_thunder,_tornado,_vism,_wdird,_wgustm,_windchillm,_wspdm,year,month,_conds_n
0,False,False,False,False,False,False,False,False,False,False,...,0,0,5.0,280.0,37.67,5.71,7.4,1996,11,31
1,False,False,False,False,False,False,False,True,False,False,...,0,0,2.4,0.0,37.67,5.71,7.68,1996,11,31
2,False,False,False,False,False,False,False,True,False,False,...,0,0,2.4,0.0,37.67,5.71,7.68,1996,11,31
3,False,False,False,False,False,False,False,True,False,False,...,0,0,2.0,0.0,37.67,5.71,7.68,1996,11,31
4,False,False,False,False,False,False,False,True,False,False,...,0,0,1.2,0.0,37.67,5.71,0.0,1996,11,31


In [72]:
X = final.drop([' _conds_n'],axis='columns')
X.shape

(100990, 35)

In [73]:
y=final[' _conds_n']
y.shape

(100990,)

In [74]:
X_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [75]:
scaler = MinMaxScaler()

In [76]:
X_train = scaler.fit_transform(X_train)

In [77]:
x_test = scaler.transform(x_test)

In [78]:
model1 = RandomForestClassifier(n_estimators=100, random_state=42)
model2 = SVC(kernel='linear', probability=True, random_state=42)

In [79]:
model2.fit(X_train, y_train)

In [80]:
model1.fit(X_train, y_train)

In [81]:
pred1 = model1.predict(x_test)

In [82]:
pred2 = model2.predict(x_test)

In [83]:
acc1 = accuracy_score(y_test, pred1)
print("Random Forest Accuracy:", acc1)

Random Forest Accuracy: 0.8243390434696505


In [84]:
acc2 = accuracy_score(y_test, pred2)
print("SVM Accuracy:", acc2)

SVM Accuracy: 0.690167343301317


In [92]:
print(df.columns)

Index(['datetime_utc', ' _conds', ' _dewptm', ' _fog', ' _hail',
       ' _heatindexm', ' _hum', ' _pressurem', ' _rain', ' _snow', ' _tempm',
       ' _thunder', ' _tornado', ' _vism', ' _wdird', ' _wdire', ' _wgustm',
       ' _windchillm', ' _wspdm', 'year', 'month'],
      dtype='object')


In [93]:
print(df[' _wdire'])

0          West
1         North
2         North
3         North
4         North
          ...  
100985       NW
100986       NW
100987     West
100988       NW
100989       NW
Name:  _wdire, Length: 100990, dtype: object


In [94]:
def predict_weather(datetime_utc, _dewptm, _fog, _hail, _heatindexm, _hum, _pressurem, _rain, _snow, _tempm, _thunder, _tornado, _vism, _wdird, _wgustm, _windchillm, _wspdm, _wdire):
    # Create a DataFrame with the input features
    input_data = pd.DataFrame({
        'datetime_utc': [datetime_utc],
        ' _dewptm': [_dewptm],
        ' _fog': [_fog],
        ' _hail': [_hail],
        ' _heatindexm': [_heatindexm],
        ' _hum': [_hum],
        ' _pressurem': [_pressurem],
        ' _rain': [_rain],
        ' _snow': [_snow],
        ' _tempm': [_tempm],
        ' _thunder': [_thunder],
        ' _tornado': [_tornado],
        ' _vism': [_vism],
        ' _wdird': [_wdird],
        ' _wgustm': [_wgustm],
        ' _windchillm': [_windchillm],
        ' _wspdm': [_wspdm],
        ' _wdire': [_wdire]
    })

    # Transform categorical features using LabelEncoder
    input_data[' _wdire'] = le.transform(input_data[' _wdire'])
    
    # Use the trained model to make predictions
    input_data = input_data.drop(['datetime_utc'],axis='columns')
    input_data = scaler.transform(input_data)
    prediction = model1.predict(input_data)
    
    # Inverse transform the predicted label
    prediction_label = le.inverse_transform(prediction)[0]
    
    return prediction_label

In [95]:
# Example usage
datetime_utc = '19961101-11:00'
_dewptm = 9.0
_fog = 0
_hail = 0
_heatindexm = math.nan
_hum = 27.0
_pressurem = 1010.0
_rain = 0
_snow = 0
_tempm = 30.0
_thunder = 0
_tornado = 0
_vism = 5.0
_wdird = 280.0
_wgustm = math.nan
_windchillm = math.nan
_wspdm = 7.4
_wdire = 'West'


In [87]:
prediction = predict_weather(datetime_utc, _dewptm, _fog, _hail, _heatindexm, _hum, _pressurem, _rain, _snow, _tempm, _thunder, _tornado, _vism, _wdird, _wgustm, _windchillm, _wspdm, _wdire)
print("Predicted weather condition:", prediction)

ValueError: y contains previously unseen labels: 'West'