READING DATA FROM A .CSV FILE

In [7]:
import pandas as pd

df = pd.read_csv('Weather_Prediction_Data.csv')
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,drizzle
1,1/2/2012,10.9,10.6,2.8,4.5,rain
2,1/3/2012,0.8,11.7,7.2,2.3,rain
3,1/4/2012,20.3,12.2,5.6,4.7,rain
4,1/5/2012,1.3,8.9,2.8,6.1,rain


CHECKING IF THE DATA CONSISTS OF ANY NULL VALUES

In [8]:
df.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

CONVERTING THE VALUES OF THE WEATHER COLUMN TO NUMERIC USING LABEL_ENCODER

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['weather'] = le.fit_transform(df['weather'])
df['weather'].unique()

df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,0
1,1/2/2012,10.9,10.6,2.8,4.5,2
2,1/3/2012,0.8,11.7,7.2,2.3,2
3,1/4/2012,20.3,12.2,5.6,4.7,2
4,1/5/2012,1.3,8.9,2.8,6.1,2


NORMALIZING THE REMAINING COLUMNS

In [10]:
cols = ['precipitation', 'temp_max', 'temp_min', 'wind']

for x in cols:
  df[x] = df[x] / df[x].max()

df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,0.359551,0.273224,0.494737,0
1,1/2/2012,0.194991,0.297753,0.153005,0.473684,2
2,1/3/2012,0.014311,0.328652,0.393443,0.242105,2
3,1/4/2012,0.363148,0.342697,0.306011,0.494737,2
4,1/5/2012,0.023256,0.25,0.153005,0.642105,2


REMOVING THE DATE COLUMN

In [11]:
df = df.drop('date', axis = 1)
df.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.0,0.359551,0.273224,0.494737,0
1,0.194991,0.297753,0.153005,0.473684,2
2,0.014311,0.328652,0.393443,0.242105,2
3,0.363148,0.342697,0.306011,0.494737,2
4,0.023256,0.25,0.153005,0.642105,2


SPLITTING THE DATA INTO X AND Y

In [12]:
x = df.drop('weather', axis = 1)
y = df['weather']

SPLITTING X AND Y INTO TRAINING SET AND TESTING SET 

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

USING XGBOOST FOR PREDICTION AND TRAINING IT ON THE TRAINING SET

In [14]:
from xgboost import XGBClassifier

xbc = XGBClassifier()
xbc.fit(x_train, y_train)

PREDICTING VALUES BASED ON THE TESTING SET

In [15]:
y_predict = xbc.predict(x_test)
print(y_predict)

[4 0 2 4 1 2 2 2 4 2 4 2 4 4 1 4 4 2 2 2 2 2 2 4 0 0 4 2 4 4 2 4 2 2 4 2 4
 2 4 2 4 2 2 2 4 4 0 4 4 4 4 4 4 2 2 4 4 2 4 2 4 4 4 4 1 2 4 4 4 2 2 2 2 1
 4 4 4 4 2 2 4 4 2 2 2 0 4 2 4 2 4 4 2 4 2 3 4 4 4 4 2 2 4 2 2 4 2 2 4 4 2
 4 2 4 4 4 4 1 2 2 4 2 2 4 2 4 4 4 4 2 4 2 4 2 2 2 4 4 4 4 2 2 4 4 4 2 2 4
 4 4 4 4 4 2 4 2 4 1 2 4 4 4 2 4 2 2 1 2 2 2 4 4 4 4 2 2 4 4 4 4 4 4 2 4 2
 1 4 2 2 4 4 4 4 4 2 2 4 4 4 4 2 2 4 4 2 2 2 4 4 1 2 2 2 2 1 4 4 2 2 2 4 4
 4 2 4 4 4 4 2 4 0 2 2 2 2 4 2 1 2 2 4 2 4 2 2 4 2 4 0 4 4 4 2 3 4 4 4 4 4
 4 2 1 4 4 4 2 4 2 4 1 4 4 2 2 0 4 2 4 2 4 2 2 2 4 0 4 2 4 2 4 4 2 4]


COMPARING THE ORIGINAL VALUES TO THE PREDICTED VALUES TO SEE HOW WELL THE MODEL PERFORMED

In [16]:
from sklearn.metrics import classification_report, accuracy_score

print('AC : ', accuracy_score(y_test, y_predict) * 100)
print('CR : ', classification_report(y_test, y_predict))

AC :  75.76791808873719
CR :                precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.15      0.07      0.10        29
           2       0.95      0.91      0.93       123
           3       1.00      0.33      0.50         6
           4       0.70      0.85      0.77       125

    accuracy                           0.76       293
   macro avg       0.56      0.43      0.46       293
weighted avg       0.73      0.76      0.74       293



USING GRID_SEARCH_CV TO BETTER TRAIN THE MODEL

In [17]:
from sklearn.model_selection import GridSearchCV

grid  = {'learning_rate' : [0.1, 1, 0.01, 0.001], 'gamma' : [0, 1, 10, 100]}
model = GridSearchCV(xbc, grid, cv = 10, verbose = 2)

model.fit(x_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.4s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.4s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.7s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.4s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.4s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.6s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.6s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.5s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END ...........................gamma=0, learning_rate=1; total time=   0.3s
[CV] END ...........................gamma=0, l

PREDICTING NEW VALUES AS THE MODEL IS NOW UPGRADED

In [18]:
grid_predict = model.predict(x_test)
print(grid_predict)

[4 1 2 4 4 2 2 2 4 2 4 2 4 4 4 4 4 2 2 2 2 2 2 4 4 4 4 2 4 4 2 4 2 2 4 2 4
 2 4 2 4 2 2 2 4 4 4 4 4 4 4 4 4 2 2 4 4 2 4 2 4 4 4 4 4 2 4 4 4 2 2 2 2 4
 4 4 4 4 2 2 4 4 2 2 2 4 4 2 4 2 4 4 2 4 2 3 4 4 4 4 2 4 4 2 2 4 2 2 4 4 2
 4 2 4 4 4 4 4 2 2 4 2 2 4 2 4 4 4 4 2 4 2 4 2 2 2 4 4 4 4 2 2 4 4 4 2 2 4
 4 4 4 4 4 2 4 2 4 4 2 4 4 4 2 4 2 2 4 2 2 2 4 4 4 4 2 2 4 4 4 4 4 4 2 1 2
 4 4 4 2 4 4 4 4 4 2 2 4 4 4 4 2 4 4 4 2 2 2 4 4 4 2 2 2 2 4 4 4 2 2 2 4 4
 4 2 4 4 4 4 2 4 4 2 2 2 2 4 2 4 2 2 4 2 4 2 2 4 2 4 4 4 4 4 2 3 4 4 4 4 4
 4 2 1 4 4 4 2 4 2 4 1 4 4 2 2 4 4 2 4 2 4 2 2 2 4 4 4 2 4 2 4 4 2 4]


COMPARING ORIGINAL VALUES TO THE PREDICTED VALUES TO SEE HOW WELL THE UPGRADED MODEL PERFORMED

In [19]:
print('AC : ', accuracy_score(y_test, grid_predict) * 100)
print('CR : ', classification_report(y_test, grid_predict))

AC :  80.88737201365188
CR :                precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.25      0.03      0.06        29
           2       0.97      0.90      0.93       123
           3       1.00      0.33      0.50         6
           4       0.72      0.98      0.83       125

    accuracy                           0.81       293
   macro avg       0.59      0.45      0.46       293
weighted avg       0.76      0.81      0.76       293



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CHECKING THE BEST ESTIMATOR

In [20]:
print(model.best_estimator_)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
