# TRYING OUT CLASSIFICATION MODELS TO PREDICT IF THE WEATHER IS RAINY OR NOT BASED ON DIFFERENT PARAMETERS

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **Importing the dataset**

In [2]:
dataset = pd.read_csv('/content/Weather Data.csv')
dataset.head(8)

Unnamed: 0,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,-1.8,-3.9,86,4,8.0,101.24,Fog
1,-1.8,-3.7,87,4,8.0,101.24,Fog
2,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,-1.5,-3.3,88,7,4.8,101.23,Fog
5,-1.4,-3.3,87,9,6.4,101.27,Fog
6,-1.5,-3.1,89,7,6.4,101.29,Fog
7,-1.4,-3.6,85,7,8.0,101.26,Fog


In [3]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
print(X)

[[ -1.8   -3.9   86.     4.     8.   101.24]
 [ -1.8   -3.7   87.     4.     8.   101.24]
 [ -1.8   -3.4   89.     7.     4.   101.26]
 ...
 [ -0.5   -1.5   93.    28.     4.8   99.95]
 [ -0.2   -1.8   89.    28.     9.7   99.91]
 [  0.    -2.1   86.    30.    11.3   99.89]]


In [5]:
print(y)

['Fog' 'Fog' 'Freezing Drizzle,Fog' ... 'Snow' 'Snow' 'Snow']


In [6]:
X.shape

(8784, 6)

In [7]:
y.shape

(8784,)

# **Data preprocessing**

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Temp_C            8784 non-null   float64
 1   Dew Point Temp_C  8784 non-null   float64
 2   Rel Hum_%         8784 non-null   int64  
 3   Wind Speed_km/h   8784 non-null   int64  
 4   Visibility_km     8784 non-null   float64
 5   Press_kPa         8784 non-null   float64
 6   Weather           8784 non-null   object 
dtypes: float64(4), int64(2), object(1)
memory usage: 480.5+ KB


In [9]:
dataset.shape

(8784, 7)

In [10]:
dataset.describe()

Unnamed: 0,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa
count,8784.0,8784.0,8784.0,8784.0,8784.0,8784.0
mean,8.798144,2.555294,67.431694,14.945469,27.664447,101.051623
std,11.687883,10.883072,16.918881,8.688696,12.622688,0.844005
min,-23.3,-28.5,18.0,0.0,0.2,97.52
25%,0.1,-5.9,56.0,9.0,24.1,100.56
50%,9.3,3.3,68.0,13.0,25.0,101.07
75%,18.8,11.8,81.0,20.0,25.0,101.59
max,33.0,24.4,100.0,83.0,48.3,103.65


# **MISSING VALUES **

In [11]:
dataset.isnull().sum()

Temp_C              0
Dew Point Temp_C    0
Rel Hum_%           0
Wind Speed_km/h     0
Visibility_km       0
Press_kPa           0
Weather             0
dtype: int64

No missing values in this dataset

## **#Drop Date/Time column**

In [12]:
dataset

Unnamed: 0,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,-1.8,-3.9,86,4,8.0,101.24,Fog
1,-1.8,-3.7,87,4,8.0,101.24,Fog
2,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,-1.5,-3.3,88,7,4.8,101.23,Fog
...,...,...,...,...,...,...,...
8779,0.1,-2.7,81,30,9.7,100.13,Snow
8780,0.2,-2.4,83,24,9.7,100.03,Snow
8781,-0.5,-1.5,93,28,4.8,99.95,Snow
8782,-0.2,-1.8,89,28,9.7,99.91,Snow


i dropped this column manually since كنش راضي ينعملوو drop hunaa
ERROR 503 :)

In [13]:
# Print the column names of the DataFrame
#print(dataset.columns)

In [14]:
# Convert the numpy array to a pandas dataframe
#X = pd.DataFrame(X)

# Drop the 'Date/Time' column
#dataset.drop('Date/Time', axis=1, inplace=True)

# **Categorical values**

In [15]:
# List of categories representing rainy weather conditions
rainy_categories = ['Drizzle', 'Rain', 'Rain Showers', 'Moderate Rain,Fog',
                    'Rain Showers,Fog', 'Rain Showers,Snow Showers',
                    'Rain,Fog', 'Rain,Ice Pellets', 'Rain,Snow',
                    'Rain,Snow Grains', 'Rain,Snow,Fog', 'Rain,Snow,Ice Pellets',
                    'Thunderstorms', 'Thunderstorms,Heavy Rain Showers',
                    'Thunderstorms,Moderate Rain Showers,Fog', 'Thunderstorms,Rain',
                    'Thunderstorms,Rain Showers', 'Thunderstorms,Rain Showers,Fog',
                    'Thunderstorms,Rain,Fog']

# Map categories to "Rainy" or "Not Rainy"
y_binary = ['Rainy' if category in rainy_categories else 'Not Rainy' for category in y]

# Print the first few elements of y_binary to verify the transformation
print(y_binary[:10])  # Print the first 10 elements as an example


['Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy', 'Not Rainy']


In [16]:
# Assuming y_binary contains the labels "Rainy" and "Not Rainy"
y_binary_numeric = [1 if label == 'Rainy' else 0 for label in y_binary]

# Print the first few elements of y_binary_numeric to verify the transformation
print(y_binary_numeric[:10])  # Print the first 10 elements as an example


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


**Splitting the dataset into the Training set and Test set**

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [18]:
print(X_train.shape)

(7027, 6)


In [19]:
print(X_test.shape)

(1757, 6)


In [20]:
print(y_train.shape)

(7027,)


In [21]:
print(y_test.shape)

(1757,)


# **Logistic Regression Model**

In [22]:
from sklearn.linear_model import LogisticRegression
model_Logistic = LogisticRegression()
model_Logistic.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


##Prediction

In [23]:
y_pred_Logistic = model_Logistic.predict(X_test)
print(y_pred_Logistic)

['Rain' 'Clear' 'Mainly Clear' ... 'Clear' 'Mainly Clear' 'Mostly Cloudy']


In [24]:
print(y_test)

['Rain' 'Clear' 'Mainly Clear' ... 'Cloudy' 'Mainly Clear' 'Rain Showers']


# **EVALUATION**

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred_Logistic)*100)
print(confusion_matrix(y_test, y_pred_Logistic))

36.08423449060899
[[ 76  47   0   0   0   0   2   0   0   0   0   0   0 102  35   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 38 149   0   0   0   0   0   0   0   0   0   0   0  58  50   7   0   0
    0   0   0   0   0   9   0   0   0   0   0   0   0]
 [  0   3   0   0   0   0   1   0   0   0   0   0   0   0   0   2   0   0
    0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   8   0   0   0   0   0   0   0   0   1   0   0
    0   9   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0  25   0   0   0   0   0   0   0   0   0   0   0
    0   4   0   0   0   3   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0

#Decision Tree Model

In [26]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

In [27]:
y_pred_decision_tree = classifier.predict(X_test)
print(y_pred_decision_tree)

['Rain' 'Clear' 'Clear' ... 'Cloudy' 'Mainly Clear' 'Rain Showers']


# **EVALUATION**

In [28]:
print(accuracy_score(y_test, y_pred_decision_tree)*100)
print(confusion_matrix(y_test, y_pred_decision_tree))

43.02788844621514
[[115  19   0 ...   0   0   0]
 [ 27 139   3 ...   0   0   0]
 [  0   0   3 ...   0   0   0]
 ...
 [  0   1   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]


# **Random Forest**

In [29]:
from sklearn.ensemble import RandomForestClassifier
classifier2 = RandomForestClassifier(n_estimators=100, random_state=1)
classifier2.fit(X_train, y_train)

# **PREDICTION**

In [30]:
y_pred_random_forest = classifier2.predict(X_test)
print(y_pred_random_forest)

['Rain' 'Clear' 'Mainly Clear' ... 'Cloudy' 'Mainly Clear' 'Mostly Cloudy']


In [31]:
y_test = pd.Series(y_test)
print(y_test.values)

['Rain' 'Clear' 'Mainly Clear' ... 'Cloudy' 'Mainly Clear' 'Rain Showers']


# **EVALUATION**

In [32]:
print(accuracy_score(y_test, y_pred_random_forest)*100)
print(confusion_matrix(y_test, y_pred_random_forest))

52.36198064883324
[[141  13   0 ...   0   0   0]
 [ 13 186   0 ...   0   0   0]
 [  0   2   1 ...   0   0   0]
 ...
 [  0   1   0 ...   0   0   0]
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]]


# **XGBOOST**

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_numeric, test_size=0.2, random_state=1)

In [37]:
from xgboost import XGBClassifier
classifier3 = XGBClassifier()
classifier3.fit(X_train, y_train)

##Prediction

In [38]:
y_pred_xgboost = classifier3.predict(X_test)
print(y_pred_xgboost)

[1 0 0 ... 0 0 0]


# **EVALUATION**

In [39]:
print(accuracy_score(y_test, y_pred_xgboost)*100)
print(confusion_matrix(y_test, y_pred_xgboost))

94.30848036425725
[[1576   22]
 [  78   81]]


# **CONCLUSION**


# Based on the performance of the four models on the dataset:

*   XGBoost significantly outperformed all other models with an accuracy of 94.31%.
*   Random Forest showed the next best performance with an accuracy of 52.36%.
*   Decision Tree had a moderate accuracy of 43.03%.
*   Logistic Regression yielded the lowest accuracy among the models with 36.08%.

In summary, XGBoost demonstrated superior predictive performance, followed by Random Forest, while Decision Tree and Logistic Regression exhibited comparatively lower accuracies.