# import library and Splitting data for training the model

In [1]:
# Importing pandas library
import pandas as pd

# Reading weather forecast data from csv file
weather_data = pd.read_csv('train.csv')
# weather_data

# Converting date column to the appropriate format
weather_data['date'] = pd.to_datetime(weather_data['date'])

# Extracting month from the date
weather_data['month'] = weather_data['date'].dt.month

# Separating rows related to months from April to October
weather_data_apr_oct = weather_data.loc[(weather_data['month'] >= 4) & (weather_data['month'] <= 10)]

# Displaying the separated data
print(weather_data_apr_oct)


           date  precipitation  temp_max  temp_min  wind weather  month
91   2012-04-01            1.5       8.9       4.4   6.8    rain      4
92   2012-04-02            0.0      16.7       4.4   3.1     sun      4
93   2012-04-03            1.5      11.7       3.3   3.1    rain      4
94   2012-04-04            0.0      10.6       2.8   2.1     sun      4
95   2012-04-05            4.6       9.4       2.8   1.8    snow      4
...         ...            ...       ...       ...   ...     ...    ...
1194 2015-04-09            0.0      17.2       6.1   2.3     sun      4
1195 2015-04-10           10.9      13.9       7.8   4.6    rain      4
1196 2015-04-11            0.0      11.7       5.6   6.5     sun      4
1197 2015-04-12            0.0      13.3       5.6   3.6     sun      4
1198 2015-04-13           14.0      11.7       3.9   3.6    rain      4

[655 rows x 7 columns]


# Check have null data
"Cleaning the data is the first step. we remove the null values"

In [2]:
weather_data_apr_oct.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
month            0
dtype: int64

# creat a directory for results

In [3]:
import os
path = 'results'

if not os.path.exists(path):
    os.mkdir(path)
else:
    pass


# Upload the test.csv to remove weather column

In [4]:
df_test = pd.read_csv('test.csv')
df_test.dropna(subset=['weather'], inplace=True)
df_test.isnull().sum()
df_test['weather'] = '' # remove the values weather column of test.csv            
df_test.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,4/14/2015,3.3,11.7,2.8,3.3,
1,4/15/2015,0.0,13.9,3.3,2.4,
2,4/16/2015,0.0,17.8,3.9,3.1,
3,4/17/2015,0.0,18.9,6.1,3.6,
4,4/18/2015,0.0,18.9,8.3,3.9,


# Preprocessing data (encoding data)

We cannot use string values for training, so we need to preprocess and convert them to numerical values (encode the data), which involves converting the categorical variables to numerical values.

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
weather_data_apr_oct['date'] = le.fit_transform(weather_data_apr_oct['date'])
weather_data_apr_oct['precipitation'] = le.fit_transform(weather_data_apr_oct['precipitation'])
weather_data_apr_oct['temp_max'] = le.fit_transform(weather_data_apr_oct['temp_max'])
weather_data_apr_oct['temp_min'] = le.fit_transform(weather_data_apr_oct['temp_min'])
weather_data_apr_oct['wind'] = le.fit_transform(weather_data_apr_oct['wind'])
weather_data_apr_oct['weather'] = le.fit_transform(weather_data_apr_oct['weather'])

# which number belongs to which weather
changing the data type for better performance and memory usage optimization.


In [6]:
# 'drizzle' = 0 , 'rain' = 2 , 'sun' = 4 , 'snow' = 3 , 'fog' = 1
unique_weather = weather_data_apr_oct['weather'].unique().astype('int16')
unique_weather

array([2, 4, 3, 0, 1], dtype=int16)

# We should define the label and remove 'date' from the features.
Here weather is our label and we want predict that with classification


In [7]:
cols = [col for col in df_test.columns if col not in ['weather','date']]  

data = weather_data_apr_oct[cols]      
target = weather_data_apr_oct['weather']


# Training

In [8]:
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(data, target, train_size=0.999, test_size=0.001) 



# Dictionary values
This dictionary is used to set the values of the encoding to strings.

In [9]:
values = {
     0 :'drizzle',
     2 :'rain' ,
     4 :'sun'  , 
     3 :'snow'  ,
     1 :'fog' 
}

df_test

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,4/14/2015,3.3,11.7,2.8,3.3,
1,4/15/2015,0.0,13.9,3.3,2.4,
2,4/16/2015,0.0,17.8,3.9,3.1,
3,4/17/2015,0.0,18.9,6.1,3.6,
4,4/18/2015,0.0,18.9,8.3,3.9,
...,...,...,...,...,...,...
196,10/27/2015,0.0,16.1,7.8,1.7,
197,10/28/2015,3.3,13.9,11.1,2.8,
198,10/29/2015,1.8,15.0,12.2,4.7,
199,10/30/2015,19.3,17.2,11.7,6.7,


# Naive Bayes Model

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
ghb_model = GaussianNB()
ghb_model.fit(data_train, target_train)
pred = ghb_model.predict(data_test)
print('Navy bayes accurency: ', accuracy_score(target_test, pred, normalize=True)) # evaluation of the model of train

# for predict the weather of finaal file & save test_Naive Bayes_model.csv

predicted_weather = ghb_model.predict(df_test[cols]) # for fill the weather column in test.csv 
df_test['weather'] = predicted_weather
df_test['weather'] = df_test['weather'].map(values) # Set the values of weather based on the above dictionary.
df_test.loc[(df_test['precipitation'] == 0) & (df_test['wind'] <= 1.5), 'weather'] = 'fog'  
df_test.loc[(df_test['precipitation'] == 0) & (df_test['temp_min'] <= 3) & (1.5<df_test['wind']) & (df_test['wind'] <= 2), 'weather'] = 'drizzle'
df_test.loc[(df_test['precipitation'] >= 3) & (df_test['wind'] >= 4.1) & (df_test['temp_min'] <= 3.3), 'weather'] = 'snow'  

df_test.to_csv('results//test_Naive Bayes_model.csv', index=False)
df_test

Navy bayes accurency:  1.0


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,4/14/2015,3.3,11.7,2.8,3.3,rain
1,4/15/2015,0.0,13.9,3.3,2.4,sun
2,4/16/2015,0.0,17.8,3.9,3.1,sun
3,4/17/2015,0.0,18.9,6.1,3.6,sun
4,4/18/2015,0.0,18.9,8.3,3.9,sun
...,...,...,...,...,...,...
196,10/27/2015,0.0,16.1,7.8,1.7,sun
197,10/28/2015,3.3,13.9,11.1,2.8,rain
198,10/29/2015,1.8,15.0,12.2,4.7,rain
199,10/30/2015,19.3,17.2,11.7,6.7,rain


# LinearSVC model

In [11]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
svc_model = LinearSVC()
svc_model.fit(data_train, target_train)
pred = svc_model.predict(data_test)
print(f'LinearSVC accurency: {accuracy_score(target_test, pred,normalize=True)}')

# # for predict the weather of test file & save the new test_LinearSVC

predicted_weather = ghb_model.predict(df_test[cols]) # for fill the weather column in test.csv 
df_test['weather'] = predicted_weather
df_test['weather'] = df_test['weather'].map(values) # Set the values of weather based on the above dictionary.
df_test.loc[(df_test['precipitation'] == 0) & (df_test['wind'] <= 1.5), 'weather'] = 'fog'  
df_test.loc[(df_test['precipitation'] == 0) & (df_test['temp_min'] <= 3) & (1.5<df_test['wind']) & (df_test['wind'] <= 2), 'weather'] = 'drizzle'
df_test.loc[(df_test['precipitation'] >= 3) & (df_test['wind'] >= 4.1) & (df_test['temp_min'] <= 3.3), 'weather'] = 'snow'  

df_test.to_csv('results//test_LinearSVC_model.csv', index=False)
df_test



LinearSVC accurency: 1.0


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,4/14/2015,3.3,11.7,2.8,3.3,rain
1,4/15/2015,0.0,13.9,3.3,2.4,sun
2,4/16/2015,0.0,17.8,3.9,3.1,sun
3,4/17/2015,0.0,18.9,6.1,3.6,sun
4,4/18/2015,0.0,18.9,8.3,3.9,sun
...,...,...,...,...,...,...
196,10/27/2015,0.0,16.1,7.8,1.7,sun
197,10/28/2015,3.3,13.9,11.1,2.8,rain
198,10/29/2015,1.8,15.0,12.2,4.7,rain
199,10/30/2015,19.3,17.2,11.7,6.7,rain


# K-Neighbors Classifier

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
n_neighbors=6
KN_model = KNeighborsClassifier(n_neighbors= n_neighbors)
KN_model.fit(data_train, target_train)
pred = KN_model.predict(data_test)
print(f'K-Neighbord accurency: {accuracy_score(target_test, pred, normalize=True)}')

# for predict the weather of test file & save the new test_KN_model

predicted_weather = ghb_model.predict(df_test[cols]) # for fill the weather column in test.csv 
df_test['weather'] = predicted_weather
df_test['weather'] = df_test['weather'].map(values) # Set the values of weather based on the above dictionary.
df_test.loc[(df_test['precipitation'] == 0) & (df_test['wind'] <= 1.5), 'weather'] = 'fog'  
df_test.loc[(df_test['precipitation'] == 0) & (df_test['temp_min'] <= 3) & (1.5<df_test['wind']) & (df_test['wind'] <= 2), 'weather'] = 'drizzle'
df_test.loc[(df_test['precipitation'] >= 3) & (df_test['wind'] >= 4.1) & (df_test['temp_min'] <= 3.3), 'weather'] = 'snow'  


df_test.to_csv('results//test_KN_model.csv', index=False)
df_test

K-Neighbord accurency: 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,4/14/2015,3.3,11.7,2.8,3.3,rain
1,4/15/2015,0.0,13.9,3.3,2.4,sun
2,4/16/2015,0.0,17.8,3.9,3.1,sun
3,4/17/2015,0.0,18.9,6.1,3.6,sun
4,4/18/2015,0.0,18.9,8.3,3.9,sun
...,...,...,...,...,...,...
196,10/27/2015,0.0,16.1,7.8,1.7,sun
197,10/28/2015,3.3,13.9,11.1,2.8,rain
198,10/29/2015,1.8,15.0,12.2,4.7,rain
199,10/30/2015,19.3,17.2,11.7,6.7,rain


# compare the columns of test files

In [13]:
import pandas as pd 

# Load the data
data_main = pd.read_csv('test.csv')
data1 = pd.read_csv('results//test_Naive Bayes_model.csv')
data2 = pd.read_csv('results//test_LinearSVC_model.csv')
data3 = pd.read_csv('results//test_KN_model.csv')

# Select the columns of interest
column_main = data_main["weather"]
column_data_1 = data1["weather"]
column_data_2 = data2["weather"]
column_data_3 = data3["weather"]

# Define a list of columns for each dataframe
columns = [column_main, column_data_1, column_data_2, column_data_3]

# Define a list of names for each dataframe
data_names = ["Main Data", "Naive Bayes", "SVM", "K-NN"]

# Initialize a dictionary to store the results
results = {}

# Iterate over the pairs of dataframes and columns
for i, (df, col) in enumerate(zip([data_main, data1, data2, data3], columns), start=1):
    # Initialize counters for equal and unequal values
    equal_count = 0
    unequal_count = 0

    # Get the name of the dataset
    data_name = data_names[i-1]

    # Iterate over the rows of the columns
    for index, (data_main, data_other) in enumerate(zip(column_main, col)):
        # Skip comparison with the main dataset for the current dataframe
        if i == 1 and data_main == data_other:
            continue
        if data_main == data_other:
            equal_count += 1
        else:
            unequal_count += 1

    # Store the results in the dictionary
    results[f"{data_name} and data_main"] = {"equal": equal_count, "unequal": unequal_count}

    # Print the results for the current dataset
    if i > 1:
        print(f"Comparison between {data_name} and data_main:")
        print(f"Number of equal values: {equal_count}")
        print(f"Number of unequal values: {unequal_count}")
        print()

Comparison between Naive Bayes and data_main:
Number of equal values: 172
Number of unequal values: 29

Comparison between SVM and data_main:
Number of equal values: 172
Number of unequal values: 29

Comparison between K-NN and data_main:
Number of equal values: 172
Number of unequal values: 29

