In [348]:
import pandas 
import numpy as np
from sklearn.metrics import   accuracy_score,  precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder
from sklearn.preprocessing import StandardScaler



In [349]:
# Read the weather observations; the path is relative to the working directory.
data = pandas.read_csv(filepath_or_buffer='weather_forecast_data.csv')
# Leaving the frame as the cell's last expression makes the notebook render it.
data



Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,19.096119,71.651723,14.782324,48.699257,987.954760,no rain
1,27.112464,84.183705,13.289986,10.375646,1035.430870,no rain
2,20.433329,42.290424,7.216295,6.673307,1033.628086,no rain
3,19.576659,40.679280,4.568833,55.026758,1038.832300,no rain
4,19.828060,93.353211,0.104489,30.687566,1009.423717,no rain
...,...,...,...,...,...,...
2495,14.684023,82.054139,8.751728,58.939058,1003.418337,rain
2496,20.754521,92.099534,17.305508,70.889921,1049.801435,rain
2497,22.087516,71.530065,0.857918,84.162554,1039.664865,rain
2498,18.542453,97.451961,5.429309,54.643893,1014.769130,rain


In [350]:
# Names of the numeric predictor columns used by the preprocessing helpers.
numeric_features = [
    'Temperature',
    'Humidity',
    'Wind_Speed',
    'Cloud_Cover',
    'Pressure',
]
# Name of the label column the models are trained to predict.
target = 'Rain'
 
        ##  Task 1: Preprocessing
        
# 1.1: Handling missing values
def handel_missing_values_technique1(data, features=None):
    """Fill missing numeric values with the mean of their column.

    The input frame is left untouched; the imputation is applied to a copy,
    so the caller's raw data stays available and no chained-assignment
    warning is triggered when `data` is itself a slice of another frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to impute.
    features : list of str, optional
        Columns to impute. Defaults to the module-level `numeric_features`.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` in which every NaN of the selected columns is replaced
        by that column's mean. A column that is entirely NaN stays NaN because
        its mean is undefined.
    """
    if features is None:
        features = numeric_features
    features = list(features)

    filled = data.copy()
    numeric_data = filled[features]
    # mean() skips NaN by default, so each column's mean uses observed values only.
    filled[features] = numeric_data.fillna(numeric_data.mean())
    return filled

def handel_missing_values_technique2(data):
    """Return `data` without the rows that contain at least one missing value."""
    return data.dropna()


# Column summary before imputation: the non-null counts expose the gaps.
print("Before handling missing values")
data.info()

# Apply mean imputation (technique 1) and rebind `data` to the result.
# NOTE(review): the column means are computed over the whole dataset, i.e.
# before the train/test split done later - confirm this is acceptable.
data = handel_missing_values_technique1(data)

# Same summary afterwards, to confirm every numeric column is complete.
print("After handling missing values")
data.info()



Before handling missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2475 non-null   float64
 1   Humidity     2460 non-null   float64
 2   Wind_Speed   2468 non-null   float64
 3   Cloud_Cover  2467 non-null   float64
 4   Pressure     2473 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB
After handling missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2500 non-null   float64
 1   Humidity     2500 non-null   float64
 2   Wind_Speed   2500 non-null   float64
 3   Cloud_Cover  2500 non-null   float64
 4   Pressure     2500 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB

In [351]:
# Summary statistics of the numeric columns of the imputed frame.
data.describe()

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure
count,2500.0,2500.0,2500.0,2500.0,2500.0
mean,22.573777,64.366909,9.911826,49.80877,1014.409327
std,7.295628,19.813325,5.743575,28.869772,20.072933
min,10.001842,30.005071,0.009819,0.015038,980.014486
25%,16.417898,47.493987,4.829795,24.817296,997.190281
50%,22.573777,64.366909,9.911826,49.80877,1014.09539
75%,28.934369,81.445049,14.88966,74.98941,1031.606187
max,34.995214,99.997481,19.999132,99.997795,1049.985593


In [352]:
# 1.2 Scaling and Encoding
# Shared module-level transformers: scale_data() fits/applies `scaler`,
# encode_target() fits `labelEncoder`.
scaler = StandardScaler()
labelEncoder = LabelEncoder()

def scale_data(data, isTestData):
    """Standardise the feature columns with the shared module-level `scaler`.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Feature values, one column per entry of `numeric_features`.
    isTestData : bool
        False: fit `scaler` on `data` and transform it (training data).
        True: only transform, reusing the statistics of the previous fit, so
        the scaler must already have been fitted on the training data.

    Returns
    -------
    pandas.DataFrame
        Scaled values with `numeric_features` as column names. The row index
        of `data` is kept when it has one, so the result still lines up with
        the label Series produced by the same train/test split.
    """
    if not isTestData:
        scaled = scaler.fit_transform(data)
    else:
        scaled = scaler.transform(data)
    return pandas.DataFrame(
        scaled,
        columns=numeric_features,
        # Plain arrays have no index; None falls back to the default RangeIndex.
        index=getattr(data, 'index', None),
    )

def encode_target(data):
    """Encode the `target` column of `data` as integer class labels.

    The shared module-level `labelEncoder` is (re)fitted on the column each
    time this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the `target` column.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named `target` holding the integer codes. The row
        index of the input column is kept when it has one, so the labels stay
        aligned with the feature rows they belong to.
    """
    labels = data[target]
    encoded = labelEncoder.fit_transform(labels)
    return pandas.DataFrame(
        encoded,
        columns=[target],
        # Inputs without an index fall back to the default RangeIndex.
        index=getattr(labels, 'index', None),
    )

# Feature matrix and integer-encoded label vector.
x = data[numeric_features]
y = encode_target(data)[target]

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit the scaler on the training rows, then reuse that fit for the test rows.
x_train = scale_data(x_train, isTestData=False)
x_test = scale_data(x_test, isTestData=True)





In [353]:
class KNN:
    """Minimal k-nearest-neighbours classifier for binary (0/1) labels.

    Distances are Euclidean. A sample is predicted as class 1 when strictly
    more than half of its nearest neighbours carry label 1; ties go to 0.
    """

    def __init__(self, k):
        """Store the number of neighbours to consult.

        Raises
        ------
        ValueError
            If `k` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (ndarray, DataFrame or nested list).
        y : array-like of shape (n_samples,)
            Training labels, expected to be 0 or 1.

        Raises
        ------
        ValueError
            If `X` and `y` do not contain the same number of samples.
        """
        # Convert to plain arrays so that later indexing is positional even
        # when pandas objects with a non-default index are passed in.
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y).ravel()
        if len(self.X) != len(self.y):
            raise ValueError("X and y must contain the same number of samples")

    def get_nearest_neighbors(self, crnt_X):
        """Return the positions of the (at most) k training rows closest to `crnt_X`."""
        distances = np.sqrt(((self.X - crnt_X) ** 2).sum(axis=1))  # Euclidean distance
        nearest_neighbors_indices = distances.argsort()[:self.k]
        return nearest_neighbors_indices

    def predict(self, X):
        """Predict a 0/1 label for every row of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        list of int
            One prediction per row of `X`.
        """
        predictions = []
        # np.asarray makes the loop iterate over rows; iterating a DataFrame
        # directly would yield its column names instead.
        for crnt_X in np.asarray(X, dtype=float):
            nearest_neighbors_indices = self.get_nearest_neighbors(crnt_X)
            cnt = np.sum(self.y[nearest_neighbors_indices])
            # Compare against the neighbours actually found: when k exceeds
            # the training-set size there are fewer than k votes.
            if cnt > len(nearest_neighbors_indices) / 2:
                prediction = 1
            else:
                prediction = 0
            predictions.append(prediction)
        return predictions
        
        

In [354]:
class Metrics:
    """Accuracy, precision and recall of a set of predictions, plus a printer."""

    def __init__(self, y_test, predictions):
        """Compute the three scores for `predictions` against the truth `y_test`."""
        self.accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
        self.precision = precision_score(y_true=y_test, y_pred=predictions)
        self.recall = recall_score(y_true=y_test, y_pred=predictions)

    def show(self):
        """Print one line per score followed by a separator line."""
        report = (
            ("Accuracy: ", self.accuracy),
            ("Precision: ", self.precision),
            ("Recall: ", self.recall),
        )
        for label, value in report:
            print(label, value)
        print("=====================================")

In [355]:
# Scratch KNN: evaluate the hand-written classifier for several neighbour counts.
k_values = [1, 3, 5, 7, 100]

for k in k_values:
    knn = KNN(k)
    # The scratch model works on raw arrays, so hand it the underlying values.
    knn.fit(x_train.to_numpy(), y_train.to_numpy())
    predictions = knn.predict(x_test.to_numpy())
    print("K = ", k)
    Metrics(y_test, predictions).show()
    

K =  1
Accuracy:  0.95
Precision:  0.8571428571428571
Recall:  0.8
K =  3
Accuracy:  0.966
Precision:  0.9027777777777778
Recall:  0.8666666666666667
K =  5
Accuracy:  0.96
Precision:  0.8873239436619719
Recall:  0.84
K =  7
Accuracy:  0.956
Precision:  0.9076923076923077
Recall:  0.7866666666666666
K =  100
Accuracy:  0.926
Precision:  1.0
Recall:  0.5066666666666667


In [356]:
from sklearn.neighbors import KNeighborsClassifier

# Library k-NN for comparison with the scratch implementation; fit() returns
# the estimator itself, so construction and training can be chained.
# NOTE(review): n_neighbors=9 is not one of the k values tried with the
# scratch model - confirm the intended comparison.
sklearn_knn = KNeighborsClassifier(n_neighbors=9).fit(x_train, y_train)
predictions = sklearn_knn.predict(x_test)

Metrics(y_test, predictions).show()


Accuracy:  0.964
Precision:  0.9253731343283582
Recall:  0.8266666666666667


In [357]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the scaled features; fit() returns the estimator,
# so construction and training are chained.
naive_bayes = GaussianNB().fit(x_train, y_train)
predictions = naive_bayes.predict(x_test)

Metrics(y_test, predictions).show()

Accuracy:  0.952
Precision:  1.0
Recall:  0.68


In [358]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the scaled features. The estimator's tie-breaking between
# equally good splits is randomised, so the seed is fixed (same value as the
# train/test split) to make the reported metrics reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(x_train, y_train)
predictions = decision_tree.predict(x_test)

Metrics(y_test, predictions).show()


Accuracy:  0.996
Precision:  1.0
Recall:  0.9733333333333334
