## Importing the Libraries

In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import jaccard_score, log_loss, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Suppress FutureWarning messages for the rest of the notebook so that
# deprecation notices do not clutter the cell outputs.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## Loading the Dataset

In [3]:
# Location of the weather CSV used throughout this notebook.
WEATHER_DATA_URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv"

# Read the raw observations into a DataFrame; later cells derive the
# features and the target from this frame.
df = pd.read_csv(WEATHER_DATA_URL)

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [5]:
# (rows, columns) of the raw dataset.
df.shape

(3271, 22)

## Preprocessing the Dataset

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0,3271.0
mean,14.877102,23.005564,3.342158,5.175787,7.16897,41.476307,15.077041,19.294405,68.243962,54.698563,1018.334424,1016.003085,4.318557,4.176093,17.821461,21.543656
std,4.55471,4.483752,9.917746,2.757684,3.815966,10.806951,7.043825,7.453331,15.086127,16.279241,7.02009,7.019915,2.526923,2.411274,4.894316,4.297053
min,4.3,11.7,0.0,0.0,0.0,17.0,0.0,0.0,19.0,10.0,986.7,989.8,0.0,0.0,6.4,10.2
25%,11.0,19.6,0.0,3.2,4.25,35.0,11.0,15.0,58.0,44.0,1013.7,1011.3,2.0,2.0,13.8,18.4
50%,14.9,22.8,0.0,4.8,8.3,41.0,15.0,19.0,69.0,56.0,1018.6,1016.3,5.0,4.0,18.2,21.3
75%,18.8,26.0,1.4,7.0,10.2,44.0,20.0,24.0,80.0,64.0,1023.1,1020.8,7.0,7.0,21.7,24.5
max,27.6,45.8,119.4,18.4,13.6,96.0,54.0,57.0,100.0,99.0,1039.0,1036.7,9.0,8.0,36.5,44.7


In [7]:
# Column names, non-null counts and dtypes; shows which columns are
# text (object) and will need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           3271 non-null   object 
 1   MinTemp        3271 non-null   float64
 2   MaxTemp        3271 non-null   float64
 3   Rainfall       3271 non-null   float64
 4   Evaporation    3271 non-null   float64
 5   Sunshine       3271 non-null   float64
 6   WindGustDir    3271 non-null   object 
 7   WindGustSpeed  3271 non-null   int64  
 8   WindDir9am     3271 non-null   object 
 9   WindDir3pm     3271 non-null   object 
 10  WindSpeed9am   3271 non-null   int64  
 11  WindSpeed3pm   3271 non-null   int64  
 12  Humidity9am    3271 non-null   int64  
 13  Humidity3pm    3271 non-null   int64  
 14  Pressure9am    3271 non-null   float64
 15  Pressure3pm    3271 non-null   float64
 16  Cloud9am       3271 non-null   int64  
 17  Cloud3pm       3271 non-null   int64  
 18  Temp9am 

#### Checking for Null Values

In [8]:
# Number of missing values in each column.
df.isna().sum()

Date             0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

### Converting Categorical Values to Binary Values through One-Hot Encoding

In [9]:
# Categorical predictors that are expanded into one indicator column per category.
columns_to_encode = ['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

# Build the processed frame in a single chain:
#   1. one-hot encode the categorical predictors,
#   2. map any remaining 'No'/'Yes' values to 0/1,
#   3. discard the raw 'Date' column.
df_processed = (
    pd.get_dummies(df, columns=columns_to_encode)
    .replace(['No', 'Yes'], [0, 1])
    .drop(columns='Date')
)


In [10]:
# Separate the predictors from the target.
#
# The drop is non-mutating, so `features` is an independent frame instead of
# an alias of `df_processed`, and re-running this cell always yields the same
# result. errors='ignore' keeps the original tolerance for 'RainTomorrow'
# already being absent from `df_processed`.
features = df_processed.drop(columns='RainTomorrow', errors='ignore')

# The target is read from the raw frame, so Y keeps the original
# 'Yes'/'No' string labels.
Y = df['RainTomorrow']

In [11]:
# Preview the encoded feature matrix.
features.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,19.5,22.4,15.6,6.2,0.0,41,17,20,92,84,...,False,False,False,False,False,True,False,False,False,False
1,19.5,25.6,6.0,3.4,2.7,41,9,13,83,73,...,False,False,False,False,False,False,False,False,False,False
2,21.6,24.5,6.6,2.4,0.1,41,17,2,88,86,...,False,False,False,False,False,False,False,False,False,False
3,20.2,22.8,18.8,2.2,0.0,41,22,20,83,90,...,False,False,False,False,False,False,False,False,False,False
4,19.7,25.7,77.4,4.8,0.0,41,11,6,88,74,...,False,False,False,False,False,False,False,True,False,False


In [12]:
# (rows, columns) of the feature matrix after one-hot encoding.
features.shape

(3271, 66)

In [13]:
# Preview the target labels.
Y.head()

0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
Name: RainTomorrow, dtype: object

## Training Data and Testing Data

In [14]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    Y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Scaler used to standardise the feature columns before model fitting.
scaler = StandardScaler()

In [16]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits, so the test set never influences the
# scaler.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

2. Train a Logistic Regression model, identify the best value of C parameter using GridSearchCV. Determine the accuracy of the test data. Also return Jaccard Index, LogLoss and F1-Score on the test data.

In [17]:
# Response to Query No. 02

from sklearn.linear_model import LogisticRegression

# Candidate values for C (inverse regularisation strength), 0.001 to 100.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Logistic regression with the liblinear solver, tuned over C using
# 5-fold cross-validation on the training split.
lr = LogisticRegression(solver='liblinear')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best Parameter: ", grid.best_params_)

# Class predictions for the held-out test split from the tuned search object.
y_pred = grid.predict(X_test)

# Test-set metrics. Jaccard and F1 are averaged over both classes, weighted
# by class support; log loss is computed from the predicted probabilities.
accuracy_lr = accuracy_score(y_test, y_pred)
jaccard_lr = jaccard_score(y_test, y_pred, average='weighted')
f1_lr = f1_score(y_test, y_pred, average='weighted')
logloss_lr = log_loss(y_test, grid.predict_proba(X_test))

print(
    f"Accuracy: {accuracy_lr}",
    f"Jaccard Index: {jaccard_lr}",
    f"LogLoss: {logloss_lr}",
    f"F1-Score: {f1_lr}",
    sep="\n",
)


Best Parameter:  {'C': 0.1}
Accuracy: 0.8244274809160306
Jaccard Index: 0.7032445546913859
LogLoss: 0.4090542291281685
F1-Score: 0.8162377378227622


3. Train a K-nearest Neighbor model, find optimal values for the parameters n_neighbors = [1,3,5,7,9], algorithm, and p, using GridSearchCV. Determine the accuracy of the test data. Also return Jaccard Index and F1-Score on the test data.

In [18]:
# Response to Query No. 03

from sklearn.neighbors import KNeighborsClassifier

# Search space: neighbourhood size, neighbour-search algorithm, and the
# Minkowski power p (1 = Manhattan distance, 2 = Euclidean distance).
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

# K-nearest-neighbours classifier tuned with 5-fold cross-validation on the
# training split.
knn = KNeighborsClassifier()
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best Parameters: ", grid.best_params_)

# Class predictions for the held-out test split from the tuned search object.
y_pred = grid.predict(X_test)

# Test-set metrics; Jaccard and F1 are averaged over both classes, weighted
# by class support.
accuracy_knn = accuracy_score(y_test, y_pred)
jaccard_knn = jaccard_score(y_test, y_pred, average='weighted')
f1_knn = f1_score(y_test, y_pred, average='weighted')

print(
    f"Accuracy: {accuracy_knn}",
    f"Jaccard Index: {jaccard_knn}",
    f"F1-Score: {f1_knn}",
    sep="\n",
)


Best Parameters:  {'algorithm': 'auto', 'n_neighbors': 9, 'p': 1}
Accuracy: 0.7923664122137405
Jaccard Index: 0.6523531153445042
F1-Score: 0.772062589647838


4. Train a Support Vector Machine (SVM) model, find optimal values for parameters C = [.001, .01, .1, 1, 10, 100] and kernel=['linear', 'poly', 'rbf', 'sigmoid'], using GridSearchCV. Determine the accuracy of the test data. Also return the Jaccard Index and F1-Score on the test data.

In [19]:
# Response to Query No. 04

from sklearn.svm import SVC

# Define the model.
# probability=True enables predict_proba, which the log-loss calculation
# below relies on. Those probability estimates are produced by an internal
# cross-validated calibration that shuffles the data, so random_state is
# fixed (same seed as the train/test split) to keep LogLoss reproducible
# across runs.
svc = SVC(probability=True, random_state=42)

# Define the grid of parameters to search: regularisation strength C and
# kernel type.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Perform GridSearchCV with 4-fold cross-validation on the training split
grid = GridSearchCV(svc, param_grid, cv=4)
grid.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters: ", grid.best_params_)

# Predict on the test data
y_pred = grid.predict(X_test)

# Calculate metrics; Jaccard and F1 are averaged over both classes, weighted
# by class support, and log loss uses the predicted class probabilities.
accuracy_svc = accuracy_score(y_test, y_pred)
jaccard_svc = jaccard_score(y_test, y_pred, average='weighted')
logloss_svc = log_loss(y_test, grid.predict_proba(X_test))
f1_svc = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy_svc}")
print(f"Jaccard Index: {jaccard_svc}")
print(f"LogLoss: {logloss_svc}")
print(f"F1-Score: {f1_svc}")


Best Parameters:  {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.8198473282442749
Jaccard Index: 0.6950331487131142
LogLoss: 0.41034182119066065
F1-Score: 0.8092055467199705


5. Train a Decision Tree, find optimal values for parameters criterion=['gini', 'entropy'], using GridSearchCV. Determine the accuracy of the test data. Also return Jaccard Index and F1-Score on the test data.

In [20]:
# Response to Query No. 05

from sklearn.tree import DecisionTreeClassifier

# Define the model.
# Tree construction involves randomness (features are permuted at each
# split), so without a seed the fitted tree and its metrics can change from
# run to run. random_state is fixed (same seed as the train/test split) to
# make the results reproducible.
dt = DecisionTreeClassifier(random_state=42)

# Define the grid of parameters to search: the split-quality criterion
param_grid = {'criterion': ['gini', 'entropy']}

# Perform GridSearchCV with 5-fold cross-validation on the training split
grid = GridSearchCV(dt, param_grid, cv=5)
grid.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters: ", grid.best_params_)

# Predict on the test data
y_pred = grid.predict(X_test)

# Calculate metrics; Jaccard and F1 are averaged over both classes, weighted
# by class support.
accuracy_dt = accuracy_score(y_test, y_pred)
jaccard_dt = jaccard_score(y_test, y_pred, average='weighted')
f1_dt = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy_dt}")
print(f"Jaccard Index: {jaccard_dt}")
print(f"F1-Score: {f1_dt}")


Best Parameters:  {'criterion': 'gini'}
Accuracy: 0.7709923664122137
Jaccard Index: 0.6407767350384592
F1-Score: 0.7709923664122138


6. Show the Accuracy, Jaccard, F1-Score and LogLoss in a tabular format using data frame for all of the above models.

In [21]:
# Response to Query No. 06

# Test-set metrics of the four tuned models, rounded to two decimals.
#
# The table is stored under its own name rather than `df`, so the raw weather
# data loaded at the top of the notebook is not overwritten and the earlier
# cells that read `df` remain re-runnable.
metrics_summary = pd.DataFrame({
    'Model': ['Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine', 'Decision Tree'],
    'Accuracy': [round(accuracy_lr, 2), round(accuracy_knn, 2), round(accuracy_svc, 2), round(accuracy_dt, 2)],
    'Jaccard Index': [round(jaccard_lr, 2), round(jaccard_knn, 2), round(jaccard_svc, 2), round(jaccard_dt, 2)],
    'F1-Score': [round(f1_lr, 2), round(f1_knn, 2), round(f1_svc, 2), round(f1_dt, 2)],
    # No log loss was computed for K-Nearest Neighbors or Decision Tree,
    # so those entries are left empty.
    'LogLoss': [round(logloss_lr, 2), None, round(logloss_svc, 2), None],
})

metrics_summary


Unnamed: 0,Model,Accuracy,Jaccard Index,F1-Score,LogLoss
0,Logistic Regression,0.82,0.7,0.82,0.41
1,K-Nearest Neighbors,0.79,0.65,0.77,
2,Support Vector Machine,0.82,0.7,0.81,0.41
3,Decision Tree,0.77,0.64,0.77,
