In [2]:
# Importing libraries and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

pd.pandas.set_option("display.max_columns", None, "display.max_rows", None)

In [3]:
#Loading dataset
#df = pd.read_csv("/content/drive/MyDrive/Weather Forecasting/cleaned_weather.csv")
df = pd.read_csv("cleaned_weather.csv")

In [3]:
df.head()

Unnamed: 0,time,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,shortwave_radiation_sum,rain_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration,city
0,2010-01-01,2,30.0,22.7,26.1,20.92,0.0,0.0,11.7,27.4,20,4.58,Colombo
1,2010-01-02,51,29.9,23.5,26.2,17.71,0.1,1.0,13.0,27.0,24,3.84,Colombo
2,2010-01-03,51,29.5,23.2,26.0,17.76,0.6,3.0,12.3,27.4,16,3.65,Colombo
3,2010-01-04,2,28.9,21.9,25.3,16.5,0.0,0.0,17.0,34.6,356,3.79,Colombo
4,2010-01-05,1,28.1,21.3,24.5,23.61,0.0,0.0,18.7,37.1,355,4.97,Colombo


In [4]:
df.shape

(147480, 13)

In [5]:
df.dtypes

time                           object
weathercode                     int64
temperature_2m_max            float64
temperature_2m_min            float64
temperature_2m_mean           float64
shortwave_radiation_sum       float64
rain_sum                      float64
precipitation_hours           float64
windspeed_10m_max             float64
windgusts_10m_max             float64
winddirection_10m_dominant      int64
et0_fao_evapotranspiration    float64
city                           object
dtype: object

In [4]:
# Change datatypes

# Change datatype of the 'date_column' to datetime
df['time'] = pd.to_datetime(df['time'])

#df['weathercode'] = df['weathercode'].astype(str)

# Print the updated DataFrame
print(df['time'].dtypes)
print(df['weathercode'].dtypes)

datetime64[ns]
int64


Creating a month variable

In [5]:
# Extract the month and create a new column
df['month'] = df['time'].dt.month

# Print the DataFrame with the added 'month' column
print(df.head())

        time  weathercode  temperature_2m_max  temperature_2m_min  \
0 2010-01-01            2                30.0                22.7   
1 2010-01-02           51                29.9                23.5   
2 2010-01-03           51                29.5                23.2   
3 2010-01-04            2                28.9                21.9   
4 2010-01-05            1                28.1                21.3   

   temperature_2m_mean  shortwave_radiation_sum  rain_sum  \
0                 26.1                    20.92       0.0   
1                 26.2                    17.71       0.1   
2                 26.0                    17.76       0.6   
3                 25.3                    16.50       0.0   
4                 24.5                    23.61       0.0   

   precipitation_hours  windspeed_10m_max  windgusts_10m_max  \
0                  0.0               11.7               27.4   
1                  1.0               13.0               27.0   
2                  3.0    

In [6]:
df.dtypes

time                          datetime64[ns]
weathercode                            int64
temperature_2m_max                   float64
temperature_2m_min                   float64
temperature_2m_mean                  float64
shortwave_radiation_sum              float64
rain_sum                             float64
precipitation_hours                  float64
windspeed_10m_max                    float64
windgusts_10m_max                    float64
winddirection_10m_dominant             int64
et0_fao_evapotranspiration           float64
city                                  object
month                                  int64
dtype: object

In [6]:
df['month'] = df['month'].astype(str)

# Print the updated DataFrame
print(df['month'].dtypes)
print(df['month'].value_counts())

object
1     13020
3     13020
5     13020
4     12600
6     12210
7     12090
8     12090
10    12090
12    12090
2     11850
9     11700
11    11700
Name: month, dtype: int64


In [7]:
df = df.drop('time', axis = 1)

In [8]:
df = df.drop('temperature_2m_mean', axis = 1)

## Recoding wind direction

In [9]:
# Function to recode wind directions
def recode_wind_direction(degrees):
    if (337.5 <= degrees < 360) or (0 <= degrees < 22.5):
        return "North"
    elif 22.5 <= degrees < 67.5:
        return "Northeast"
    elif 67.5 <= degrees < 112.5:
        return "East"
    elif 112.5 <= degrees < 157.5:
        return "Southeast"
    elif 157.5 <= degrees < 202.5:
        return "South"
    elif 202.5 <= degrees < 247.5:
        return "Southwest"
    elif 247.5 <= degrees < 292.5:
        return "West"
    else:
        return "Northwest"

# Apply the function to create the new column
df['Wind_direction'] = df['winddirection_10m_dominant'].apply(recode_wind_direction)

In [10]:
df = df.drop('winddirection_10m_dominant', axis = 1)

In [11]:
df.shape

(147480, 13)

In [15]:
df.head()

Unnamed: 0,weathercode,temperature_2m_max,temperature_2m_min,shortwave_radiation_sum,rain_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,et0_fao_evapotranspiration,city,month,Wind_direction
0,2,30.0,22.7,20.92,0.0,0.0,11.7,27.4,4.58,Colombo,1,North
1,51,29.9,23.5,17.71,0.1,1.0,13.0,27.0,3.84,Colombo,1,Northeast
2,51,29.5,23.2,17.76,0.6,3.0,12.3,27.4,3.65,Colombo,1,North
3,2,28.9,21.9,16.5,0.0,0.0,17.0,34.6,3.79,Colombo,1,North
4,1,28.1,21.3,23.61,0.0,0.0,18.7,37.1,4.97,Colombo,1,North


## Modelling

In [11]:
categorical_features = ['weathercode', 'city','month','Wind_direction']
numerical_features = [col for col in df.columns if col not in categorical_features]

print('categorical features : ' , categorical_features)
print('numerical features : ', numerical_features)

categorical features :  ['weathercode', 'city', 'month', 'Wind_direction']
numerical features :  ['temperature_2m_max', 'temperature_2m_min', 'shortwave_radiation_sum', 'rain_sum', 'precipitation_hours', 'windspeed_10m_max', 'windgusts_10m_max', 'et0_fao_evapotranspiration']


In [17]:
df[numerical_features].describe()

Unnamed: 0,temperature_2m_max,temperature_2m_min,shortwave_radiation_sum,rain_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,et0_fao_evapotranspiration
count,147480.0,147480.0,147480.0,147480.0,147480.0,147480.0,147480.0,147480.0
mean,29.242367,23.899913,18.483513,5.975637,8.762524,15.561288,34.796932,3.896143
std,2.120345,2.059291,4.21427,10.215294,7.19101,6.213737,10.214372,1.023126
min,17.5,12.1,1.23,0.0,0.0,2.2,11.5,0.42
25%,28.2,23.1,16.22,0.4,2.0,11.0,26.6,3.26
50%,29.2,24.3,19.13,2.6,8.0,15.0,33.1,3.89
75%,30.4,25.2,21.51,7.5,14.0,18.9,41.8,4.55
max,37.5,30.0,28.98,338.8,24.0,50.2,91.8,8.14


## Creating dummy variables

In [12]:
# select the columns to be encoded
cols_to_encode = ['city','month','Wind_direction']
# perform one-hot encoding on the selected columns
encoded_data = pd.get_dummies(df[cols_to_encode], prefix_sep='_', drop_first=True)

# add the encoded variables to the original dataframe
df1 = pd.concat([df, encoded_data], axis=1)

# Drop original variables from dataframe
df1 = df1.drop(cols_to_encode, axis=1)


In [19]:
df1.shape

(147480, 56)

In [20]:
df1.head()

Unnamed: 0,weathercode,temperature_2m_max,temperature_2m_min,shortwave_radiation_sum,rain_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,et0_fao_evapotranspiration,city_Badulla,city_Bentota,city_Colombo,city_Galle,city_Gampaha,city_Hambantota,city_Hatton,city_Jaffna,city_Kalmunai,city_Kalutara,city_Kandy,city_Kesbewa,city_Kolonnawa,city_Kurunegala,city_Mabole,city_Maharagama,city_Mannar,city_Matale,city_Matara,city_Moratuwa,city_Mount Lavinia,city_Negombo,city_Oruwala,city_Pothuhera,city_Puttalam,city_Ratnapura,city_Sri Jayewardenepura Kotte,city_Trincomalee,city_Weligama,month_10,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,Wind_direction_North,Wind_direction_Northeast,Wind_direction_Northwest,Wind_direction_South,Wind_direction_Southeast,Wind_direction_Southwest,Wind_direction_West
0,2,30.0,22.7,20.92,0.0,0.0,11.7,27.4,4.58,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,51,29.9,23.5,17.71,0.1,1.0,13.0,27.0,3.84,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,51,29.5,23.2,17.76,0.6,3.0,12.3,27.4,3.65,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,2,28.9,21.9,16.5,0.0,0.0,17.0,34.6,3.79,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,28.1,21.3,23.61,0.0,0.0,18.7,37.1,4.97,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


### It take too much time to train whole dataset. Hence a random sample is taken for model fitting.

In [21]:
sample_size = 30000  #the desired sample size

# Take a random sample from the dataset
sample = df1.sample(n=sample_size, random_state=42)  # Set random_state for reproducibility
#df2 = sample.copy()
# Print the random sample
print(sample.shape)


(30000, 56)


In [13]:
dropping_variables = ['et0_fao_evapotranspiration']

df2 = df1.drop(dropping_variables, axis=1)


In [14]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

## Initialize X, y and split dataset
y = df2['weathercode']
X = df2.drop('weathercode', axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [24]:
x_train.head()

Unnamed: 0,temperature_2m_max,temperature_2m_min,shortwave_radiation_sum,rain_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,city_Badulla,city_Bentota,city_Colombo,city_Galle,city_Gampaha,city_Hambantota,city_Hatton,city_Jaffna,city_Kalmunai,city_Kalutara,city_Kandy,city_Kesbewa,city_Kolonnawa,city_Kurunegala,city_Mabole,city_Maharagama,city_Mannar,city_Matale,city_Matara,city_Moratuwa,city_Mount Lavinia,city_Negombo,city_Oruwala,city_Pothuhera,city_Puttalam,city_Ratnapura,city_Sri Jayewardenepura Kotte,city_Trincomalee,city_Weligama,month_10,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,Wind_direction_North,Wind_direction_Northeast,Wind_direction_Northwest,Wind_direction_South,Wind_direction_Southeast,Wind_direction_Southwest,Wind_direction_West
9789,30.8,25.2,16.47,9.1,23.0,16.9,31.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
7329,29.2,25.6,21.42,2.8,12.0,20.2,44.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
141361,32.5,25.1,22.37,0.0,0.0,20.2,41.8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
118585,28.2,22.7,16.98,8.6,13.0,18.4,44.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
23944,29.1,24.3,19.54,7.6,17.0,11.3,27.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [25]:
x_train.shape

(117984, 54)

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Assuming df_train3 is your pandas DataFrame

np.random.seed(12)
# Create a Random Forest regressor
rf = RandomForestRegressor()

# Fit the Random Forest model
rf.fit(X, y)

# Get the feature importances
importances = rf.feature_importances_

# Sort the feature importances in descending order
sorted_indices = importances.argsort()[::-1]

# Select the top 30 most important features
top_features = X.columns[sorted_indices[:25]].to_list()

# Print the top 20 features
#print(top_features)


In [27]:
from prettytable import PrettyTable

# Create a table of importances
table = PrettyTable()
table.field_names = ["Feature", "Importance"]

# Populate the table with feature names and their importance values
for feature, importance in zip(X.columns[sorted_indices], importances[sorted_indices]):
    table.add_row([feature, importance])

# Print the table
print(table)

+--------------------------------+------------------------+
|            Feature             |       Importance       |
+--------------------------------+------------------------+
|            rain_sum            |   0.504583504807484    |
|      precipitation_hours       |  0.49095822614858703   |
|    shortwave_radiation_sum     | 0.0009209638472036783  |
|       windspeed_10m_max        | 0.0007236286405892145  |
|       windgusts_10m_max        | 0.0007048140290922822  |
|       temperature_2m_max       | 0.0004383567657722477  |
|       temperature_2m_min       | 0.00043192907244833835 |
|      Wind_direction_South      | 6.739118743104229e-05  |
|    Wind_direction_Southwest    | 6.520259837277088e-05  |
|      Wind_direction_West       | 5.841164503593098e-05  |
|            month_7             | 5.703744532117465e-05  |
|            month_5             | 5.613069496895906e-05  |
|            month_10            | 5.5413731436229495e-05 |
|            month_4             | 5.532

In [17]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Scale the independent variables
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

# Convert the NumPy array x_train_scaled to a pandas DataFrame
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns)

# Calculate the VIF values for each independent variable
vif = pd.Series([variance_inflation_factor(x_train_scaled_df.values, i) for i in range(x_train_scaled_df.shape[1])], index=x_train_scaled_df.columns)

# Print the VIF values
print(vif)



temperature_2m_max                18.963161
temperature_2m_min                19.422585
temperature_2m_mean               52.698840
shortwave_radiation_sum           14.613799
rain_sum                           1.860026
precipitation_hours                3.576459
windspeed_10m_max                 11.545794
windgusts_10m_max                  8.256184
et0_fao_evapotranspiration        29.555893
city_Badulla                       3.328669
city_Bentota                       1.987266
city_Colombo                       1.965219
city_Galle                         2.152198
city_Gampaha                       1.944783
city_Hambantota                    2.167180
city_Hatton                        5.794673
city_Jaffna                        3.307045
city_Kalmunai                      2.399611
city_Kalutara                      1.966395
city_Kandy                         3.008072
city_Kesbewa                       1.960428
city_Kolonnawa                     1.937277
city_Kurunegala                 

### Base random forest model

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.ensemble import BalancedRandomForestClassifier

In [30]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)

RandomForestClassifier()

In [31]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [32]:
#for rf model
y_pred_rf_train = rf.predict(x_train)

# Calculate the accuracy score
accuracy = accuracy_score(y_train, y_pred_rf_train)

# Print the accuracy score
print("Train Accuracy:", accuracy)

Train Accuracy: 1.0


In [19]:
#Creating a function to check accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_classification_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')
    
    return
calculate_classification_metrics(y_train, y_pred_rf_train)

In [34]:
#For rf model
y_pred_rf_test = rf.predict(x_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred_rf_test)

# Print the accuracy score
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8540141036072688


In [35]:
#Checking accuracies

calculate_classification_metrics(y_test, y_pred_rf_test)


Precision: 0.8569
Recall: 0.8540
F1-Score: 0.8510


In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)



DecisionTreeClassifier()

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#for rf model
y_pred_dtc_train = dtc.predict(x_train)

# Calculate the accuracy score
accuracy = accuracy_score(y_train, y_pred_dtc_train)

# Print the accuracy score
print("Train Accuracy:", accuracy)

Train Accuracy: 1.0


In [18]:
#For rf model
y_pred_dtc_test = dtc.predict(x_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred_dtc_test)

# Print the accuracy score
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8282139951179821


In [20]:
calculate_classification_metrics(y_test, y_pred_dtc_test)

Precision: 0.8286
Recall: 0.8282
F1-Score: 0.8283


### It seems to be overfitting

## Handling imbalace

In [36]:
y_train.value_counts()

51    30105
63    22014
61    18147
53    17730
2      8387
1      7555
55     6198
3      4689
65     2344
0       815
Name: weathercode, dtype: int64

In [37]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

def balance_and_evaluate():
   
    # Lists to store evaluation metrics
    techniques = []
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    # Balancing techniques
    samplers = {
        'Oversampling': RandomOverSampler(random_state=42),
        'Undersampling': RandomUnderSampler(random_state=42),
        'SMOTE': SMOTE(random_state=42)
    }
    
    # Train and evaluate Random Forest classifiers using different balancing techniques
    for technique, sampler in samplers.items():
        # Resample the training data using the sampler
        X_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
        
        # Train the Random Forest classifier
        rf_classifier = RandomForestClassifier(random_state=42)
        rf_classifier.fit(X_resampled, y_resampled)
        
        # Predict on the test set
        y_pred = rf_classifier.predict(x_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        # Store metrics in lists
        techniques.append(technique)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
    
        # Create a DataFrame to store the metrics
        metrics_df = pd.DataFrame({
            'Technique': techniques,
            'Accuracy': accuracies,
            'Precision': precisions,
            'Recall': recalls,
            'F1-Score': f1_scores
        })

    return metrics_df


In [39]:
balance_and_evaluate()

Unnamed: 0,Technique,Accuracy,Precision,Recall,F1-Score
0,Oversampling,0.853743,0.85477,0.853743,0.852278
1,Undersampling,0.616355,0.656118,0.616355,0.626588
2,SMOTE,0.850149,0.849638,0.850149,0.849347


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

def balance_and_evaluate2():
   
    # Lists to store evaluation metrics
    techniques = []
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    # Balancing techniques
    samplers = {
        'Oversampling': RandomOverSampler(random_state=42),
        'Undersampling': RandomUnderSampler(random_state=42),
        'SMOTE': SMOTE(random_state=42)
    }
    
    # Train and evaluate Random Forest classifiers using different balancing techniques
    for technique, sampler in samplers.items():
        # Resample the training data using the sampler
        X_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
        
        # Train the Random Forest classifier
        rf_classifier = RandomForestClassifier(random_state=42)
        rf_classifier.fit(X_resampled, y_resampled)
        
        # Predict on the test set
        y_pred_train = rf_classifier.predict(X_resampled)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_resampled, y_pred_train)
        precision = precision_score(y_resampled, y_pred_train, average='weighted')
        recall = recall_score(y_resampled, y_pred_train, average='weighted')
        f1 = f1_score(y_resampled, y_pred_train, average='weighted')
        
        # Store metrics in lists
        techniques.append(technique)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
    
        # Create a DataFrame to store the metrics
        metrics_df = pd.DataFrame({
            'Technique': techniques,
            'Accuracy': accuracies,
            'Precision': precisions,
            'Recall': recalls,
            'F1-Score': f1_scores
        })

    return metrics_df


In [41]:
balance_and_evaluate2()

Unnamed: 0,Technique,Accuracy,Precision,Recall,F1-Score
0,Oversampling,1.0,1.0,1.0,1.0
1,Undersampling,1.0,1.0,1.0,1.0
2,SMOTE,1.0,1.0,1.0,1.0


## Bagging

In [42]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(n_estimators=50), X, y, cv=5)
scores.mean()

0.7552685109845403

In [None]:
scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

In [None]:
bag_model.score(x_test, y_test)

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

scores = cross_val_score(DecisionTreeClassifier(), x_train, y_train, cv=5)
scores.mean()

0.8227132550940507

In [21]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
y_pred_bag = bag_model.fit(x_train, y_train)
print(bag_model.oob_score_)

0.8706943314347708


In [22]:
bag_model.score(x_test, y_test)

0.8686262544073773

In [23]:
y_pred_dtc_bag = bag_model.predict(x_test)

In [26]:
calculate_classification_metrics(y_test, y_pred_dtc_bag)

Precision: 0.8694
Recall: 0.8686
F1-Score: 0.8673


In [57]:
bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model, X, y, cv=5)
scores

array([0.89405343, 0.79353133, 0.82807838, 0.78875102, 0.78251288])

## Cross Validation

## Grid Search CV for hyperparameter tuning

In [44]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 40, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [45]:
# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
#print(param_grid)

In [46]:
rf3 = RandomForestClassifier()

In [62]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf3, param_grid = param_grid, cv = 10, verbose=2, n_jobs = 4)

In [63]:
rf_Grid.fit(x_train,y_train)
rf_Grid.best_params_

Fitting 10 folds for each of 320 candidates, totalling 3200 fits


{'bootstrap': False,
 'max_depth': 4,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 60}

In [64]:
y_pred_rf_grid3 = rf_Grid.predict(x_test)

In [65]:
#Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred_rf_grid3)

# Print the accuracy score
print("Test Accuracy:", accuracy)

Test Accuracy: 0.5181380526173041


In [66]:
#Checking accuracies

calculate_classification_metrics(y_test, y_pred_rf_grid3)

Precision: 0.3916
Recall: 0.5181
F1-Score: 0.3875


  _warn_prf(average, modifier, msg_start, len(result))


## KNN Classifier

In [52]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10)

knn.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=10)
knn.score(x_train, y_train)

0.8092877000271224

In [60]:
y_pred_knn_train = knn.predict(x_train)

In [61]:
calculate_classification_metrics(y_train, y_pred_knn_train)

Precision: 0.8105
Recall: 0.8093
F1-Score: 0.8071


In [53]:
knn.score(x_test, y_test)

0.737286411716843

In [58]:
y_pred_knn_test = knn.predict(x_test)

In [59]:
calculate_classification_metrics(y_test, y_pred_knn_test)

Precision: 0.7370
Recall: 0.7373
F1-Score: 0.7340
