In [3]:
# Author: Hassan Ali
# Import libraries 
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier

# Load the raw CSV, discard the unneeded identifier columns ('station' and
# 'Date'), then clean the remaining columns in a single chain.
# where() replaces every cell that is not >= 0 with NaN, so the dropna() that
# follows removes each row that held either a negative value or a value that
# was already missing.
df = (
    pd.read_csv('Bias_correction_ucl.csv')
    .drop(columns=['station', 'Date'])
    .pipe(lambda frame: frame.where(frame >= 0))
    .dropna()
)
# Show the cleaned DataFrame
print(df)

      Present_Tmax  Present_Tmin  LDAPS_RHmin  LDAPS_RHmax  LDAPS_Tmax_lapse  \
0             28.7          21.4    58.255688    91.116364         28.074101   
1             31.9          21.6    52.263397    90.604721         29.850689   
2             31.6          23.3    48.690479    83.973587         30.091292   
3             32.0          23.4    58.239788    96.483688         29.704629   
4             31.4          21.9    56.174095    90.155128         29.113934   
...            ...           ...          ...          ...               ...   
7746          22.5          17.4    30.094858    83.690018         26.704905   
7747          23.3          17.1    26.741310    78.869858         26.352081   
7748          23.3          17.7    24.040634    77.294975         27.010193   
7749          23.2          17.4    22.933014    77.243744         27.939516   
7751          37.6          29.9    98.524734   100.000153         38.542255   

      LDAPS_Tmin_lapse   LDAPS_WS    LD

In [5]:
# Feature matrix 'X': every remaining column of 'df' except the target 'Next_Tmax'.
X = df.drop(columns=['Next_Tmax'])
# Binary target 'y' derived from 'Next_Tmax'.
# np.where evaluates the condition element-wise, i.e. separately for each row:
#   - value > 30  -> 1 is stored at that position of 'y'
#   - value <= 30 -> 0 is stored at that position of 'y'
y = np.where(df['Next_Tmax'] > 30, 1, 0)

# Select the 5 features with the highest chi-squared score against 'y'.
# chi2 only accepts non-negative feature values; the cleaning step in the first
# cell already removed every row containing a negative value.
# The fitted selector is kept (instead of being discarded) so that the chosen
# columns can be identified by name, not only counted.
selector = SelectKBest(chi2, k=5)
X_new = selector.fit_transform(X, y)
# Shape of the reduced feature matrix: (number of rows, number of selected features)
print(X_new.shape)
# get_support() returns a boolean mask over the columns of 'X'; indexing the
# column labels with it yields the names of the selected features.
selected_features = X.columns[selector.get_support()]
print(list(selected_features))

(7586, 5)


In [7]:
# Build and train an Extra Trees classifier in one step.
#   n_estimators=100 -> the forest consists of 100 trees
#   random_state=42  -> fixes the randomness so re-running the cell reproduces
#                       the same fitted model
# fit() trains on the feature matrix 'X' with the binary labels 'y' and returns
# the fitted estimator, which is bound to 'clf'.
clf = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X, y)

# argsort() returns the positions that would sort the importance scores in
# ascending order, i.e. feature indices from least to most important.
tree_importance_sorted_idx = clf.feature_importances_.argsort()
# Display the feature indices in that order
print(tree_importance_sorted_idx)

[19 16 15 17 14 18 12 13  7  3  6  1 20  2 11  8  0  5 10  9 21  4]


In [None]:
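# The array printed above lists the column positions of the feature matrix 'X',
# ordered by feature importance from least to most important. The feature at
# index 19 is the least important, and the feature at index 4 is the most
# important in assisting the model in making its decisions. Going by the column
# order shown in the first cell's output, index 4 corresponds to 'LDAPS_Tmax_lapse'.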