In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import tensorflow as tf
import tensorflow.keras as keras
from keras.callbacks import ModelCheckpoint
from sklearn import metrics, preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

Dataset analysis: getting to know the data better by plotting the number of fires by cause and identifying which regions are most affected by fire.

In [None]:
# Load the wildfire records; the EDA and modelling cells below all start from this frame.
df = pd.read_csv("wildfires.csv")
df.head(n=5)

In [None]:
# Keep only the columns needed for the exploratory plots.
eda_columns = ["stat_cause_descr", "latitude", "longitude", "state", "disc_clean_date", "fire_size"]
subset_df = df[eda_columns]
print(subset_df.head())

# Bar chart: number of recorded fires for each cause.
cause_counts = subset_df["stat_cause_descr"].value_counts()
cause_counts.plot(kind="bar", color="lightblue")
plt.xlabel("Cause of fires")
plt.ylabel("Number of fires")
plt.title("Number of Fires by Cause")
plt.show()


In [None]:
# Bar chart of the ten states with the most recorded fires.
top_states = subset_df["state"].value_counts().head(10)
top_states.plot(kind="bar", color="lightblue")
plt.title("Top states")
plt.ylabel("Number of fires")
plt.show()


In [None]:
# Per-state slices of the EDA frame, used by the cause-breakdown charts below.
state_col = subset_df["state"]
df_CA = subset_df.loc[state_col == "CA"]
df_GA = subset_df.loc[state_col == "GA"]
df_TX = subset_df.loc[state_col == "TX"]

In [None]:
# Number of fires per cause in Georgia.
ga_cause_counts = df_GA["stat_cause_descr"].value_counts()
ga_cause_counts.plot(kind="bar", color="lightblue", title="causes of fires for GA")
plt.show()

In [None]:
# Number of fires per cause in Texas.
tx_cause_counts = df_TX["stat_cause_descr"].value_counts()
tx_cause_counts.plot(kind="bar", color="lightblue", title="causes of fires for TX")
plt.show()

In [None]:
# Number of fires per cause in California.
ca_cause_counts = df_CA["stat_cause_descr"].value_counts()
ca_cause_counts.plot(kind="bar", color="lightblue", title="causes of fires for CA")
plt.show()

In [None]:
# Count fires (rows) in three broad cause groups and plot them.
#
# DataFrame.size is rows * columns, so using it here would overstate every
# count by the number of columns; summing a boolean mask counts rows instead.
cause = subset_df['stat_cause_descr']
is_natural = cause == 'Lightning'
is_misc = cause.isin(['Miscellaneous', 'Missing/Undefined'])

df_natural = int(is_natural.sum())
df_misc = int(is_misc.sum())
# "Man made" is everything that is neither lightning nor unclassified, so the
# three bars are disjoint and add up to the total number of fires.
df_artificial = int((~is_natural & ~is_misc).sum())

data = {'Category': ['Natural', 'Man made', 'Miscellaneous'],
        'Values': [df_natural, df_artificial, df_misc]}
df_1 = pd.DataFrame(data)
plt.bar(df_1['Category'], df_1['Values'], color='lightblue')
plt.ylabel("Number of fires")
plt.title("Number of fires by cause group")
plt.show()

In [None]:
# subset_df.plot(kind='scatter',x='longitude',y='latitude',color='coral',alpha=0.3)
# plt.show()

### The plots above give the key insights into how fires are distributed across the US states and what the main causes of fire are in each state.

Below are the pre-processing steps applied to the dataset.

## Data Pre-processing and EDA(Exploratory data analysis)

In [None]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

In [None]:
# Preview the first rows again, now with all columns visible.
df.head()

#### Data cleaning: keep only fires with fire_size < 5000, since the vast majority of fires are small.

In [None]:
# Drop columns that are not needed for the analysis below.
unused_columns = [
    'Unnamed: 0', 'Unnamed: 0.1',
    'disc_date_final', 'cont_date_final', 'cont_clean_date', 'putout_time',
]
df = df.drop(columns=unused_columns)

# Parse the discovery date (stored as month/day/year text) into datetimes.
df['disc_clean_date'] = pd.to_datetime(df['disc_clean_date'], format='%m/%d/%Y')

# Get rid of outliers: keep only fires smaller than 5000 acres. Small fires are
# very common and the few very large ones make the deviation very high.
is_small_fire = df['fire_size'] < 5000
df = df.loc[is_small_fire]
df.columns

In [None]:
# Vegetation and fire cause are categorical, so expand each into one-hot
# indicator columns (first level dropped) to make them usable for training.
# The categorical copy stored in `Cause` stays in the frame next to the dummies.
df['Vegetation'] = df['Vegetation'].astype('category')
df['Cause'] = df['stat_cause_descr'].astype('category')

for source_col, dummy_prefix in (('Vegetation', 'Vegetation'), ('stat_cause_descr', 'Cause')):
    df = pd.get_dummies(df, prefix=[dummy_prefix], columns=[source_col], drop_first=True)


In [None]:
# Correlation heatmap of all numeric columns.
#
# The seaborn settings (including the 15x15 default figure size) must be applied
# BEFORE the figure is created; setting them after the plot only affects
# figures drawn later, so the heatmap itself would keep the small default size.
sns.set(rc={'figure.figsize': (15, 15)})

df_numerics_only = df.select_dtypes(include=np.number)

corr = df_numerics_only.corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(220, 20, n=200),
    square=True
)
# Rotate the x tick labels so the long column names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
### Dealing with missing data
rows_before = len(df)
print(rows_before)

# Rows whose weather file was not found carry no weather information for the
# time of the fire, so they are removed.
no_weather_rows = df.index[df['weather_file'] == 'File Not Found']
df.drop(no_weather_rows, inplace=True)
print(len(df))


In [None]:
# Weather columns, grouped by measurement.
temp_cols = ['Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7', 'Temp_cont']
wind_cols = ['Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7', 'Wind_cont']
hum_cols = ['Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont']
prec_cols = ['Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7', 'Prec_cont']

# The weather data has many 0 values, some of which may really be missing
# readings. Mark 0 (numeric or the string '0') as NaN in the temperature, wind
# and humidity columns so they can be counted and handled as missing.
zero_is_missing = temp_cols + wind_cols + hum_cols
df[zero_is_missing] = df[zero_is_missing].replace({0: np.nan, '0': np.nan})
print(len(df))

# Mark -1 as missing in every weather column, precipitation included.
neg1_is_missing = zero_is_missing + prec_cols
df[neg1_is_missing] = df[neg1_is_missing].replace({-1: np.nan})

# Drop observations where all temperature and wind columns are missing.
df = df.dropna(how='all', subset=temp_cols + wind_cols)
print(len(df))
# This leaves us with 38,689 observations  +/- 3,000  to work with (originally we had 50,000)

In [None]:
# Fill the 'pre' temperature, wind and humidity columns with their column means.
subset_fill_mean = ['Temp_pre_30','Temp_pre_15','Temp_pre_7', 'Wind_pre_30','Wind_pre_15','Wind_pre_7', 'Hum_pre_30', 'Hum_pre_15','Hum_pre_7']
df[subset_fill_mean] = df[subset_fill_mean].fillna(df[subset_fill_mean].mean())

# Fill missing containment-day readings with the average of the 7-, 15- and
# 30-day 'pre' readings of the same row. This is done column-wise (vectorised)
# instead of with a row-wise apply, which runs a Python lambda for every row
# and is far slower while producing the same values.
for col in ['Temp','Wind','Hum']:
    pre_average = (df[f'{col}_pre_7'] + df[f'{col}_pre_15'] + df[f'{col}_pre_30']) / 3
    df[f'{col}_cont'] = df[f'{col}_cont'].fillna(pre_average)

        

## Four experiments with different feature sets - to better understand how the selected features affect the results


### Experiment 1 - uses all the available features from the dataset



In [None]:
# Experiment 1 - use every available feature from the dataset.
# Feature groups: vegetation, remoteness, temperature, humidity, wind,
# precipitation, cause of fire, longitude and latitude.
# That is 35 predictor columns for a single target: fire_size.
# (Column order is kept exactly as before; it determines the feature order the models see.)
experiment1_features = [
    'Vegetation_4', 'remoteness', 'Vegetation_9', 'Vegetation_12', 'Vegetation_14', 'Vegetation_15', 'Vegetation_16',
    'latitude',
    'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7', 'Temp_cont',
    'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7', 'Wind_cont',
    'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont',
    'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7', 'Prec_cont',
    'Cause_Debris Burning', 'Cause_Equipment Use', 'Cause_Fireworks', 'Cause_Lightning', 'Cause_Miscellaneous',
    'Cause_Missing/Undefined', 'Cause_Powerline', 'Cause_Railroad', 'Cause_Smoking', 'Cause_Structure',
    'longitude',
]
X1 = df[experiment1_features]
y = df['fire_size']

# 80/20 train/test split with a fixed seed; df1 bundles the four pieces as
# [X_train, X_test, y_train, y_test] for the modelling cells.
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)
df1 = [X1_train, X1_test, y_train, y_test]



### Experiment 2: include only longitude, latitude, vegetation, cause and the "pre" weather data, without the "cont" columns

In [None]:
# Experiment 2 - Experiment 1 without the four containment-day ("_cont")
# weather columns; `remoteness` is not part of this list either.
experiment2_features = [
    'Vegetation_4', 'Vegetation_9', 'Vegetation_12', 'Vegetation_14', 'Vegetation_15', 'Vegetation_16',
    'latitude',
    'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7',
    'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7',
    'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7',
    'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7',
    'Cause_Debris Burning', 'Cause_Equipment Use', 'Cause_Fireworks', 'Cause_Lightning', 'Cause_Miscellaneous',
    'Cause_Missing/Undefined', 'Cause_Powerline', 'Cause_Railroad', 'Cause_Smoking', 'Cause_Structure',
    'longitude',
]
X2 = df[experiment2_features]
y = df['fire_size']

# Same 80/20 split and seed as the other experiments.
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)
df2 = [X2_train, X2_test, y_train, y_test]

### Experiment 3 - Including only lat, long and weather pre- data

In [None]:
# Experiment 3 - feature importance suggested that cause and vegetation matter
# little, so both groups are removed here; only location and "pre" weather remain.
experiment3_features = [
    'latitude', 'longitude',
    'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7',
    'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7',
    'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7',
    'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7',
]
X3 = df[experiment3_features]
y = df['fire_size']

# Same 80/20 split and seed as the other experiments.
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=42)
df3 = [X3_train, X3_test, y_train, y_test]

### Experiment 4: Experiment 1 data with normalization


In [None]:
# Experiment 4 - the Experiment 1 feature set with min-max normalization.
#
# preprocessing.normalize() rescales each ROW to unit norm, which is not
# min-max scaling; MinMaxScaler rescales each COLUMN to the [0, 1] range,
# which is what this experiment is meant to do.
df_4 = df[['Vegetation_4','remoteness','Vegetation_9','Vegetation_12','Vegetation_14','Vegetation_15','Vegetation_16','latitude','Temp_pre_30','Temp_pre_15','Temp_pre_7','Temp_cont','Wind_pre_30','Wind_pre_15','Wind_pre_7','Wind_cont','Hum_pre_30', 'Hum_pre_15','Hum_pre_7','Hum_cont','Prec_pre_30','Prec_pre_15','Prec_pre_7','Prec_cont','Cause_Debris Burning','Cause_Equipment Use','Cause_Fireworks','Cause_Lightning','Cause_Miscellaneous','Cause_Missing/Undefined','Cause_Powerline','Cause_Railroad','Cause_Smoking','Cause_Structure', 'longitude']]
names = df_4.columns
y = df['fire_size']

# Split first (same 80/20 split and seed as the other experiments) ...
X4_train, X4_test, y_train, y_test = train_test_split(df_4, y, test_size=0.2, random_state=42)

# ... then fit the scaler on the training rows only, so that no information
# about the test rows leaks into the scaling. The original row index is kept
# so the features stay aligned with y by label as well as by position.
scaler = MinMaxScaler()
X4_train = pd.DataFrame(scaler.fit_transform(X4_train), columns=names, index=X4_train.index)
X4_test = pd.DataFrame(scaler.transform(X4_test), columns=names, index=X4_test.index)

# Full scaled frame (train-fitted scaler applied to every row), kept for inspection.
scaled_df = pd.DataFrame(scaler.transform(df_4), columns=names, index=df_4.index)

df4 = [X4_train, X4_test, y_train, y_test]

# Applying different models
### Running the above experiments with different models: decision tree, gradient boosting and random forest


#### Decision Tree - Experiment 1


In [None]:
# Baseline decision tree on Experiment 1.
# df1 is the list [X_train, X_test, y_train, y_test] built in the Experiment 1 cell.
X_tr, X_te, y_tr, y_te = df1

dectr = DecisionTreeRegressor(random_state=0)
dectr.fit(X_tr, y_tr)

predictions = dectr.predict(X_te)
mae = metrics.mean_absolute_error(y_te, predictions)
r2 = metrics.r2_score(y_te, predictions)
print('Mean Absolute Error:', mae)
print('R Squared:', r2)


#### Gradient Boosting - Experiment 1



In [None]:
# Gradient boosting on Experiment 1 (df1 = [X_train, X_test, y_train, y_test]).
# The estimator is seeded, like the decision tree above, so the reported
# metrics are the same on every run of the notebook.
gr_boost = GradientBoostingRegressor(random_state=0)
gr_boost.fit(df1[0], df1[2])

predictions = gr_boost.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))


#### Random Forest - Experiment 1

In [None]:
# Random forest on Experiment 1 (df1 = [X_train, X_test, y_train, y_test]).
# The forest is seeded so that the metrics below, and the feature importances
# read from `rf` later in the notebook, are reproducible between runs.
rf = RandomForestRegressor(random_state=0)
rf.fit(df1[0], df1[2])

predictions = rf.predict(df1[1])
print('Mean Absolute Error:', metrics.mean_absolute_error(df1[3], predictions))
print('R Squared:', metrics.r2_score(df1[3], predictions))


## Modelling all the experiments and printing the results in one place


In [None]:
# Fit every model on every experiment (fires smaller than 5000 acres) and print
# the test-set metrics side by side.
#
# The loop variable must NOT be called `df`: that would overwrite the cleaned
# DataFrame with a plain list, and every later cell that indexes `df` by
# column name would fail.
for model in [DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor()]:
    for idx, split in enumerate([df1, df2, df3, df4]):
        # Each split is [X_train, X_test, y_train, y_test].
        split_X_train, split_X_test, split_y_train, split_y_test = split
        model.fit(split_X_train, split_y_train)
        # Predict once and reuse the result for both metrics.
        split_predictions = model.predict(split_X_test)
        print(f'{model}; Experiment {idx+1}; Mean Absolute Error:', metrics.mean_absolute_error(split_y_test, split_predictions))
        print(f'{model}; Experiment {idx+1}; R Squared:', metrics.r2_score(split_y_test, split_predictions))
        print('')


## Results :
The best-performing base model is the Random Forest algorithm with Experiment 1. This is the model we will use for further analysis and improvement.


# Feature Importance

What features are the most influential in the model?

In [None]:
# Importance of each Experiment 1 feature according to the fitted random
# forest `rf`, listed from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=df1[0].columns,
)
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances

## Hyper-parameter tuning for better results

In [None]:
# Grid search over forest size and tree depth on the Experiment 1 training
# data, using 5-fold cross-validation scored by negative mean absolute error.
search_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 8, 10],
}
rf_reg = RandomForestRegressor()
search = GridSearchCV(
    estimator=rf_reg,
    param_grid=search_grid,
    scoring='neg_mean_absolute_error',
    n_jobs=1,
    cv=5,
    verbose=1,
)
search.fit(df1[0], df1[2])

# Best cross-validated score (negative MAE) and the parameters that achieved it.
print(search.best_score_)
print(search.best_params_)


In [None]:
# Random forest with fixed hyper-parameters (200 trees, maximum depth 10),
# trained and evaluated on Experiment 1.
# df1 is the list [X_train, X_test, y_train, y_test].
tuned_X_train, tuned_X_test, tuned_y_train, tuned_y_test = df1

rf_reg = RandomForestRegressor(max_depth=10, n_estimators=200)
rf_reg.fit(tuned_X_train, tuned_y_train)

predictions = rf_reg.predict(tuned_X_test)
tuned_mae = metrics.mean_absolute_error(tuned_y_test, predictions)
tuned_r2 = metrics.r2_score(tuned_y_test, predictions)
print('Mean Absolute Error:', tuned_mae)
print('R Squared:', tuned_r2)

In [None]:
# Features for the neural network: location plus all weather columns
# (no vegetation or cause indicators). Target: fire_size.
nn_features = [
    'latitude',
    'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7', 'Temp_cont',
    'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7', 'Wind_cont',
    'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont',
    'Prec_pre_30', 'Prec_pre_15', 'Prec_pre_7', 'Prec_cont',
    'longitude',
]
X = df[nn_features]
y = df['fire_size']

# 80/20 train/test split with the same seed as the experiments above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




# Neural network model and its results


In [None]:
import os

# Ask TensorFlow's C++ backend to log less.
# NOTE(review): tensorflow is already imported in the first cell, so this may
# be set too late to silence import-time messages - confirm.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='2')

# Feature-wise normalization layer whose statistics are learned from the
# training features. The model cell below currently has it commented out.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(X_train.to_numpy())

In [None]:
# (rows, columns) of the training features; the column count is the network's input size.
X_train.shape

In [None]:
# Fully connected regression network: four ReLU hidden layers with L2 weight
# regularisation and a single linear output for fire_size.
#
# The regularizers are referenced through tf.keras because the bare name
# `regularizers` is never imported in this notebook and raises a NameError.
model = Sequential([
    #normalizer,
    Dense(34, activation="relu", kernel_regularizer = tf.keras.regularizers.l2(0.01), kernel_initializer='normal',input_dim = X_train.shape[1]),
    Dense(34, activation="relu", kernel_regularizer = tf.keras.regularizers.l2(0.01)),
    Dense(64, activation="relu", kernel_regularizer = tf.keras.regularizers.l2(0.01)),
    Dense(32, activation="relu", kernel_regularizer = tf.keras.regularizers.l2(0.01)),
    Dense(1, activation='linear')
])

# NOTE(review): 'accuracy' is not a meaningful metric for a regression target;
# it is kept only because the evaluation cell unpacks three values
# (loss, accuracy, mae) from model.evaluate().
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy','mae']
)

In [None]:
# Stop training once validation MAE has not improved for 20 epochs, and roll
# the model back to the weights of its best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_mae',
    mode='min',
    patience=20,
    restore_best_weights=True,
)

# Train for up to 150 epochs, holding out 15% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=150,
    validation_split=0.15,
    callbacks=[early_stopping],
)

In [None]:

# Per-epoch metrics recorded by Keras during training.
historydf = pd.DataFrame(history.history)

# Plot training and validation MAE against the epoch number.
plt.figure(figsize=(10, 5))
for metric_name in ('mae', 'val_mae'):
    plt.plot(historydf[metric_name], label=metric_name)
plt.title('MAE vs. epochs')
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Evaluate on the held-out test set. The values come back in the order of the
# compiled loss and metrics: loss (MSE), accuracy, MAE.
test_loss, test_acc, test_mae = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f'Mean Absolute Error: {test_mae:0.3f}')
print(f'accuracy: {test_acc:0.3f}')
print(f'loss: {test_loss:0.3f}')

# Classification Introduction:

Starting the classification process, and checking if we need to do more preprocessing steps to the current dataset.




In [None]:
# Column dtypes and non-null counts, to see what still needs cleaning before classification.
df.info()

In [None]:
# Remove, in place, every row that still has at least one missing value.
df.dropna(axis=0, how="any", inplace=True)
df.head(n=5)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# 70/30 train/test split of the cleaned frame for the classification models.
train_set, test_set = train_test_split(df, test_size=0.30, random_state=568)

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
# Recorded from an earlier run:
#Train set -Int64Index: 38756 entries, 16778 to 41212
train_set.info()
# Test Set - Int64Index: 16611 entries, 45469 to 21122




## Reasons and Insights for selecting the feature list

I am dividing the feature list into 2 parts as we have already done the feature importance.

Why I am selecting these features: as we have seen before, most fires are due to natural causes, so I want to predict the target based on the climatic conditions.

### List 1
--> X = "remoteness","latitude" ,"longitude","Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"

### List 2
--> X = "Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"

### Target variables
Y = "fire_size_class" || "fire_size"

## issues while selecting the targets  

When I first used "fire_size_class" as the Y target, I got a continuous-value error. After transforming the target with a label encoder, the unknown-label error was fixed, and I was able to predict the target variable fire_size too.

## Reasons for selecting the target variable fire_size and fire_size_class

Our project's end goal is to predict fire_size, which will be useful for people in real time: knowing the size of the fire, they can be evacuated to a safe place. While working on the project, we felt that fire_size_class is as important as fire_size, as it will give us the reason for the fire.

Also from the results of the PCA and Feature importance, we have finalised these target variables.

## Decision Tree Classifier


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score

# Feature list 1 (list 2, weather only, is kept as a commented alternative).
feature_cols = ["remoteness","latitude" ,"longitude","Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]
#feature_cols = ["Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]

X = train_set[feature_cols]
Y = train_set["fire_size_class"]
# Y = train_set["fire_size"]

# Held-out rows, used only for scoring.
X_holdout = test_set[feature_cols]
Y_holdout = test_set["fire_size_class"]

# Encode the class labels as integers. The encoder is fitted on the labels of
# both splits so that a class present only in the hold-out set can be encoded.
lab = preprocessing.LabelEncoder()
lab.fit(pd.concat([Y, Y_holdout]))
y_transformed = lab.transform(Y)
y_holdout_transformed = lab.transform(Y_holdout)

tree_classifier = DecisionTreeClassifier().fit(X,y_transformed)
print(tree_classifier)

# Score on the held-out test set: predicting the same rows the tree was
# trained on mostly measures memorisation, not how well it generalises.
y_pred = tree_classifier.predict(X_holdout)
c_matrix = confusion_matrix(y_holdout_transformed, y_pred)
print('The confusion Matrix is: ')
print(c_matrix)

print("The Evalution metrics are as follows: ")
print("Accuracy: ", accuracy_score(y_holdout_transformed, y_pred))
print("Precision: ", precision_score(y_holdout_transformed, y_pred, average="weighted"))
print("Sensitivity: ", recall_score(y_holdout_transformed, y_pred, average="weighted"))
print("F1 Score: ", f1_score(y_holdout_transformed, y_pred, average="weighted"))


## Gaussian Naive Bayes


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score, confusion_matrix

# Feature list 1 (list 2, weather only, is kept as a commented alternative).
feature_cols = ["remoteness","latitude" ,"longitude","Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]
#feature_cols = ["Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]

# Training set
X = train_set[feature_cols]
Y = train_set["fire_size_class"]
# Y = train_set["fire_size"]

# Held-out rows, used only for scoring.
X_holdout = test_set[feature_cols]
Y_holdout = test_set["fire_size_class"]

gnb = GaussianNB()
print(gnb.fit(X,Y))

# Score on the held-out test set rather than on the rows the model was fitted
# on, so the metrics reflect performance on unseen fires.
y_pred = gnb.predict(X_holdout)
c_matrix = confusion_matrix(Y_holdout, y_pred)
print("Confusion Metrix: ")
print(c_matrix)

print("The Evalution metrics are as follows: ")
print("Accuracy: ", accuracy_score(Y_holdout, y_pred))
print("Precision: ", precision_score(Y_holdout, y_pred, average="weighted"))
print("Sensitivity: ", recall_score(Y_holdout, y_pred, average="weighted"))
print("F1 Score: ", f1_score(Y_holdout, y_pred, average="weighted"))

## Random Forest classifier

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score, confusion_matrix

# Feature list 1 (list 2, weather only, is kept as a commented alternative).
feature_cols = ["remoteness","latitude" ,"longitude","Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]
#feature_cols = ["Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]

# Training set
X = train_set[feature_cols]
Y = train_set["fire_size_class"]
# Y = train_set["fire_size"]

# Held-out rows, used only for scoring.
X_holdout = test_set[feature_cols]
Y_holdout = test_set["fire_size_class"]

rfc = RandomForestClassifier()
print(rfc.fit(X,Y))

# Score on the held-out test set: a forest evaluated on its own training rows
# mostly measures memorisation, not how well it generalises.
y_pred = rfc.predict(X_holdout)
c_matrix = confusion_matrix(Y_holdout, y_pred)
print("Confusion Metrix: ")
print(c_matrix)

print("The Evalution metrics are as follows: ")
print("Accuracy: ", accuracy_score(Y_holdout, y_pred))
print("Precision: ", precision_score(Y_holdout, y_pred, average="weighted"))
print("Sensitivity: ", recall_score(Y_holdout, y_pred, average="weighted"))
print("F1 Score: ", f1_score(Y_holdout, y_pred, average="weighted"))


#### Solved the issue with fire_size
A code sample using fire_size as the target. The "unknown label / continuous variable" error, which was raised because fire_size is not categorical data, has been removed.

That column was transformed with a label encoder so that the different classification functions can be applied to it.

### Below is the same code for fire_size

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score

# Feature list 1 (list 2, weather only, is kept as a commented alternative).
feature_cols = ["remoteness","latitude" ,"longitude","Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]
#feature_cols = ["Temp_pre_7","Wind_pre_7","Hum_pre_7","Prec_pre_7"]

X = train_set[feature_cols]
Y = train_set["fire_size"]

# Held-out rows, used only for scoring.
X_holdout = test_set[feature_cols]
Y_holdout = test_set["fire_size"]

# The classifier rejects a continuous target, so fire_size is label-encoded:
# every distinct fire size becomes its own integer class.
# NOTE(review): this treats a continuous quantity as a many-class label;
# binning fire_size (or using fire_size_class) may be more meaningful - confirm.
# The encoder is fitted on both splits so that sizes which occur only in the
# hold-out set can still be encoded.
lab = preprocessing.LabelEncoder()
lab.fit(pd.concat([Y, Y_holdout]))
y_transformed = lab.transform(Y)
y_holdout_transformed = lab.transform(Y_holdout)

tree_classifier = DecisionTreeClassifier().fit(X,y_transformed)
print(tree_classifier)

# Score on the held-out test set: predicting the same rows the tree was
# trained on mostly measures memorisation, not how well it generalises.
y_pred = tree_classifier.predict(X_holdout)
c_matrix = confusion_matrix(y_holdout_transformed, y_pred)
print('The confusion Matrix is: ')
print(c_matrix)

print("The Evalution metrics are as follows: ")
print("Accuracy: ", accuracy_score(y_holdout_transformed, y_pred))
print("Precision: ", precision_score(y_holdout_transformed, y_pred, average="weighted"))
print("Sensitivity: ", recall_score(y_holdout_transformed, y_pred, average="weighted"))
print("F1 Score: ", f1_score(y_holdout_transformed, y_pred, average="weighted"))