In [25]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report

# Load the raw weather observations (path is relative to the working directory).
df = pd.read_csv('weather.csv')

# Encode the two Yes/No rain flags as 1/0. Series.map leaves NaN for any
# value that is not a key of the mapping, so missing flags stay missing.
df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})
df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})

# Categorical columns to expand into one indicator column per category.
categorical_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']

# Dense output so the encoded array can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the categories and produce the indicator matrix in one pass.
encoded_data = encoder.fit_transform(df[categorical_cols])

# Column labels for the indicator matrix, e.g. '<column>_<category>'.
feature_names = encoder.get_feature_names_out(categorical_cols)

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so an
# encoded frame with a fresh RangeIndex would be mis-joined (and padded with
# NaN) as soon as df's index is anything other than 0..n-1.
encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=df.index)

# Replace the original categorical columns with their one-hot encoding.
df_encoded = pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

# Separate features (X) and target (y): the model predicts 'Temp3pm' from
# every other column of the encoded frame.
X = df_encoded.drop(columns=['Temp3pm'])
y = df_encoded['Temp3pm']

# Handle missing values: keep only rows where every feature AND the target
# are present. Filtering on X alone would let a NaN target through, and the
# regressor rejects NaN in y at fit time.
complete_rows = X.notna().all(axis=1) & y.notna()
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# 70/30 train/test split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the forest as well: the train/test split is already seeded, but an
# unseeded RandomForestRegressor draws different bootstrap samples and feature
# subsets on every run, so the metrics and importances would not be repeatable.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows and show predictions next to the true values.
y_pred = model.predict(X_test)
print('y_pred = ' , y_pred)
print('y_test = ' , y_test)

# Evaluate the regression model on the held-out set.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
# Importance scores of the fitted model, one per training column.
importances = model.feature_importances_

# Pair each training column with its score and rank from most to least
# important in a single expression.
feature_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Report the ranked table.
print("\nFeature Importances:")
print(feature_importances)

y_pred =  [13.189 18.629 10.628 15.958 24.172  7.427 30.867 26.64  23.87  22.565
 20.795 17.444 26.889 20.204 20.644 18.684 13.387 17.228 20.496  9.787
 26.493  7.822 11.777 25.    21.717 11.126  8.759 19.728 25.24  32.449
 13.238 21.669 27.096 16.82  18.652 23.666 17.005 14.961 28.29  33.453
 12.383 18.512 14.596 10.953 17.317 20.484 24.852 15.594 20.119 14.66
  8.468 10.932  7.747 18.616 19.226  9.591 13.916 18.355 32.569  7.615
 21.578 20.346 10.789 24.25  33.765 26.611 26.052 13.147  7.59  28.13
 16.796 25.287 17.574 19.286 15.997 32.02  10.514 19.471 30.66  31.918
 13.042 15.546 16.378 21.849 18.726 13.211 14.485 18.587 22.34  10.928
 32.215 24.548 16.773 24.367 16.154 28.723 28.566 18.891 26.714 21.547
 17.075 26.784 14.142  9.617 11.002 29.639 14.932]
y_test =  227    13.8
42     19.2
296    11.1
184    16.2
56     19.0
       ... 
240    14.5
198     7.3
259    11.0
361    30.0
303    15.4
Name: Temp3pm, Length: 107, dtype: float64
Mean Squared Error: 1.6586420467289744
R-squar