In [16]:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('weather.csv')

# Convert categorical features to numerical
df['RainTomorrow'] = df['RainTomorrow'].map({'No': 0, 'Yes': 1})
df['RainToday'] = df['RainToday'].map({'No': 0, 'Yes': 1})

# Identify the categorical columns you want to one-hot encode
categorical_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']

# Create a OneHotEncoder object
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit and transform the categorical columns
encoded_data = encoder.fit_transform(df[categorical_cols])

# Get the feature names for the new encoded columns
feature_names = encoder.get_feature_names_out(categorical_cols)

# Create a new DataFrame with the one-hot encoded data
encoded_df = pd.DataFrame(encoded_data, columns=feature_names)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

# Separate features (X) and target (y)
X = df_encoded.drop(columns = ['RainTomorrow'])# all columns except the rain tmr
y = df_encoded['RainTomorrow']   # last


# Handle missing values
X = X.dropna()
y = y.loc[X.index] # y only has rows corresponding to row indices of X


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


model = RandomForestClassifier()
model.fit(X_train, y_train)



y_pred = model.predict(X_test)
print('y_pred = ' , y_pred)
print('y_test = ' , y_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame to store feature names and their importances
feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})

# Sort the DataFrame by importance in descending order
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print the feature importances
print("\nFeature Importances:")
print(feature_importances)
     

y_pred =  [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
y_test =  227    0
42     0
296    0
184    0
56     1
      ..
240    0
198    0
259    0
361    0
303    1
Name: RainTomorrow, Length: 107, dtype: int64
Accuracy: 0.9906542056074766
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        90
           1       1.00      0.94      0.97        17

    accuracy                           0.99       107
   macro avg       0.99      0.97      0.98       107
weighted avg       0.99      0.99      0.99       107


Feature Importances:
            Feature  Importance
17          RISK_MM    0.391716
11      Pressure3pm    0.079719
9       Humidity3pm    0.053416
10      Pressure9am    0.051078
4          Sunshine    0.045993
..              ...         ...
63   WindDir3pm