<a href="https://colab.research.google.com/github/Olavo-DataScientist/RainFallPrediction/blob/main/RainFallPrediction_gradiente_booster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
# Install necessary libraries (uncomment if needed)
# !pip install scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [38]:
# Import Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

In [39]:
# Upload files manually if running in Google Colab
from google.colab import files

print("Please upload the train.csv file:")
train_file = files.upload()

print("Please upload the test.csv file:")
test_file = files.upload()

print("Please upload the sample_submission.csv file:")
submission_file = files.upload()

try:
    # Show a message and stop execution
    print("\nAll files uploaded successfully. Please proceed with the next steps.")
    sys.exit("Execution stopped. Files uploaded successfully.")
except SystemExit:
    # Handle the SystemExit exception gracefully
    print("Execution stopped normally after file upload. You can proceed with the next steps.")

Please upload the train.csv file:


Saving train.csv to train.csv
Please upload the test.csv file:


Saving test.csv to test.csv
Please upload the sample_submission.csv file:


Saving sample_submission.csv to sample_submission.csv

All files uploaded successfully. Please proceed with the next steps.
Execution stopped normally after file upload. You can proceed with the next steps.


In [41]:
# Load the CSV files into DataFrames
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [42]:
# Check for missing values in train data
print("\nMissing values in train data:")
print(train_df.isnull().sum())

# Check for missing values in test data
print("\nMissing values in test data:")
print(test_df.isnull().sum())


Missing values in train data:
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64

Missing values in test data:
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64


In [43]:
# Correct way to fill missing values without triggering FutureWarning
test_df['winddirection'] = test_df['winddirection'].fillna(test_df['winddirection'].mean())

In [44]:
# Prepare the data for training
features = train_df.drop(['id', 'day', 'rainfall'], axis=1)
target = train_df['rainfall']

In [45]:
# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [46]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features_scaled, target, test_size=0.3, random_state=42)

In [23]:
# Train a Random Forest Classifier
#model = RandomForestClassifier(n_estimators=100, random_state=42)
#model.fit(X_train, y_train)

In [47]:
# Train a Gradient Boosting Classifier
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

In [57]:
# Predict on validation set
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

In [58]:
# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

print("Accuracy Score:", accuracy_score(y_val, y_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_prob))


Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.61      0.67       167
           1       0.88      0.93      0.90       490

    accuracy                           0.85       657
   macro avg       0.81      0.77      0.79       657
weighted avg       0.84      0.85      0.84       657

Accuracy Score: 0.84779299847793
ROC AUC Score: 0.8653794451912502


In [59]:
# Prepare test data for predictions
test_features = test_df.drop(['id', 'day'], axis=1)
test_features_scaled = scaler.transform(test_features)

# Predict probabilities for test data
test_probs = model.predict_proba(test_features_scaled)[:, 1]

In [62]:
# Create a new submission file based on the sample template
submission_df['rainfall'] = test_probs
submission_file = 'rainfall_predictions_submission - gradient booster.csv'  # Save in the current directory
submission_df.to_csv(submission_file, index=False)

print("\nSubmission file saved as 'rainfall_predictions_submission - gradient booster.csv.")


Submission file saved as 'rainfall_predictions_submission - gradient booster.csv.


In [63]:
# Recreate the submission file
submission_df['rainfall'] = test_probs
submission_file = 'rainfall_predictions_submission - gradient booster.csv'
submission_df.to_csv(submission_file, index=False)

# Load the generated submission file
submission_df = pd.read_csv(submission_file)

# Display information and first few rows of the submission file
submission_info = submission_df.info()
submission_head = submission_df.head()

(submission_info, submission_head)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        730 non-null    int64  
 1   rainfall  730 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 11.5 KB


(None,
      id  rainfall
 0  2190  0.980885
 1  2191  0.977524
 2  2192  0.946557
 3  2193  0.078626
 4  2194  0.030320)