In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = 'pacific.csv/pacific.csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Convert latitude and longitude to numeric (remove 'N', 'S', 'E', 'W')
data['Latitude'] = data['Latitude'].str.replace(r'[^\d.]', '', regex=True).astype(float)
data['Longitude'] = data['Longitude'].str.replace(r'[^\d.]', '', regex=True).astype(float)

In [2]:
# Handling missing values by filling with median values
# data.fillna(data.median(), inplace=True)

# Show the count of missing values and fill them with mean.
for column in data.columns:
    missing_cnt = data[column][data[column] == -999].count()
    print('Missing Values in column {col} = '.format(col = column) , missing_cnt )
    if missing_cnt!= 0:
#         print('in ' , column)
        mean = round(data[column][data[column] != -999 ].mean())
#         print("mean",mean)
        index = data.loc[data[column] == -999 , column].index
#         print("index" , index )
        data.loc[data[column] == -999 , column] = mean
#         print(df.loc[index , column])

Missing Values in column ID =  0
Missing Values in column Name =  0
Missing Values in column Date =  0
Missing Values in column Time =  0
Missing Values in column Event =  0
Missing Values in column Status =  0
Missing Values in column Latitude =  0
Missing Values in column Longitude =  0
Missing Values in column Maximum Wind =  0
Missing Values in column Minimum Pressure =  12804
Missing Values in column Low Wind NE =  19750
Missing Values in column Low Wind SE =  19750
Missing Values in column Low Wind SW =  19750
Missing Values in column Low Wind NW =  19750
Missing Values in column Moderate Wind NE =  19750
Missing Values in column Moderate Wind SE =  19750
Missing Values in column Moderate Wind SW =  19750
Missing Values in column Moderate Wind NW =  19750
Missing Values in column High Wind NE =  19750
Missing Values in column High Wind SE =  19750
Missing Values in column High Wind SW =  19750
Missing Values in column High Wind NW =  19750


In [3]:
# Encode categorical variables like 'Event' and 'Status'
label_encoder = LabelEncoder()
data['Event'] = label_encoder.fit_transform(data['Event'])
data['Status'] = label_encoder.fit_transform(data['Status'])


# Feature Selection: Choose relevant features for prediction
features = data[['Latitude', 'Longitude', 'Minimum Pressure', 'Low Wind NE', 'Low Wind SE', 'Low Wind SW', 'Low Wind NW', 'Event', 'Status']]
target = data['Maximum Wind']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [9]:
print("X train", X_train)
print("X test", X_test)
print("y train", y_train)
print("y test", y_test)

X train        Latitude  Longitude  Minimum Pressure  Low Wind NE  Low Wind SE  \
4634       20.6      105.4               995           40           35   
15273      16.2      103.4               985           40           35   
4475       16.2      114.5               995           40           35   
24451      23.6      108.1               989           40           40   
2546       14.5      109.2               995           40           35   
...         ...        ...               ...          ...          ...   
21575      18.0      130.3              1011            0            0   
5390       22.0      123.0               995           40           35   
860        21.3      121.2               995           40           35   
15795      16.3      166.3               995           40           35   
23654      15.6      135.4              1004           60            0   

       Low Wind SW  Low Wind NW  Event  Status  
4634            27           35      0       1  
15273

In [5]:
                                                                # Train using random forest
from sklearn.ensemble import RandomForestClassifier
# Here instead of cross validation we will be using oob score as a measure of accuracy.
# I will hyper tuning the parameter: No of Trees.

trees  = [10, 20 , 50, 100,200,500,1000,1200]
maxn_five = {}
maxn = {}
for i in trees:
    rf = RandomForestClassifier(n_estimators=i , oob_score=True)
    rf.fit(X_train , y_train)
    print('Obb Score for {x} trees: and taking top five features '.format(x = i) , rf.oob_score_)
    maxn_five[i] = rf.oob_score_
    rf.fit(X_train , y_train)
    print('Obb Score for {x} trees: and taking all the features '.format(x = i) , rf.oob_score_)
    maxn[i] = rf.oob_score_

  warn(


Obb Score for 10 trees: and taking top five features  0.510593524319671


  warn(


Obb Score for 10 trees: and taking all the features  0.5077239466258548


  warn(


Obb Score for 20 trees: and taking top five features  0.5386197331292745


  warn(


Obb Score for 20 trees: and taking all the features  0.536706681333397
Obb Score for 50 trees: and taking top five features  0.5517719642259314
Obb Score for 50 trees: and taking all the features  0.5534458845473241
Obb Score for 100 trees: and taking top five features  0.5569850303696973
Obb Score for 100 trees: and taking all the features  0.5555502415227892
Obb Score for 200 trees: and taking top five features  0.5614328757951121
Obb Score for 200 trees: and taking all the features  0.5606676550767612
Obb Score for 500 trees: and taking top five features  0.5624372279879478
Obb Score for 500 trees: and taking all the features  0.5612415706155244
Obb Score for 1000 trees: and taking top five features  0.5601893921277918
Obb Score for 1000 trees: and taking all the features  0.5621502702185661
Obb Score for 1200 trees: and taking top five features  0.5617676598593907
Obb Score for 1200 trees: and taking all the features  0.5611459180257306


In [10]:
                                                    # Trained using RandomForestClassifier()
# Split the data into training and testing.
# this is done by using train_test_split() funciton
x_trains , x_tests , y_trains, y_tests  = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
# Set n to the feature of maximum oob score.

n = 0
for i in maxn_five:
    if max(maxn_five.values()) == maxn_five[i]:
        n= i
        
# Set n_estimators to n.
rf = RandomForestClassifier(oob_score=True , n_estimators=n)
rf.fit(x_trains , y_trains)
y_pred_rf = rf.predict(x_tests[features.index[:5]])
scores_rf = {'accuracy': accuracy_score(y_tests , y_pred_rf) ,'recall' : recall_score(y_tests , y_pred_rf , average='weighted') ,'precision' : precision_score(y_tests , y_pred_rf , average='weighted') }
print('Scores for Random Forest with n = ' , n , ' and using features ',  features.index[:5] , ' are : ')
print('Accuracy: ' , scores_rf['accuracy'])
print('Recall: ' , scores_rf['recall'])
print('Precision: ' , scores_rf['precision'])

KeyError: 'None of [RangeIndex(start=0, stop=5, step=1)] are in the [columns]'

In [6]:
# Model Selection: Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")

Mean Squared Error: 53.20038787427516


In [7]:
# Model Performance: You can also evaluate using other metrics like R-squared, MAE, etc.

from sklearn.metrics import r2_score

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

print("R squared is a accuracy for regression model.")
# print("It indicates how well the model's predictions fit the actual data\n \
#             R² = 1: Perfect prediction.\n \
#             R² = 0: The model doesn't explain any variance (as good as a simple mean predictor).\n \
#             Negative R²: Indicates that the model performs worse than predicting the mean of the target.\n")

print(f"R-squared: {r2}")

R squared is a accuracy for regression model.
R-squared: 0.9150228276576952
