In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeRegressor


In [6]:
#data train
file_train='../BlogFeedBack_Dataset/Train/blogData_train.csv'
train=pd.read_csv(file_train, header=None)

#data test
file_test='../BlogFeedBack_Dataset/Train/blogData_train.csv'
test=pd.read_csv(file_test, header=None)


In [7]:
train.shape, test.shape

((52397, 281), (7625, 281))

In [8]:
cols_to_remove = list(range(0, 50)) + list(range(55, 60)) + list(range(276, 280))
train = train.drop(train.columns[cols_to_remove], axis=1)  # Drop the first 50 columns
test = test.drop(test.columns[cols_to_remove], axis=1)  # Drop the first 50 columns

# Split features and labels
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

In [10]:
# Print shapes to confirm
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (52397, 221)
y_train shape: (52397,)
X_test shape: (7625, 221)
y_test shape: (7625,)


In [11]:
class RandomForestRegressor():
    """Implement Random Forest regressor from scratch using Decision Tree."""
    
    def __init__(
        self,
        n_estimators=100,
        criterion='mse', 
        max_depth=None,
        min_samples_leaf=1,
        max_features='sqrt', 
        min_impurity_decrease=0.0,
        random_state=0
    ):
        """
        Some important parameters in Random Forest.
        
        Args:
            n_estimators (int): The number of trees in the forest.
            criterion (str): The function to measure the quality of a split ('mse' for mean squared error).
            max_depth (int): The maximum depth of the tree.
            min_samples_leaf (int): The minimum number of samples required to be at a leaf node.
            max_features (str): The number of features to consider when looking for the best split; 'sqrt' for square root.
            min_impurity_decrease (float): A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
            random_state (int): Controls randomness of the bootstrap samples and the features.
        """
        self.n_estimators = n_estimators
        self.criterion =  criterion
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        
    def fit(self, X, y):
        """Fit the random forest model."""
        self.n_samples, self.n_features = X.shape
        if self.max_features == 'sqrt':
            self.max_feature = int(np.sqrt(self.n_features))
        
        self.trees = []
        for i in range(self.n_estimators):
            X_train, _, y_train, _ = train_test_split(
                X, 
                y, 
                test_size=0.3, 
                random_state=self.random_state + i
            )
            tree = DecisionTreeRegressor(
                criterion = self.criterion,
                max_depth = self.max_depth,
                min_samples_leaf = self.min_samples_leaf,
                max_features = self.max_features,
                random_state = self.random_state
            )
            tree.fit(X_train, y_train)
            self.trees.append(tree)
    
    def predict(self, X_test):
        """Predict continuous values for X_test."""
        predictions = np.array([tree.predict(X_test) for tree in self.trees])
        predicted_values = np.mean(predictions, axis=0)
        return predicted_values


In [12]:
# Initialize and fit model
clf = RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error', 
    max_depth=32,
    min_samples_leaf=4,
    max_features='sqrt', 
    min_impurity_decrease=0.0,
    random_state=0
)

In [13]:
clf.fit(X_train, y_train)

In [14]:
y_pred=clf.predict(X_test)

In [15]:
# Print the sum of NaN values to confirm their presence
print("NaN values in y_test before:", y_test.isna().sum())

# Replace NaN with 1
y_test.fillna(1, inplace=True)

# Verify the replacement
print("NaN values in y_test after:", y_test.isna().sum())

print(y_pred)


NaN values in y_test before: 1
NaN values in y_test after: 0
[17.09765441  0.67844612  5.97938903 ...  2.41585836  3.54634997
  0.15260814]


In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # Root Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print(f'RMSE : {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 627.7884973764155
RMSE : 25.05570788017005
Mean Absolute Error (MAE): 5.702624225119574


In [20]:
import joblib

joblib.dump(clf,'random_forest_regressor.joblib')

['random_forest_regressor.joblib']