# Importing The Data & Doing Data Cleaning And Analysis

In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

#Import Models needed
from sklearn.ensemble import RandomForestRegressor

# Importing Evaluation Metrics & Tools
from sklearn.model_selection import train_test_split , cross_val_score

In [2]:
data = pd.read_csv("TrainAndValid.csv" , low_memory=False)
data['saledate'] = pd.to_datetime(data['saledate'])
data.head().T

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,66000.0,57000.0,10000.0,38500.0,11000.0
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3.0,3.0,3.0,3.0,3.0
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68.0,4640.0,2838.0,3486.0,722.0
UsageBand,Low,Low,High,High,Medium
saledate,2006-11-16 00:00:00,2004-03-26 00:00:00,2004-02-26 00:00:00,2011-05-19 00:00:00,2009-07-23 00:00:00


In [3]:
data.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412698 entries, 0 to 412697
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   SalesID                   412698 non-null  int64         
 1   SalePrice                 412698 non-null  float64       
 2   MachineID                 412698 non-null  int64         
 3   ModelID                   412698 non-null  int64         
 4   datasource                412698 non-null  int64         
 5   auctioneerID              392562 non-null  float64       
 6   YearMade                  412698 non-null  int64         
 7   MachineHoursCurrentMeter  147504 non-null  float64       
 8   UsageBand                 73670 non-null   object        
 9   saledate                  412698 non-null  datetime64[ns]
 10  fiModelDesc               412698 non-null  object        
 11  fiBaseModel               412698 non-null  object        
 12  fi

In [4]:
data.isna().sum()

SalesID                          0
SalePrice                        0
MachineID                        0
ModelID                          0
datasource                       0
auctioneerID                 20136
YearMade                         0
MachineHoursCurrentMeter    265194
UsageBand                   339028
saledate                         0
fiModelDesc                      0
fiBaseModel                      0
fiSecondaryDesc             140727
fiModelSeries               354031
fiModelDescriptor           337882
ProductSize                 216605
fiProductClassDesc               0
state                            0
ProductGroup                     0
ProductGroupDesc                 0
Drive_System                305611
Enclosure                      334
Forks                       214983
Pad_Type                    331602
Ride_Control                259970
Stick                       331602
Transmission                224691
Turbocharged                331602
Blade_Extension     

In [5]:
# Turn the Objects into Category data type
data = data.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412698 entries, 0 to 412697
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   SalesID                   412698 non-null  int64         
 1   SalePrice                 412698 non-null  float64       
 2   MachineID                 412698 non-null  int64         
 3   ModelID                   412698 non-null  int64         
 4   datasource                412698 non-null  int64         
 5   auctioneerID              392562 non-null  float64       
 6   YearMade                  412698 non-null  int64         
 7   MachineHoursCurrentMeter  147504 non-null  float64       
 8   UsageBand                 73670 non-null   category      
 9   saledate                  412698 non-null  datetime64[ns]
 10  fiModelDesc               412698 non-null  category      
 11  fiBaseModel               412698 non-null  category      
 12  fi

In [7]:
data["salesYear"] = data["saledate"][:4].astype(int)
data["salesMonth"] = data["saledate"][5:7].astype(int)
data["salesDay"] = data["saledate"][8:10].astype(int)
data.drop("saledate" , axis=1 , inplace=True)

In [9]:
data.head().T

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,66000.0,57000.0,10000.0,38500.0,11000.0
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3.0,3.0,3.0,3.0,3.0
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68.0,4640.0,2838.0,3486.0,722.0
UsageBand,Low,Low,High,High,Medium
fiModelDesc,521D,950FII,226,PC120-6E,S175


In [10]:
for col in data.select_dtypes(include='category').columns:
    data[col] = data[col].cat.codes + 1  # Add 1 so that -1 becomes 0

# Step 3: Fill missing values in categorical columns with the median of the column
for col in data.select_dtypes(include='category').columns:
    median_value = data[col].median()
    data[col].fillna(median_value, inplace=True)
    
# Step 4: Fill missing values in numeric columns (including float) with the mean of the column
for col in data.select_dtypes(include=[np.number, 'float']).columns:
    mean_value = data[col].mean()
    data.fillna({col : mean_value}, inplace=True)


In [11]:
data.shape

(412698, 55)

In [12]:
data.isna().sum()

SalesID                     0
SalePrice                   0
MachineID                   0
ModelID                     0
datasource                  0
auctioneerID                0
YearMade                    0
MachineHoursCurrentMeter    0
UsageBand                   0
fiModelDesc                 0
fiBaseModel                 0
fiSecondaryDesc             0
fiModelSeries               0
fiModelDescriptor           0
ProductSize                 0
fiProductClassDesc          0
state                       0
ProductGroup                0
ProductGroupDesc            0
Drive_System                0
Enclosure                   0
Forks                       0
Pad_Type                    0
Ride_Control                0
Stick                       0
Transmission                0
Turbocharged                0
Blade_Extension             0
Blade_Width                 0
Enclosure_Type              0
Engine_Horsepower           0
Hydraulics                  0
Pushblock                   0
Ripper    

In [13]:
# Remove The Sales Price Out Of The Data Set
X = data.drop("SalePrice" , axis = 1)
y = data["SalePrice"]

# Split the Data Into Train And Test Sets
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2)

In [14]:
# Instantiate The Model
model = RandomForestRegressor()

# Fit the Model
model.fit(X_train,y_train)

y_preds = model.predict(X_test)

# Score The Model's Performance
score = model.score(X_test,y_test)

In [15]:
# See The Initial Score Of the Model
score

0.834658377461367

## Using Cross validation Score To Get More Precise Accuracy

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Assuming X and y are your features and target variables
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Use cross_val_score with the R² scoring metric
scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2' , n_jobs=-1)

# Print the mean and standard deviation of the scores
print(f"R² Scores: {scores}")
print(f"Mean R² Score: {scores.mean()}")
print(f"Standard Deviation of R² Scores: {scores.std()}")

R² Scores: [0.75552398 0.76701457 0.68876044 0.67207015 0.64164044]
Mean R² Score: 0.7050019176362375
Standard Deviation of R² Scores: 0.04849950947093231


In [None]:
# Using MSE
mse_scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error' , n_jobs=-1)
print(f"Mean MSE: {mse_scores.mean()}")

In [None]:
# Using MSE
mse_scores = -cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error' , n_jobs=-1)
print(f"Mean MAE: {mse_scores.mean()}")

## HyperParameter Tuning Of The Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42))
])

param_grid = {
    'regressor__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'regressor__max_depth': [10, 20, 30],        # Maximum depth of the tree
    'regressor__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'regressor__min_samples_leaf': [1, 2, 4],    # Minimum number of samples required at a leaf node
    'regressor__max_features': ['auto', 'sqrt'], # Number of features to consider for the best split
    'regressor__bootstrap': [True, False]        # Whether bootstrap samples are used when building trees
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters: ", best_params)