# **Making Predictions with Validation and Test Data**

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings raised by the libraries used below; consider narrowing it to a
# specific warning category.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
from google.colab import drive

# The CSV files live on Google Drive, so the drive has to be mounted before they
# can be read. Mounting here keeps this cell runnable on a fresh kernel; when the
# drive is already mounted the call only reports that and leaves it in place.
drive.mount('/content/drive')

# Single place to change if the project folder moves.
PROJECT_DIR = "/content/drive/MyDrive/Supervised Learning/Buldozers Project"
valid_data_csv = f"{PROJECT_DIR}/Valid.csv"
test_data_csv = f"{PROJECT_DIR}/Test.csv"

# Raw (uncleaned) frames, loaded for inspection.
valid_data = pd.read_csv(valid_data_csv)
test_data = pd.read_csv(test_data_csv)

In [None]:
# Preview the first three rows of the raw validation data.
valid_data.head(3)

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1222837,902859,1376,121,3,1000,0.0,,1/5/2012 0:00,375L,...,None or Unspecified,None or Unspecified,None or Unspecified,None or Unspecified,Double,,,,,
1,1222839,1048320,36526,121,3,2006,4412.0,Medium,1/5/2012 0:00,TX300LC2,...,None or Unspecified,"12' 4""",None or Unspecified,Yes,Double,,,,,
2,1222841,999308,4587,121,3,2000,10127.0,Medium,1/5/2012 0:00,270LC,...,None or Unspecified,"12' 4""",None or Unspecified,None or Unspecified,Double,,,,,


In [None]:
# Mount Google Drive at /content/drive; the CSV and model paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install category_encoders



In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce

# **PROCESSING DATASET TO FEED INTO MODEL**

In [None]:

def clean_data(csv_path, target_col="MachineHoursCurrentMeter"):
    """Load a CSV and convert it into an all-numeric, standardized DataFrame.

    Steps, in order:
      1. Read ``csv_path`` and drop the identifier-like columns listed in
         ``drop_cols`` (only those actually present).
      2. Split the remaining columns into numeric and categorical by dtype.
      3. Impute missing values: most frequent value for categorical columns,
         median for numeric columns.
      4. Encode categorical columns: columns with fewer than 5 distinct values
         are label-encoded, all others are target-encoded against
         ``target_col``.
      5. Standardize every numeric column (this includes ``target_col`` itself
         and the freshly encoded columns).

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the file to load.
        target_col: Column whose values drive the target encoding. When the
            column is absent, a constant-zero target is used instead, so every
            target-encoded column becomes constant.

    Returns:
        pandas.DataFrame with the cleaned, encoded and scaled columns.

    Note:
        Every imputer, encoder and scaler is fit on the file passed in, so two
        different files are transformed with different statistics.
        NOTE(review): confirm this matches the preprocessing the downstream
        model was trained with.
    """
    data = pd.read_csv(csv_path)

    drop_cols = ["SalesID", "MachineID", "ModelID", "YearMade", "datasource", "auctioneerID"]
    data = data.drop(columns=[col for col in drop_cols if col in data.columns])

    # Anything that is not numeric is treated as categorical. Checking for
    # "numeric" instead of `dtype == "object"` keeps text columns out of the
    # median imputer even when pandas stores them with a non-object dtype.
    numerical_cols = [
        col for col in data.columns if pd.api.types.is_numeric_dtype(data[col])
    ]
    numerical_set = set(numerical_cols)
    object_cols = [col for col in data.columns if col not in numerical_set]

    cat_imputer = SimpleImputer(strategy="most_frequent")
    num_imputer = SimpleImputer(strategy="median")

    for col in object_cols:
        data[col] = cat_imputer.fit_transform(data[[col]]).ravel()

    for col in numerical_cols:
        if data[col].notna().any():
            data[col] = num_imputer.fit_transform(data[[col]]).ravel()
        else:
            # SimpleImputer discards a column with no observed values, which
            # would yield an empty array and a length-mismatch error on
            # assignment. Fill with a constant so the column survives.
            data[col] = 0.0

    for col in object_cols:
        if data[col].nunique() < 5:
            # Low-cardinality column: plain integer codes.
            data[col] = LabelEncoder().fit_transform(data[col])
        else:
            # High-cardinality column: replace categories by target statistics.
            if target_col in data.columns:
                encoding_target = data[target_col]
            else:
                # No target available: encode against a constant zero target.
                encoding_target = pd.Series(0, index=data.index)

            target_encoder = ce.TargetEncoder(cols=[col])
            target_encoder.fit(data[col], encoding_target)
            data[col] = target_encoder.transform(data[col])

    # "number" covers every numeric width, so integer codes are scaled even on
    # platforms where they are not int64.
    num_cols = data.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        data[num_cols] = StandardScaler().fit_transform(data[num_cols])
    return data


In [None]:
# Run the same cleaning pipeline over the validation CSV, then the test CSV.
valid_cleaned, test_cleaned = (
    clean_data(path) for path in (valid_data_csv, test_data_csv)
)

In [None]:
# Display the cleaned validation frame (pandas truncates the rendering).
valid_cleaned

Unnamed: 0,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,fiSecondaryDesc,fiModelSeries,fiModelDescriptor,ProductSize,fiProductClassDesc,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,-1.073571,0.451044,-0.326595,0.199106,0.095109,-0.387980,-0.026531,-0.047189,1.316954,0.935257,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
1,-0.016425,0.451044,-0.326595,-0.137983,-0.199345,0.436172,-6.635533,-0.047189,1.316954,0.795459,...,0.03108,1.935674,0.305984,5.472122,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
2,1.352929,0.451044,-0.326595,0.651154,0.093742,-0.387980,-0.026531,1.258110,1.316954,0.583445,...,0.03108,1.935674,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
3,0.048269,-1.498876,-0.326595,0.029128,0.990237,-0.393246,-0.026531,1.258110,1.316954,0.795459,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
4,0.879226,0.451044,0.049098,1.600400,0.762247,1.105192,-0.026531,-0.047189,-0.133131,0.656780,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11568,-0.166419,0.451044,1.030593,-0.179332,-0.350184,-0.675909,-0.026531,-0.047189,-2.001276,-1.101171,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
11569,-0.166419,0.451044,-0.741543,-0.093597,-0.350184,-0.675909,-6.635533,-0.047189,-2.001276,-1.048671,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
11570,-0.166419,0.451044,-0.741543,-0.179332,-0.350184,-0.675909,-0.026531,-0.047189,-2.001276,-1.101171,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512
11571,-0.166419,0.451044,1.030593,-0.179332,-0.350184,-0.675909,-0.026531,-0.047189,-2.001276,-1.101171,...,0.03108,0.015462,0.305984,-0.179718,-0.252766,0.0,-0.067291,-0.178901,0.069444,0.068512


In [None]:
# Preview the first five rows of the cleaned test frame.
test_cleaned.head()

Unnamed: 0,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,fiSecondaryDesc,fiModelSeries,fiModelDescriptor,ProductSize,fiProductClassDesc,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,-0.009443,-2.618393,-0.024715,0.018511,-0.07353,1.130796,0.345953,0.465464,0.683083,-0.124167,...,0.190398,0.191287,0.262716,-0.215738,-0.235203,0.0,0.313563,0.191328,0.073018,0.051842
1,0.079963,-5.517196,-0.034748,0.04666,0.007643,1.130796,0.345953,0.465464,0.683083,-0.087788,...,0.190398,0.191287,0.262716,-0.215738,-0.235203,0.0,0.313563,0.191328,0.073018,0.051842
2,-0.000994,0.280409,-0.034748,0.015766,-0.026783,-0.921342,0.345953,-2.036434,-1.26325,-0.106552,...,0.190398,-6.423196,-2.027457,-0.215738,-0.235203,0.0,0.313563,0.191328,0.073018,0.051842
3,0.00944,-5.517196,-0.034748,-0.047722,-0.041409,-0.890659,0.345953,0.465464,-1.26325,-0.090047,...,0.190398,0.191287,-2.027457,4.635252,4.251655,0.0,0.313563,0.191328,0.073018,0.051842
4,-0.014484,-2.618393,-0.034748,-0.00505,-0.071069,-1.088727,0.345953,0.465464,0.683083,-0.128635,...,0.190398,0.191287,0.262716,-0.215738,-0.235203,0.0,0.313563,0.191328,0.073018,0.051842


# **PREDICTIONS USING PRETRAINED MODEL**

In [None]:
import pickle
# Load the pretrained model object from Drive.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load a model file whose origin you trust.
with open('/content/drive/MyDrive/Supervised Learning/Buldozers Project/model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [None]:
# Score only the first five cleaned validation rows with the loaded model.
valid_data_predictions = loaded_model.predict(valid_cleaned.head(5))

In [None]:
# Print a header followed by one prediction per line.
print("Predictions of first 5 Elements of Valid Data set are:")
for number in valid_data_predictions:
    print(number)

Predctions of first 5 Elements of Valid Data set are:
63759.21
63759.21
63759.21
63759.21
60910.227


In [None]:
# Score only the first five cleaned test rows with the loaded model.
test_data_predictions = loaded_model.predict(test_cleaned.head(5))

In [None]:
# Print a header followed by one prediction per line.
print("Predictions of first 5 Elements of Test Data set are:")
for number in test_data_predictions:
    print(number)

Predctions of first 5 Elements of Test Data set are:
59347.766
59852.32
63759.21
60860.38
63759.21
