In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('machine.data')

In [3]:
df

Unnamed: 0,vendor_name,Model,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,sperry,80/8,124,1000,8000,0,1,8,42,37
205,sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
206,sratus,32,125,2000,8000,0,2,14,52,41
207,wang,vs-100,480,512,8000,32,0,0,67,47


In [4]:
df.drop(['Model'], axis=1, inplace=True)

In [5]:
df.columns

Index(['vendor_name', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP',
       'ERP'],
      dtype='object')

In [6]:
Y = df['ERP'].copy()
X = df.drop(['ERP'],axis=1, inplace=False).copy()

In [7]:
num_cols = X.columns[X.dtypes != 'object']
cat_cols = X.columns[X.dtypes == 'object']

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('oneHot', OneHotEncoder(sparse_output=False))
])

preprocessing_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)
])

In [9]:
X_processed = preprocessing_pipeline.fit_transform(X)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, Y, test_size=0.2)

In [11]:
X_train

array([[ 0.48597002, -0.48275071, -0.75190585, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.91102261, -0.67502598, -0.83738706, ...,  0.        ,
         0.        ,  0.        ],
       [-0.43068543, -0.2243162 , -0.32449978, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.67332952,  1.32629082,  1.72704937, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.37042522, -0.69156579, -0.94269992, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.29617196, -0.67502598, -0.32449978, ...,  0.        ,
         0.        ,  0.        ]])

In [12]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

linreg.fit(X_train, y_train)

In [13]:
preds = linreg.predict(X_test)

In [14]:
from sklearn.metrics import mean_squared_error

print("mse : ", mean_squared_error(y_test, preds))

mse :  776.7946229957657


In [20]:
from sklearn.ensemble import RandomForestRegressor

ref = RandomForestRegressor(n_estimators=1000)

ref.fit(X_train, y_train)

In [21]:
preds = ref.predict(X_test)
print("mse : ", mean_squared_error(y_test, preds))

mse :  572.3440957380952
