# PREDICTING THE ESTIMATED RELATIVE PERFORMANCE OF A COMPUTER USING A MULTIPLE LINEAR REGRESSION MODEL

DATASET SOURCE : "UCI MACHINE LEARNING REPOSITORY"

## IMPORTING THE LIBRARIES 

In [94]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## IMPORTING THE DATASET

In [73]:
# Column labels handed to read_csv via `names=`, in file order.
names = [
    'VENDOR', 'MODEL_NAME', 'MYCT', 'MMIN', 'MMAX',
    'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP',
]
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data',
    names=names,
)
# Preview the first ten rows.
data.head(10)

Unnamed: 0,VENDOR,MODEL_NAME,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
5,amdahl,470v/b,26,8000,32000,64,8,32,318,290
6,amdahl,580-5840,23,16000,32000,64,16,32,367,381
7,amdahl,580-5850,23,16000,32000,64,16,32,489,381
8,amdahl,580-5860,23,16000,64000,64,16,32,636,749
9,amdahl,580-5880,23,32000,64000,128,32,64,1144,1238


 Attribute Information:
   1. vendor name: 30 
      (adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec, 
       dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson, 
       microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, 
       sratus, wang)
   2. Model Name: many unique symbols
   3. MYCT: machine cycle time in nanoseconds (integer)
   4. MMIN: minimum main memory in kilobytes (integer)
   5. MMAX: maximum main memory in kilobytes (integer)
   6. CACH: cache memory in kilobytes (integer)
   7. CHMIN: minimum channels in units (integer)
   8. CHMAX: maximum channels in units (integer)
   9. PRP: published relative performance (integer)
  10. ERP: estimated relative performance from the original article (integer)

## SPLITTING THE DATASET INTO DEPENDENT AND INDEPENDENT VARIABLES

## REMOVING THE UNWANTED COLUMNS WHICH HAVE NO SIGNIFICANCE 
## MODEL_NAME is UNIQUE for each computer hence it doesn't help the model in learning 


In [74]:
# Features: every column except MODEL_NAME and the ERP target, selected by label.
X = data[['VENDOR', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP']].values
# Target: ERP, the estimated relative performance.
y = data['ERP'].values

#WE HAVE NEGLECTED THE INDEX 1 WHICH IS ASSOCIATED WITH THE MODEL NAME

# CATEGORICAL VARIABLES HANDLING

In [75]:
# Inspect the feature matrix: vendor name in column 0, numeric columns after it.
X

array([['adviser', 125, 256, ..., 16, 128, 198],
       ['amdahl', 29, 8000, ..., 8, 32, 269],
       ['amdahl', 29, 8000, ..., 8, 32, 220],
       ...,
       ['sratus', 125, 2000, ..., 2, 14, 52],
       ['wang', 480, 512, ..., 0, 0, 67],
       ['wang', 480, 1000, ..., 0, 0, 45]], dtype=object)

In [76]:
# Inspect column 0 (VENDOR), the only non-numeric feature.
X[:,0]

array(['adviser', 'amdahl', 'amdahl', 'amdahl', 'amdahl', 'amdahl',
       'amdahl', 'amdahl', 'amdahl', 'amdahl', 'apollo', 'apollo', 'basf',
       'basf', 'bti', 'bti', 'burroughs', 'burroughs', 'burroughs',
       'burroughs', 'burroughs', 'burroughs', 'burroughs', 'burroughs',
       'c.r.d', 'c.r.d', 'c.r.d', 'c.r.d', 'c.r.d', 'c.r.d', 'cdc', 'cdc',
       'cdc', 'cdc', 'cdc', 'cdc', 'cdc', 'cdc', 'cdc', 'cambex',
       'cambex', 'cambex', 'cambex', 'cambex', 'dec', 'dec', 'dec', 'dec',
       'dec', 'dec', 'dg', 'dg', 'dg', 'dg', 'dg', 'dg', 'dg',
       'formation', 'formation', 'formation', 'formation', 'formation',
       'four-phase', 'gould', 'gould', 'gould', 'hp', 'hp', 'hp', 'hp',
       'hp', 'hp', 'hp', 'harris', 'harris', 'harris', 'harris', 'harris',
       'harris', 'harris', 'honeywell', 'honeywell', 'honeywell',
       'honeywell', 'honeywell', 'honeywell', 'honeywell', 'honeywell',
       'honeywell', 'honeywell', 'honeywell', 'honeywell', 'honeywell',
       'i

# DUMMY VARIABLE ENCODING

In [77]:
# One-hot encode the vendor name (column 0) and keep the numeric columns as-is.
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, so the column selection is done with
# ColumnTransformer instead. OneHotEncoder accepts string categories directly,
# so no LabelEncoder pass is needed beforehand.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

onenc = ColumnTransformer(
    [('vendor', OneHotEncoder(), [0])],
    remainder='passthrough',  # untouched columns are appended after the dummies
    sparse_threshold=0,       # always return a dense array
)
# The passthrough columns come from an object array, so cast the result to float.
X = np.asarray(onenc.fit_transform(X), dtype=float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [78]:
# Inspect the feature matrix again after the encoding step.
X

array([[  1.,   0.,   0., ...,  16., 128., 198.],
       [  0.,   1.,   0., ...,   8.,  32., 269.],
       [  0.,   1.,   0., ...,   8.,  32., 220.],
       ...,
       [  0.,   0.,   0., ...,   2.,  14.,  52.],
       [  0.,   0.,   0., ...,   0.,   0.,  67.],
       [  0.,   0.,   0., ...,   0.,   0.,  45.]])

# AVOIDING THE DUMMY VARIABLE TRAP

In [79]:
# Drop column 0 (the first dummy column) and keep all the others.
X = np.delete(X, 0, axis=1)

#IF WE HAVE N DUMMY VARIABLES THEN WE HAVE TO INCLUDE ONLY N-1 OF THEM IN THE MODEL

# Splitting the dataset into the Training set and Test set

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## FEATURE SCALING

In [81]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to both splits.
sc_x = StandardScaler().fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

#THIS STEP IS OPTIONAL: LinearRegression DOES NOT SCALE THE FEATURES ITSELF, BUT AN ORDINARY LEAST SQUARES FIT DOES NOT NEED SCALED FEATURES

# Model creation

In [82]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

# Model prediction

In [83]:
# Predictions of the fitted model for every row of the test split.
Y_pred = reg.predict(X=X_test)

## ACCURACY OF THE MODEL (R² SCORE ON THE TEST SET)

In [95]:
# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy; here it is evaluated on the test split.
accuracy1 = reg.score(X=X_test, y=y_test)

In [97]:
# Show the test-set score as a percentage.
print(100 * accuracy1, '%')

92.02717404590773 %


# WE HAVE CREATED A MODEL WHICH PREDICTS THE ESTIMATED RELATIVE PERFORMANCE OF A COMPUTER WITH AN R² SCORE OF 0.92 (92.02%) ON THE TEST SET