In [2]:
import pandas as pd
import os
os.chdir("F:\PML\Datasets")
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [3]:
boston = pd.read_csv("Boston.csv")
boston.shape

(506, 14)

In [4]:
train, test = train_test_split(boston, test_size=0.3,random_state=23)
train.shape, test.shape

((354, 14), (152, 14))

In [5]:
X_train = train.drop('medv', axis=1)
y_train = train['medv']
X_test = test.drop('medv', axis=1)
y_test = test['medv']

In [6]:
train.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
481,5.70818,0.0,18.1,0,0.532,6.75,74.9,3.3317,24,666,20.2,393.07,7.74,23.7
97,0.12083,0.0,2.89,0,0.445,8.069,76.0,3.4952,2,276,18.0,396.9,4.21,38.7
51,0.04337,21.0,5.64,0,0.439,6.115,63.0,6.8147,4,243,16.8,393.97,9.43,20.5
293,0.08265,0.0,13.92,0,0.437,6.127,18.4,5.5027,4,289,16.0,396.9,8.58,23.9
27,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,4,307,21.0,306.38,17.28,14.8


In [7]:
test.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
176,0.07022,0.0,4.05,0,0.51,6.02,47.2,3.5549,5,296,16.6,393.23,10.11,23.2
311,0.79041,0.0,9.9,0,0.544,6.122,52.8,2.6403,4,304,18.4,396.9,5.98,22.1
94,0.04294,28.0,15.04,0,0.464,6.249,77.3,3.615,4,270,18.2,396.9,10.59,20.6
139,0.54452,0.0,21.89,0,0.624,6.151,97.9,1.6687,4,437,21.2,396.9,18.46,17.8
232,0.57529,0.0,6.2,0,0.507,8.337,73.3,3.8384,8,307,17.4,385.91,2.47,41.7


degree = 1

In [8]:
lr = LinearRegression()
lr.fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_test, ycap))

0.6947991644651352


degree = 2

In [9]:
poly = PolynomialFeatures(degree=2)
X_poly_trn = poly.fit_transform(X_train)
X_poly_trn.shape

(354, 105)

In [10]:
lr.fit(X_poly_trn, y_train)
X_poly_tst = poly.transform(X_test)
ycap = lr.predict(X_poly_tst)
print(r2_score(y_test, ycap))

0.00455531305791923


### Using pipeline

In [11]:
from sklearn.pipeline import Pipeline


In [12]:
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline([('Poly',poly),('LR',lr)])
pipe.fit(X_train,y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

0.00455531305791923


In [13]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('Poly',poly),('LR',lr)])
pipe.fit(X_train,y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

-1821.658003099728


### Housing dataset

In [14]:
housing = pd.read_csv("Housing.csv")
dum_house = pd.get_dummies(housing, drop_first=True)
X = dum_house.drop('price', axis=1)
y = dum_house['price']

In [15]:
train, test = train_test_split(dum_house, test_size=0.3, random_state=23)
train.shape, test.shape

((382, 12), (164, 12))

In [16]:
X_train = train.drop('price', axis=1)
y_train = train['price']
X_test = test.drop('price', axis=1)
y_test = test['price']

#### degree1

In [17]:
poly = PolynomialFeatures(degree=1)
lr = LinearRegression()
lr.fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_test, ycap))

0.6543071090954233


#### degree2

In [18]:
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline([('Poly',poly),('LR',lr)])
pipe.fit(X_train,y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

0.4634636630848509


#### degree3

In [19]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('Poly',poly),('LR',lr)])
pipe.fit(X_train,y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

-5.352084238213048


### Concrete dataset

In [20]:
concrete = pd.read_csv("F:/PML/Cases/Concrete_Strength/Concrete_Data.csv")
concrete.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [21]:
concrete.columns

Index(['Cement', 'Blast', 'Fly', 'Water', 'Superplasticizer', 'Coarse', 'Fine',
       'Age', 'Strength'],
      dtype='object')

In [22]:
X = concrete.drop('Strength',axis=1)
y = concrete['Strength']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=23)
train.shape, test.shape

((382, 12), (164, 12))

In [24]:
poly = PolynomialFeatures(degree=1)
lr = LinearRegression()
lr.fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_test, ycap))

0.6312960386440598


In [25]:
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline([('Poly',poly),('LR',lr)])
pipe.fit(X_train,y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

0.7855626359926299


In [26]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('Poly',poly),('LR',lr)])
pipe.fit(X_train,y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

0.8368847587605153


### Test concrete dataset

In [27]:
test = pd.read_csv(r"F:/PML/Cases/Concrete_Strength/testConcrete.csv")
test.columns

Index(['Cement', 'Blast', 'Fly', 'Water', 'Superplasticizer', 'Coarse', 'Fine',
       'Age'],
      dtype='object')

#### Fit the best model on the whole data

In [28]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('POLY', poly),('LR',lr)])
pipe.fit(X,y)

### Inferencing
#### Generate the predicitons with the best model

In [29]:
predictions = pipe.predict(test)
predictions

array([  39.11015739,   -6.21614465,  107.15163234, -165.567413  ,
        156.19113074, 1044.44304327,  110.16737577,  142.0640814 ,
        342.42362709,  879.45243757,   62.44577774,  448.73136952,
       -116.59107538,  477.86228734])

In [30]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('POLY', poly),('LR',lr)])
kfold = KFold(n_splits=5, shuffle=True, random_state=23)
results = cross_val_score(pipe, X, y, cv=kfold)
results

NameError: name 'KFold' is not defined