# Linear Regression

In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,confusion_matrix
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv(r"C:\Users\samba\Downloads\concrete_data.csv")
data.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
data.info()
data = data.astype("float64")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   cement                         1030 non-null   float64
 1   blast_furnace_slag             1030 non-null   float64
 2   fly_ash                        1030 non-null   float64
 3   water                          1030 non-null   float64
 4   superplasticizer               1030 non-null   float64
 5   coarse_aggregate               1030 non-null   float64
 6   fine_aggregate                 1030 non-null   float64
 7   age                            1030 non-null   int64  
 8   concrete_compressive_strength  1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [4]:
data.isna().sum()

cement                           0
blast_furnace_slag               0
fly_ash                          0
water                            0
superplasticizer                 0
coarse_aggregate                 0
fine_aggregate                   0
age                              0
concrete_compressive_strength    0
dtype: int64

In [5]:
data.isnull().sum()

cement                           0
blast_furnace_slag               0
fly_ash                          0
water                            0
superplasticizer                 0
coarse_aggregate                 0
fine_aggregate                   0
age                              0
concrete_compressive_strength    0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   cement                         1030 non-null   float64
 1   blast_furnace_slag             1030 non-null   float64
 2   fly_ash                        1030 non-null   float64
 3   water                          1030 non-null   float64
 4   superplasticizer               1030 non-null   float64
 5   coarse_aggregate               1030 non-null   float64
 6   fine_aggregate                 1030 non-null   float64
 7   age                            1030 non-null   float64
 8   concrete_compressive_strength  1030 non-null   float64
dtypes: float64(9)
memory usage: 72.5 KB


In [7]:
features = list(set(data.columns)-set(["concrete_compressive_strength"]))
print(features)
target = list(["concrete_compressive_strength"])
print(target)

x = data[features].values
y = data[target].values

train_x,test_x,train_y,test_y = train_test_split(x,y,test_size = 0.3,random_state=5)

scaler = StandardScaler()

['water', 'cement', 'coarse_aggregate', 'age', 'superplasticizer', 'blast_furnace_slag', 'fine_aggregate ', 'fly_ash']
['concrete_compressive_strength']


In [8]:
scaler.fit(train_x)

StandardScaler()

In [9]:
train_x =scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [10]:
model = LinearRegression()

In [11]:
model.fit(train_x,train_y.ravel())

LinearRegression()

In [12]:
R_square_train = model.score(train_x,train_y)

In [13]:
R_square_train

0.6339399977109412

In [14]:
intercept = model.intercept_
slope = model.coef_

In [15]:
print("intercept:",intercept)
print("slope:",slope)

intercept: 35.97902912621356
slope: [-3.10703339 13.22653338  1.69183328  7.63242262  1.64104295  8.94103147
  1.53658253  5.74882429]


In [16]:
R_square_test = model.score(test_x,test_y)

In [17]:
R_square_test

0.5558268797595711

# Random forest regression

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
import matplotlib.pyplot as plt

In [19]:
from sklearn.metrics import mean_squared_error
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
df = pd.read_csv(r"C:\Users\samba\Downloads\concrete_data.csv")

In [20]:
features = list(set(df.columns)-set(["concrete_compressive_strength"]))
target = list(["concrete_compressive_strength"])

x = df[features].values
y = df[target].values

In [21]:
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size = 0.1,random_state=1)

In [22]:
base_pred = df[target].mean().values
base_pred = np.repeat(base_pred,len(test_y))

base_rmse = (mean_squared_error(test_y,base_pred))**0.5

In [23]:
scaler = StandardScaler()
scaler.fit_transform(train_x)
train_x =scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [24]:
rf = RandomForestRegressor(n_estimators=100,random_state=1)
rf.fit(train_x,train_y.ravel())

RandomForestRegressor(random_state=1)

In [25]:
pred_rf = rf.predict(test_x)


In [26]:
rf_rmse = (mean_squared_error(test_y,pred_rf))**0.5
rf_rmse

4.580444544057423

In [27]:
rf.score(train_x,train_y)

0.9846105477565998

In [28]:
rf.score(test_x,test_y)

0.9088220988390258

# SVM

In [29]:
df = pd.read_csv(r"C:\Users\samba\Downloads\concrete_data.csv")

In [30]:
features = list(set(df.columns)-set(["concrete_compressive_strength"]))
target = list(["concrete_compressive_strength"])

x = df[features].values
y = df[target].values

In [31]:
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size = 0.1,random_state=1)

In [32]:
scaler = StandardScaler()
scaler.fit_transform(train_x)
train_x =scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [33]:
from sklearn.svm import SVR
model = SVR(kernel = "rbf")
model.fit(train_x,train_y.ravel())

SVR()

In [34]:
model.score(test_x,test_y)

0.6524527129837586

In [35]:
model.score(train_x,train_y)

0.6863004756651223

# KNN

In [36]:
from sklearn.neighbors import KNeighborsRegressor


In [37]:
model  = KNeighborsRegressor(n_neighbors =6)
model.fit(train_x,train_y)
model.score(test_x,test_y)

0.7196285088963874

In [38]:
model.score(train_x,train_y)

0.8209063097816742

In [39]:
for i in range(1,100):
    model = KNeighborsRegressor(n_neighbors =i )
    model.fit(train_x,train_y)

    print(i,model.score(test_x,test_y))

1 0.5745499806773053
2 0.5612672515401487
3 0.6797695772362706
4 0.6595834211938536
5 0.7007530700488811
6 0.7196285088963874
7 0.7145662126422686
8 0.7038360846900964
9 0.6924054418474115
10 0.6895367689527846
11 0.6760820737101357
12 0.6625628256455137
13 0.6459710683964431
14 0.6469895221280005
15 0.648196080736581
16 0.6375640512176736
17 0.6470390421782763
18 0.6493535462411626
19 0.6431442054469323
20 0.6468957914061655
21 0.6446744355205801
22 0.6375285332691012
23 0.6303167080228993
24 0.6232044202344847
25 0.6220175832395417
26 0.6173378822756577
27 0.6110486508631721
28 0.6066432164071336
29 0.6008925801355672
30 0.5873755640860618
31 0.5891258154525786
32 0.5944803545697341
33 0.5995801820782416
34 0.6006075725879825
35 0.5911434130071479
36 0.5833660514306873
37 0.5804230963593562
38 0.5800254201715709
39 0.5787578594944263
40 0.5807629438570643
41 0.5800568487869059
42 0.5806567104549732
43 0.5790087539476754
44 0.5778834010841085
45 0.5706314298646721
46 0.570593305732163