# 1. Headfiles

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 2. Data Preprocess

## check the data info

In [2]:
# load and shape 
df = pd.read_csv('archive/concrete.csv')
# print(df)
print(f'============ shapes ============')
print(f'shape: {df.shape}')
print(f'============ infos ============')
df.info()
# print(df.describe())

shape: (1030, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Cement               1030 non-null   float64
 1   BlastFurnaceSlag     1030 non-null   float64
 2   FlyAsh               1030 non-null   float64
 3   Water                1030 non-null   float64
 4   Superplasticizer     1030 non-null   float64
 5   CoarseAggregate      1030 non-null   float64
 6   FineAggregate        1030 non-null   float64
 7   Age                  1030 non-null   int64  
 8   CompressiveStrength  1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


## check missing data and duplicated lines

In [3]:
print(f'============ has duplicates ===========')
print(f'dataset has duplicates: {df.duplicated().any()}')
print(f'There are {df.duplicated().sum()} duplicated lines')
print(f'============ has missing data ===========')
print(f'dataset has missing data: {df.isna().any().any()}')
print(df.isna().sum())

dataset has duplicates: True
There are 25 duplicated lines
dataset has missing data: False
Cement                 0
BlastFurnaceSlag       0
FlyAsh                 0
Water                  0
Superplasticizer       0
CoarseAggregate        0
FineAggregate          0
Age                    0
CompressiveStrength    0
dtype: int64


## clean missing and duplicated data

In [4]:
df_clean = df.drop_duplicates()
print(f'There are {df_clean.duplicated().sum()} duplicated lines after cleaning')

There are 0 duplicated lines after cleaning


## check outliers

In [5]:
# zscore 
df_zscore = df_clean.copy()
thresh = 3.0
df_zscore = (df_zscore - df_zscore.mean()) / df_zscore.std()
df_outliner = df_zscore.abs() > thresh
# print(df_outliner)
row_has_outlier = df_outliner.any(axis=1)
print(f'There are {row_has_outlier.sum()} lines having outliers(thresh={thresh})')
df_final = df_clean.loc[~row_has_outlier].copy()
print(f"shape of dataset after cleaning outliers: {df_final.shape}")

There are 49 lines having outliers(thresh=3.0)
shape of dataset after cleaning outliers: (956, 9)


# 3. Train the model

## split datasets

In [6]:
X = df_final.drop(columns=['CompressiveStrength']).values
y = df_final['CompressiveStrength'].values
print(f'input shape: {X.shape}')
print(f'output shape: {y.shape}')

seed = 32
# split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state=seed
)
print(f'train input shape:  {X_train.shape}')
print(f'train output shape: {X_test.shape}')
print(f'test input shape:   {y_train.shape}')
print(f'test output shape:  {y_test.shape}')

input shape: (956, 8)
output shape: (956,)
train input shape:  (764, 8)
train output shape: (192, 8)
test input shape:   (764,)
test output shape:  (192,)


- task b: Calculate the linear regression model parameters that minimize MSE on the training data set.

In [7]:
model = LinearRegression()
model.fit(X_train, y_train)
print(f"bias: {model.intercept_}")
print(f"weights: {model.coef_}")
print(f"number of parameters: {len(model.coef_) + 1}")

bias: 6.86616680082108
weights: [ 0.11020514  0.08777844  0.0629237  -0.17754718  0.38681716  0.0040966
  0.00714792  0.21685304]
number of parameters: 9


# 4. Test the model

- task c. Report the test MSE of your linear regression model.
- task d. Report the R2 coefficient of your model on the test data set.

In [8]:
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"MSE: {mse_test}")
r2_test = r2_score(y_test, y_test_pred)
print(f"R2: {r2_test}")

MSE: 76.11834841285956
R2: 0.723595456980569


# task e. data normalization and compare the results

In [9]:
X = df_final.drop(columns=['CompressiveStrength']).values
y = df_final['CompressiveStrength'].values
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)

seed = 32
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size = 0.2, random_state=seed
)

model = LinearRegression()
model.fit(X_train, y_train)
print(f"bias: {model.intercept_}")
print(f"weights: {model.coef_}")
print(f"number of parameters: {len(model.coef_) + 1}")

y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'MSE scaled: {mse_test}')
r2_test = r2_score(y_test, y_test_pred)
print(f'R2 scaled: {r2_test}')

bias: 34.62915770294921
weights: [11.26465128  7.46024179  4.0634448  -3.56887083  2.09187325  0.31750076
  0.55576     8.01509665]
number of parameters: 9
MSE scaled: 76.11834841285958
R2 scaled: 0.7235954569805689
