<a href="https://colab.research.google.com/github/Reyuliandespa/Reyuliandespa/blob/main/LOWESS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing Libraries**

In [6]:
import pandas as pd # for data manipulation
import numpy as np # for data manipulation
from sklearn.linear_model import LinearRegression # to build a LR model for comparison
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization 
import statsmodels.api as sm # to build a LOWESS model
from scipy.interpolate import interp1d # for interpolation of new data points
import matplotlib.pyplot as plt



**Import Dataset**

In [7]:
# Read in data
df = pd.read_csv('/content/data.csv')
# Print Dataframe
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,0.98765,0.0,12.50,0,0.561,6.980,89.0,2.0980,3,320,23.0,396.00,12.00,12.0
507,0.23456,0.0,12.50,0,0.561,6.980,76.0,2.6540,3,320,23.0,343.00,25.00,32.0
508,0.44433,0.0,12.50,0,0.561,6.123,98.0,2.9870,3,320,23.0,343.00,21.00,54.0
509,0.77763,0.0,12.70,0,0.561,6.222,34.0,2.5430,3,329,23.0,343.00,76.00,67.0


**Validation Dataset**

In [23]:
#Menentukan Dimension dari Dataset
df.shape

(511, 14)

In [22]:
#Menentukan Informasi dari Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64  
 4   NOX      511 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64  
 9   TAX      511 non-null    int64  
 10  PTRATIO  511 non-null    float64
 11  B        511 non-null    float64
 12  LSTAT    511 non-null    float64
 13  MEDV     511 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 56.0 KB


In [21]:
#Menentukan nama kolom yang ada dalam Dataset
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

**Descriptive Dataset**

In [11]:
#Deskriptif dari dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,511.0,3.584139,8.564433,0.00632,0.082325,0.26169,3.621175,88.9762
ZN,511.0,11.252446,23.234838,0.0,0.0,0.0,12.5,100.0
INDUS,511.0,11.151096,6.828175,0.46,5.19,9.69,18.1,27.74
CHAS,511.0,0.068493,0.252838,0.0,0.0,0.0,0.0,1.0
NOX,511.0,0.554757,0.11531,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.287589,0.703802,3.561,5.8855,6.209,6.62975,8.78
AGE,511.0,68.616243,28.09913,2.9,45.05,77.3,94.05,100.0
DIS,511.0,3.783876,2.098631,1.1296,2.10035,3.1523,5.118,12.1265
RAD,511.0,9.485323,8.688469,1.0,4.0,5.0,24.0,24.0
TAX,511.0,407.440313,167.903532,187.0,279.5,330.0,666.0,711.0


**Cleaning Dataset**

In [12]:
#Mengecek Missing Value Dataset
df.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of       CRIM     ZN  INDUS   CHAS    NOX     RM    AGE    DIS    RAD    TAX  \
0    False  False  False  False  False  False  False  False  False  False   
1    False  False  False  False  False  False  False  False  False  False   
2    False  False  False  False  False  False  False  False  False  False   
3    False  False  False  False  False  False  False  False  False  False   
4    False  False  False  False  False  False  False  False  False  False   
..     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
506  False  False  False  False  False  False  False  False  False  False   
507  False  False  False  False  False  False  False  False  False  False   
508  False  False  False  False  False  False  False  False  False  False   
509  False  False  False  False  False  False  False  False  False  False   
510  False  False  False  False  False  False  False  False  False  False   

     PTRATIO 

**Data Analysis**

In [13]:
# Create a scatter plot
fig = px.scatter(df, x=df['DIS'], y=df['LSTAT'], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
fig.update_layout(title=dict(text="Boston House Prices Based on DIS-Weighted Distance to Five Downtown Boston", 
                             font=dict(color='black')))

# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

In [17]:
# ------- Select variables -------
# x values for Linear Regression
X=df['DIS'].values.reshape(-1,1) # Note, we need X to be a 2D array, hence reshape
# x values for LOWESS
x=df['DIS'].values 
# y values for both
y=df['LSTAT'].values


# ------- Linear Regression -------
# Define and fit the model
model1 = LinearRegression()
LR = model1.fit(X, y)

# Predict a few points with Linear Regression model for the grpah
# Create 20 evenly spaced points from smallest X to largest X
x_range = np.linspace(X.min(), X.max(), 20) 
# Predict y values for our set of X values
y_range = model1.predict(x_range.reshape(-1, 1))

# ------- LOWESS -------
# Generate y_hat values using lowess, try a couple values for hyperparameters
lowess = sm.nonparametric.lowess
y_hat1 = lowess(y, x) # note, default frac=2/3
y_hat2 = lowess(y, x, frac=1/5)

**Modelling**

In [18]:
# Create a scatter plot
fig = px.scatter(df, x=df['DIS'], y=df['LSTAT'], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Add the prediction line
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Linear Regression', line=dict(color='limegreen')))
fig.add_traces(go.Scatter(x=y_hat1[:,0], y=y_hat1[:,1], name='LOWESS, frac=2/3', line=dict(color='red')))
fig.add_traces(go.Scatter(x=y_hat2[:,0], y=y_hat2[:,1], name='LOWESS, frac=1/5', line=dict(color='orange')))

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')
# Set figure title
fig.update_layout(title=dict(text="Boston Home Prices Based on DIS-Weighted Distance to Five Boston Job Centers", 
                             font=dict(color='black')))

# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

**Predicting new values**

In [19]:
# ------- Define interploation functions -------
# Linear - draws a line between the two nearest points and calculates y value based on the slope of that line
f_linear = interp1d(y_hat1[:,0], y=y_hat1[:,1], bounds_error=False, kind='linear', fill_value='extrapolate') 
# Nearest - finds the nearest available point and takes its y value
f_nearest = interp1d(y_hat1[:,0], y=y_hat1[:,1], bounds_error=False, kind='nearest', fill_value='extrapolate') 

# Create a new set of points with x values 
xnew = [300, 600, 900, 1200, 1500, 1800, 2100, 6400]

# Find y values based on the two different interpolation methods
ynew_linear = f_linear(xnew)
ynew_nearest = f_nearest(xnew)

# Print results
print(ynew_linear)
print(ynew_nearest)

[ -106.59069412  -223.71167555  -340.83265697  -457.95363839
  -575.07461981  -692.19560123  -809.31658265 -2488.05064968]
[5.79606203 5.79606203 5.79606203 5.79606203 5.79606203 5.79606203
 5.79606203 5.79606203]
