In [1]:
import pandas as pd
import numpy as np
import sqlite3
import random
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.linalg import lstsq
from sklearn.preprocessing import scale
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy import stats


In [2]:
data = pd.read_csv("task-2.csv" , sep='\t')

# The second task consists of fitting a model to a set of data points that contains outliers. 
- Train / test * 3
- MAE score : Fit a linear regression with the outliers
- MAE score : Fit a huber regression with the outliers
- MAE score : Fit a linear regression without the outliers

### Fit a linear regression with the outliers

#### Here I am cleaning the data

In [3]:
data_2 = data['x1,x2,x3,y'].str.split(",", expand = True).add_prefix('col_')
data_2  = data_2.rename(columns = {'col_3': 'y'}, inplace = False)
data_2["col_0"] = pd.to_numeric(data_2["col_0"])
data_2["col_1"] = pd.to_numeric(data_2["col_1"])
data_2["col_2"] = pd.to_numeric(data_2["col_2"])
data_2  = data_2.rename(columns = {'col_0': 'x1'}, inplace = False)
data_2  = data_2.rename(columns = {'col_1': 'x2'}, inplace = False)
data_2  = data_2.rename(columns = {'col_2': 'x3'}, inplace = False)
data_2["y"] = pd.to_numeric(data_2["y"])
display(data_2)

Unnamed: 0,x1,x2,x3,y
0,7.892,318.818,162.970,2112.420
1,8.830,303.180,181.399,2096.231
2,13.811,296.231,145.849,2067.045
3,12.863,325.830,167.996,2269.262
4,13.698,254.035,171.892,1966.604
...,...,...,...,...
995,7.573,305.690,150.542,2033.060
996,5.948,331.843,182.158,2101.776
997,7.568,385.722,159.005,2429.810
998,15.332,335.927,166.665,2431.463


#### Data Randomization

In [4]:
data_3 = data_2.sample(frac=1)

#### Splitting the data

In [5]:
from sklearn.model_selection import train_test_split
X = data_3.drop('y', axis = 1).values
y = data_3.y.values                                
                                       
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.20, random_state=42)

#### Fitting the model

In [6]:
reg = linear_model.LinearRegression()

In [7]:
reg.fit(X_train, y_train)

LinearRegression()

#### Train the model

In [8]:
train_reg = reg.predict(X_train)
test_reg = reg.predict(X_validation)
X_reg = reg.predict(X)

In [9]:
train_rmse_2 = np.sqrt(mean_squared_error(train_reg, y_train))
test_rmse_2 = np.sqrt(mean_squared_error(test_reg, y_validation))
test_rmse_test = np.sqrt(mean_squared_error(X_reg, y))
print('Train RMSE')
print(train_rmse_2)
print('Test RMSE')
print(test_rmse_2)
print('Test RMSE')
print(test_rmse_test)

Train RMSE
145.58865327332748
Test RMSE
157.84891613823933
Test RMSE
148.12191206825653


#### Showing the results

In [10]:
result_linear_outliers = mean_absolute_error(X_reg, y)
display(result_linear_outliers)

114.20970663293755

### Fit a huber regression with the outliers

In [11]:
from sklearn.model_selection import train_test_split
X = data_3.drop('y', axis = 1).values
y = data_3.y.values                                
                                       
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.20, random_state=42)

#### Fitting the model HuberRegressor

In [12]:
reg = linear_model.HuberRegressor()

In [13]:
reg.fit(X_train, y_train)

HuberRegressor()

#### Train the model

In [14]:
train_reg = reg.predict(X_train)
test_reg = reg.predict(X_validation)
X_reg = reg.predict(X)

In [15]:
train_rmse_2 = np.sqrt(mean_squared_error(train_reg, y_train))
test_rmse_2 = np.sqrt(mean_squared_error(test_reg, y_validation))
test_rmse_test = np.sqrt(mean_squared_error(X_reg, y))
print('Train RMSE')
print(train_rmse_2)
print('Test RMSE')
print(test_rmse_2)
print('Test RMSE')
print(test_rmse_test)

Train RMSE
210.82007222016253
Test RMSE
334.317718784374
Test RMSE
240.64444664144898


#### Get the results

In [16]:
result_huber_outliers = mean_absolute_error(X_reg, y)

### Fitting the linear regression without the outlier

#### Filtering outliers

In [17]:
data_3 = data_2.sample(frac=1)
display(data_3)

Unnamed: 0,x1,x2,x3,y
418,10.078,291.242,166.263,2021.911
329,14.764,296.431,164.224,2159.377
278,12.236,296.181,169.016,2095.413
705,15.372,276.343,165.649,2065.954
114,12.544,267.038,166.504,2067.543
...,...,...,...,...
349,13.564,315.191,168.595,2289.325
58,11.177,336.730,158.043,2303.636
355,10.637,359.531,175.415,2467.904
93,13.062,333.718,170.173,2252.324


In [18]:
data_3.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x1,1000.0,12.32219,8.455445,-49.652,9.7315,12.1565,14.18925,100.692
x2,1000.0,319.329936,68.823726,-375.335,296.80425,320.8575,341.74925,899.041
x3,1000.0,158.595832,25.032233,-146.921,152.2155,159.457,166.3565,344.797
y,1000.0,2204.251068,158.109451,1733.474,2090.981,2204.511,2312.266,2713.866


In [19]:
for i in data_3:
    data_3 = data_3.loc[(data_3[i] > data_3[i].quantile(0.2) - (1.5*(data_3[i].quantile(0.8)-data_3[i].quantile(0.2)))) & (data_3[i] < data_3[i].quantile(0.8) + (1.5*(data_3[i].quantile(0.8)-data_3[i].quantile(0.2))))]
data_4 = data_3

#### Checking changes

In [20]:
data_4.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x1,950.0,11.989403,3.046165,3.889,9.78425,12.0645,14.049,21.807
x2,950.0,319.442454,30.930711,222.001,297.7085,320.9985,340.427,414.861
x3,950.0,159.524124,10.061288,127.701,152.656,159.5815,166.207,191.869
y,950.0,2202.843472,159.076232,1733.474,2090.04275,2202.635,2312.21375,2713.866


#### 50 lines have been deleted, fitting now the model

In [21]:
from sklearn.model_selection import train_test_split
X = data_4.drop('y', axis = 1).values
y = data_4.y.values                                
                                       
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.20, random_state=42)

In [22]:
reg = linear_model.LinearRegression()


In [23]:
reg.fit(X_train, y_train)

LinearRegression()

#### Fitting the model

In [24]:
train_reg = reg.predict(X_train)
test_reg = reg.predict(X_validation)
X_reg = reg.predict(X)

In [25]:
train_rmse_2 = np.sqrt(mean_squared_error(train_reg, y_train))
test_rmse_2 = np.sqrt(mean_squared_error(test_reg, y_validation))
test_rmse_test = np.sqrt(mean_squared_error(X_reg, y))
print('Train RMSE')
print(train_rmse_2)
print('Test RMSE')
print(test_rmse_2)
print('Test RMSE')
print(test_rmse_test)

Train RMSE
70.55644064486113
Test RMSE
65.96073856196345
Test RMSE
69.66155941051986


#### result of the model

In [26]:
result_linear_whithout_outliers = mean_absolute_error(X_reg, y)
display(result_linear_whithout_outliers)

53.19716216081502

In [27]:
result_linear_outliers

114.20970663293755

In [28]:
result_huber_outliers

103.90240105791605

In [29]:
result_linear_whithout_outliers

53.19716216081502

In [30]:
Outliers have a heavy impact on the results. It is more important to filter well outliers rather than choosing the right regression model

SyntaxError: invalid syntax (<ipython-input-30-bf603e1097f7>, line 1)