In [77]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv("USA_Housing.csv")
df = pd.DataFrame(data)
df

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05
...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06


In [3]:
# dividing the dataset into input and output features
x = df.drop(columns = ['Price']).values
y = df['Price'].values.reshape(-1, 1)

In [4]:
# normalizing/standardizing the values of input features
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [27]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf
r2_scores = []

In [30]:
for fold, (train_index, test_index) in enumerate(kf.split(x)):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    x_train_bias = np.c_[np.ones((x_train.shape[0], 1)), x_train]
    x_test_bias = np.c_[np.ones((x_test.shape[0], 1)), x_test]

    beta = np.linalg.inv(x_train_bias.T @ x_train_bias) @ x_train_bias.T @ y_train

    y_pred = x_test_bias @ beta

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    if r2>best_r2:
        best_r2 = r2
        best_beta = beta

print(r2_scores)
print(best_r2)
print(best_beta)

[0.9179971706985152, 0.9145677884802818, 0.9116116385364474, 0.9193091764960789, 0.9243869413350311]
0.9243869413350311
[[-2.63157196e+06]
 [ 2.16033277e+01]
 [ 1.65386268e+05]
 [ 1.20424768e+05]
 [ 6.34893338e+02]
 [ 1.51806193e+01]]


In [34]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

x_train_bias = np.c_[np.ones((x_train.shape[0], 1)), x_train]
x_test_bias = np.c_[np.ones((x_test.shape[0], 1)), x_test]

y_train_pred = x_train_bias @ best_beta
y_test_pred = x_test_bias @ best_beta

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# we compare values of r1_score & r2_score if they are close means model works well
print(train_r2)
print(test_r2)

0.9210965848949021
0.9108982123504624


In [30]:
#Q2
#reading data
data = pd.read_csv("USA_Housing.csv")
df = pd.DataFrame(data)
df
# intiliasing the input and output variable
x = df.drop(columns = ["Price"]).values
y = df['Price'].values

In [17]:
# dividing the dataset into train, val, test
x_train, x_temp, y_train, y_temp = train_test_split(x, y, train_size=0.56, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=30/44, random_state=42)
print(x_train.shape)
print(x_test.shape)
print(x_val.shape)

(2800, 5)
(1500, 5)
(700, 5)


In [18]:
#standardization of values
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

In [19]:
#adding intercept
x_train = np.c_[np.ones((x_train.shape[0], 1)), x_train]
x_test = np.c_[np.ones((x_test.shape[0], 1)), x_test]
x_val = np.c_[np.ones((x_val.shape[0])), x_val]

In [20]:
#executign gradient descent
def gradient_descent(x, y, alpha, iters):
    m, n = x.shape
    theta = np.zeros(n)
    for i in range(iters):
        prediction = x.dot(theta)
        error = prediction - y
        gradient = (1/m)*x.T.dot(error)
        theta = theta - alpha*gradient
    return theta

In [25]:
#calc r2 score for each learnig rate
learning_rates = [0.001, 0.01, 0.1, 1]
theta = []
r2_scores_test = []
r2_scores_val = []
for alpha in learning_rates:
    t = gradient_descent(x_train, y_train, alpha, 1000)
    theta.append(t)
    y_pred_test = x_test.dot(t)
    y_pred_val = x_val.dot(t)
    r2_scores_test.append(r2_score(y_test, y_pred_test))
    r2_scores_val.append(r2_score(y_val, y_pred_val))

In [29]:
#theta with max r2_scores_val
theta[np.argmax(r2_scores_val)]

array([1225106.34781021,  231827.54854547,  166006.22902472,
        120763.07797071,    2922.26769971,  152609.02782229])

In [42]:
#Q3
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names = ["symboling", "normalized_losses",
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"] )
df = pd.DataFrame(df)
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [43]:
df.replace('?', np.nan, inplace = True)
df

df.dropna(subset=['price'], inplace=True)
df.isnull().sum()

symboling             0
normalized_losses    37
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 0
dtype: int64

In [44]:
from sklearn.impute import SimpleImputer
columns_with_nan = ['normalized_losses', 'bore', 'stroke', 'horsepower', 'peak_rpm']
imputer1 = SimpleImputer(missing_values=np.nan, strategy="mean")
df[columns_with_nan] = imputer1.fit_transform(df[columns_with_nan])
imputer2 = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
df[['num_doors']] = imputer2.fit_transform(df[['num_doors']])

In [45]:
map_to_num = {"one": 1, "two": 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven':7, 'eight':8, 'twelve':12}
df['num_cylinders'] = df['num_cylinders'].map(map_to_num)
df['num_doors'] = df['num_doors'].map(map_to_num)

In [46]:
df = pd.get_dummies(df, columns = ['body_style', 'drive_wheels'])

In [49]:
label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [65]:
df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in x else 0)

In [54]:
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in x else 0)

In [68]:
df.head(5)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,highway_mpg,price,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,27,13495,True,False,False,False,False,False,False,True
1,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,27,16500,True,False,False,False,False,False,False,True
2,1,122.0,0,1,0,2,0,94.5,171.2,65.5,...,26,16500,False,False,True,False,False,False,False,True
3,2,164.0,1,1,0,4,0,99.8,176.6,66.2,...,30,13950,False,False,False,True,False,False,True,False
4,2,164.0,1,1,0,4,0,99.4,176.6,66.4,...,22,17450,False,False,False,True,False,True,False,False


In [67]:
x = df.drop(columns = ['price']).values
x
y = df['price'].values.reshape(-1, 1)
y

x_train, x_test, y_train, y_test = train_test_split(x, y , train_size=0.7, random_state=42)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [81]:
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse)
print(r2)

17151884.34801706
0.819057028143646


In [82]:
pca = PCA(n_components=0.95, random_state = 42)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)
lr_pca = LinearRegression()
lr.fit(x_train_pca, y_train)

y_pred = lr.predict(x_test_pca)

mse_pca = mean_squared_error(y_test, y_pred)
r2_pca = r2_score(y_test, y_pred)
print(mse_pca)
print(r2_pca)

17909762.246648006
0.8110618320182486
