In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

### Question 1
#### What's the version of NumPy that you installed?

In [2]:
np.__version__

'1.19.5'

### Question 2
#### What's the version of Pandas?

In [4]:
pd.__version__

'1.3.2'

### Reading car-price.csv into a pandas dataframe

In [138]:
data_path = Path("Data/car-price.csv")
full_data_path = Path.cwd().parents[1].joinpath(data_path)
df = pd.read_csv(full_data_path)

In [139]:
df.head(3)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350


In [140]:
df["Make"].unique()

array(['BMW', 'Audi', 'FIAT', 'Mercedes-Benz', 'Chrysler', 'Nissan',
       'Volvo', 'Mazda', 'Mitsubishi', 'Ferrari', 'Alfa Romeo', 'Toyota',
       'McLaren', 'Maybach', 'Pontiac', 'Porsche', 'Saab', 'GMC',
       'Hyundai', 'Plymouth', 'Honda', 'Oldsmobile', 'Suzuki', 'Ford',
       'Cadillac', 'Kia', 'Bentley', 'Chevrolet', 'Dodge', 'Lamborghini',
       'Lincoln', 'Subaru', 'Volkswagen', 'Spyker', 'Buick', 'Acura',
       'Rolls-Royce', 'Maserati', 'Lexus', 'Aston Martin', 'Land Rover',
       'Lotus', 'Infiniti', 'Scion', 'Genesis', 'HUMMER', 'Tesla',
       'Bugatti'], dtype=object)

In [141]:
df.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,MSRP
count,11914.0,11845.0,11884.0,11908.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,249.38607,5.628829,3.436093,26.637485,19.733255,1554.911197,40594.74
std,7.57974,109.19187,1.780559,0.881315,8.863001,8.987798,1441.855347,60109.1
min,1990.0,55.0,0.0,2.0,12.0,7.0,2.0,2000.0
25%,2007.0,170.0,4.0,2.0,22.0,16.0,549.0,21000.0
50%,2015.0,227.0,6.0,4.0,26.0,18.0,1385.0,29995.0
75%,2016.0,300.0,6.0,4.0,30.0,22.0,2009.0,42231.25
max,2017.0,1001.0,16.0,4.0,354.0,137.0,5657.0,2065902.0


### Question 3
#### What's the average price of BMW cars in the dataset?

In [144]:
df[df["Make"] == "BMW"]["MSRP"].mean()

61546.76347305389

### Question 4
#### Select a subset of cars after year 2015. How many of them have missing values for Engine HP?

In [145]:
df[df["Year"] >= 2015]["Engine HP"].isnull().sum()

51

### Question 5
- Calculate the average "Engine HP" in the dataset.
- Use the fillna method and to fill the missing values in "Engine HP" with the mean value from the previous step.
- Now, calculate the average of "Engine HP" again.
- Has it changed?

In [146]:
mean_hp_before = df["Engine HP"].mean()
mean_hp_before

249.38607007176023

In [147]:
df["Engine HP"].fillna(value=mean_hp_before, inplace=True)

In [148]:
mean_hp_after = df["Engine HP"].mean()
mean_hp_after

249.38607007176023

In [149]:
print(round(mean_hp_before))
print(round(mean_hp_after))

249
249


### Question 6
- Select all the "Rolls-Royce" cars from the dataset.
- Select only columns "Engine HP", "Engine Cylinders", "highway MPG".
- Now drop all duplicated rows using drop_duplicates method (you should get a dataframe with 7 rows).
- Get the underlying NumPy array. Let's call it X.
- Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
- Invert XTX.
- What's the sum of all the elements of the result?

In [83]:
df_rr = df[df["Make"] == "Rolls-Royce"][["Engine HP", "Engine Cylinders", "highway MPG"]]

In [84]:
df_rr.head(3)

Unnamed: 0,Engine HP,Engine Cylinders,highway MPG
2921,325.0,8.0,15
3505,563.0,12.0,19
5275,563.0,12.0,21


In [85]:
df_rr.drop_duplicates(inplace=True)

In [86]:
df_rr

Unnamed: 0,Engine HP,Engine Cylinders,highway MPG
2921,325.0,8.0,15
3505,563.0,12.0,19
5275,563.0,12.0,21
5279,563.0,12.0,20
7443,322.0,12.0,15
7553,453.0,12.0,19
11448,624.0,12.0,21


In [103]:
x = np.array(df_rr)

In [104]:
x

array([[325.,   8.,  15.],
       [563.,  12.,  19.],
       [563.,  12.,  21.],
       [563.,  12.,  20.],
       [322.,  12.,  15.],
       [453.,  12.,  19.],
       [624.,  12.,  21.]])

In [155]:
XTX = x.T.dot(x)

In [156]:
XTX

array([[1.754801e+06, 3.965600e+04, 6.519600e+04],
       [3.965600e+04, 9.280000e+02, 1.500000e+03],
       [6.519600e+04, 1.500000e+03, 2.454000e+03]])

In [157]:
XTX_inv = np.linalg.inv(XTX)

In [158]:
XTX_inv.sum()

0.032212320677486125

### Questions 7
- Create an array y with values [1000, 1100, 900, 1200, 1000, 850, 1300].
- Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
- What's the value of the first element of w?.

In [159]:
y = np.array([1000, 1100, 900, 1200, 1000, 850, 1300])

In [160]:
y

array([1000, 1100,  900, 1200, 1000,  850, 1300])

In [163]:
w = XTX_inv y)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 7 is different from 3)

In [136]:
np.matmul(w.T, y)

array([  3.92189026, -28.82146835,  59.21695232])