# [과제 2] 회귀분석
### - Ch 1, Ch 2를 토대로 자유롭게 회귀분석과 회귀진단을 진행해주세요.
### - 주석으로 설명 및 근거 자세하게 달아주시면 감사하겠습니다. :)

In [1]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

# Data 

데이터 출처 : https://www.kaggle.com/avikasliwal/used-cars-price-prediction 

< y > 
* **Price** : The price of the used car in INR Lakhs.



< X > 
* Name : The brand and model of the car
* Location : The location in which the car is being sold or is available for purchase.
* Year : The year or edition of the model.
* Kilometers_Driven : The total kilometres driven in the car by the previous owner(s) in KM.
* Fuel_Type : The type of fuel used by the car. (Petrol, Diesel, Electric, CNG, LPG)
* Transmission : The type of transmission used by the car. (Automatic / Manual)
* Owner_Type : Whether the ownership is Firsthand, Second hand or other.
* Mileage : The standard mileage offered by the car company in kmpl or km/kg
* Engine : The displacement volume of the engine in CC.
* Power : The maximum power of the engine in bhp.
* Seats : The number of seats in the car.
* New_Price : The price of a new car of the same model.

In [2]:
# Load Data 
data = pd.read_csv("assignment2_data.csv").iloc[:,1:]
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


## 결측치 확인

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [4]:
data.shape

(6019, 13)

In [5]:
# 간단한 분석을 위해 날리기
data = data.drop(columns='New_Price').dropna()
data.shape

(5975, 12)

In [6]:
y = data['Price']

In [7]:
numeric_columns = data.select_dtypes(include=['int64']).columns.to_list()
object_columns = data.select_dtypes(include=['object']).columns.to_list()

In [8]:
x = pd.concat([data[numeric_columns], pd.get_dummies(data[object_columns])], axis=1)

## Data Split

In [9]:
x.shape, y.shape

((5975, 2836), (5975,))

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=2020)

In [12]:
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((4780, 2836), (1195, 2836), (4780,), (1195,))

## Modeling

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

### Linear Regression

In [14]:
LR = LinearRegression()

In [15]:
LR.fit(train_x, train_y)

LinearRegression()

In [16]:
pred_lr = LR.predict(test_x)

In [18]:
mean_squared_error(test_y, pred_lr)

511765176956.9023

### Ridge

In [19]:
ridge = Ridge()

In [20]:
ridge.fit(train_x, train_y)

Ridge()

In [21]:
pred_ridge = ridge.predict(test_x)

In [22]:
mean_squared_error(test_y, pred_ridge)

32.10872394868262

### Lasso

In [23]:
lasso = Lasso()

In [24]:
lasso.fit(train_x, train_y)

Lasso()

In [25]:
pred_lasso = lasso.predict(test_x)

In [26]:
mean_squared_error(test_y, pred_lasso)

96.02882366116908

### RandomForestRegressor

In [28]:
rf = RandomForestRegressor()

In [29]:
rf.fit(train_x, train_y)

RandomForestRegressor()

In [30]:
pred_rf = rf.predict(test_x)

In [31]:
mean_squared_error(test_y, pred_rf)

32.722962890639906