In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [41]:
from pathlib import Path

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from xgboost import XGBRegressor

In [5]:
import joblib

In [2]:
# Load the startup dataset from the project-relative data folder.
df = pd.read_csv(filepath_or_buffer = './data/50_Startups.csv')

In [7]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [15]:
# Preview the first three rows to check the column layout.
df.head(3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39


In [6]:
# Target: the 'Profit' column.
y = df['Profit']

In [24]:
# Features: every column of df except the target.
X = df.drop(columns = 'Profit')

In [12]:
# List the distinct values of the categorical 'State' column.
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [16]:
# Preprocessing: one-hot encode the column at position 3 of X (the 'State'
# column, given the column order of the loaded CSV) and pass all remaining
# columns through unchanged.
ct = ColumnTransformer(
    transformers = [
        ('onehot', OneHotEncoder(), [3]),
    ],
    remainder = 'passthrough',
)

In [None]:
# When the data-processing stage involves steps such as converting strings to
# numbers or feature scaling, it is best to handle them through a pipeline.

In [18]:
# First candidate model: linear regression with default settings.
regressor = LinearRegression()

In [19]:
# Chain preprocessing and linear regression into a single estimator.
# Pipeline keeps its steps by reference and fits them in place, so hand it a
# clone of `ct`; otherwise every pipeline built from the same `ct` object
# shares one transformer instance and each fit re-fits it for all of them.
pipe = Pipeline(steps = [ ('preprocessing', clone(ct)), ('modeling', regressor)])

In [25]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 20,
)

In [26]:
# Fit the preprocessing step and the linear model on the training split.
# The fitted pipeline is returned and shown as the cell output.
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('modeling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [49]:
# Predictions of the linear-regression pipeline on the held-out split.
y_pred = pipe.predict(X_test)

In [53]:
# Mean absolute error of the linear-regression pipeline on the test split.
mean_absolute_error(y_true = y_test, y_pred = y_pred)

7469.640826979974

In [54]:
# Mean squared error of the linear-regression pipeline on the test split.
mean_squared_error(y_true = y_test, y_pred = y_pred)

103140891.12164843

In [55]:
# R^2 of the linear-regression pipeline on the test split.
r2_score(y_true = y_test, y_pred = y_pred)

0.9438509847889336

In [35]:
# Second candidate model: random forest with default settings; random_state
# fixes its seed so repeated runs give the same forest.
regressor2 = RandomForestRegressor(random_state = 20)

In [36]:
# Chain preprocessing and the random forest into a single estimator.
# Pipeline keeps its steps by reference and fits them in place, so hand it a
# clone of `ct`; reusing the same `ct` object here would re-fit the
# transformer that other pipelines built from `ct` already hold.
pipe2 = Pipeline(steps = [ ('preprocessing', clone(ct)), ('modeling', regressor2)])

In [37]:
# Fit the preprocessing step and the random forest on the training split.
# The fitted pipeline is returned and shown as the cell output.
pipe2.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('modeling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [50]:
# Predictions of the random-forest pipeline on the held-out split.
y_pred2 = pipe2.predict(X_test)

In [52]:
# Mean squared error of the random-forest pipeline on the test split.
mean_squared_error(y_true = y_test, y_pred = y_pred2)

156324413.98745888

In [51]:
# R^2 of the random-forest pipeline on the test split.
r2_score(y_true = y_test, y_pred = y_pred2)

0.9148983317538881

In [42]:
# Third candidate model: XGBoost regressor with default settings;
# random_state fixes its seed so repeated runs give the same model.
regressor3 = XGBRegressor(random_state = 20)

In [43]:
# Chain preprocessing and the XGBoost regressor into a single estimator.
# Pipeline keeps its steps by reference and fits them in place, so hand it a
# clone of `ct`; reusing the same `ct` object here would re-fit the
# transformer that other pipelines built from `ct` already hold.
pipe3 = Pipeline(steps = [ ('preprocessing', clone(ct)), ('modeling', regressor3)])

In [44]:
# Fit the preprocessing step and the XGBoost regressor on the training split.
# The fitted pipeline is returned and shown as the cell output.
pipe3.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('modeling', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [46]:
# Predictions of the XGBoost pipeline on the held-out split.
y_pred3 = pipe3.predict(X_test)

In [47]:
# Mean squared error of the XGBoost pipeline on the test split.
mean_squared_error(y_true = y_test, y_pred = y_pred3)

300487368.955356

In [56]:
# R^2 of the XGBoost pipeline on the test split.
r2_score(y_true = y_test, y_pred = y_pred3)

0.836417257338721

In [57]:
# Persist the linear-regression pipeline (preprocessing + model) to disk. In
# the recorded run it had the highest test R^2 of the three pipelines.
# joblib.dump does not create missing parent folders, so make sure ./model
# exists first; on a fresh checkout the dump would otherwise fail with
# FileNotFoundError. The path is still passed as the original string so the
# returned file list is unchanged.
MODEL_PATH = './model/pipe.pkl'
Path(MODEL_PATH).parent.mkdir(parents = True, exist_ok = True)
joblib.dump(pipe, MODEL_PATH)

['./model/pipe.pkl']