In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.decomposition import PCA

In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# 1.0 Mount your google drive 
#      so that you can access data files 
#      on your Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [12]:
train=pd.read_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/train.csv')
test=pd.read_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/test.csv')
sample_submission=pd.read_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/sample_submission.csv')

In [13]:
X=train.drop(columns='loss')
y=train.loss

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(150654, 131) (37664, 131) (150654,) (37664,)


In [15]:
num_feat=X_train.select_dtypes(include='number').columns.to_list()
cat_feat=X_train.select_dtypes(exclude='number').columns.to_list()

In [16]:
num_pipe=Pipeline([
    ('impute',SimpleImputer(strategy='median')),
    ('scale',StandardScaler())
])

cat_pipe= Pipeline([
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('encode',OneHotEncoder(handle_unknown='ignore'))
])

ct=ColumnTransformer(remainder='drop',
                    transformers=[
                        ('num',num_pipe, num_feat),
                        ('cat', cat_pipe, cat_feat)]
                    )
model_adaboost=Pipeline([
    ('transformer', ct),
    ('predictor', AdaBoostRegressor())
])

model_xgb=Pipeline([
    ('transformer', ct),
    ('predictor', XGBRegressor())
])


In [17]:
pipelines=[model_adaboost,model_xgb]

In [18]:

# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, y_train)



In [20]:
# Dictionary of pipelines and regressor types for ease of reference
pipe_dict = {0: 'Adaboost Regressor', 1: 'Extreme Gradient boost Regressor'}

In [21]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Adaboost Regressor Test Accuracy: -3.540383042758089
Extreme Gradient boost Regressor Test Accuracy: 0.5451813870037665


In [22]:
y_pred=model_xgb.predict(test)

In [23]:
submission=pd.DataFrame({'id':sample_submission.id, 'loss': y_pred})
submission.to_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/submission_xgb.csv', index=False)
result=pd.read_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/submission_xgb.csv')
result.head()

Unnamed: 0,id,loss
0,4,1609.0675
1,6,2218.7783
2,9,10002.085
3,12,6226.2305
4,15,1216.6356


In [24]:
y_pred=model_adaboost.predict(test)

In [25]:
submission=pd.DataFrame({'id':sample_submission.id, 'loss': y_pred})
submission.to_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/submission_adaboost.csv', index=False)
result=pd.read_csv('/content/drive/MyDrive/Term 4/BDDA/Project work/All state claims/data/submission_adaboost.csv')
result.head()

Unnamed: 0,id,loss
0,4,7381.32347
1,6,8175.75611
2,9,11997.110289
3,12,11554.128439
4,15,7381.32347
