In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import preprocessing

# Data loading and preprocessing

In [None]:
# Read the training set into a DataFrame and preview its first five rows.
df = pd.read_csv('CE802_P3_Data.csv')
df.head(5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,Target
0,11.85,-258.09,-6.95,2557.29,Low,Europe,7.59,-28.22,-259.22,-1578.12,-141.76,9,6,20.19,169.26,1.64,999.41
1,11.99,-169.71,-16.06,2862.87,Low,Rest,2.57,-13.76,-321.8,-1914.16,-165.83,6,6,9.51,6259.05,0.08,0.0
2,3.48,-317.76,-8.14,2175.81,Medium,UK,1.6,-32.76,-253.54,457.12,-203.87,15,10,7.02,38504.82,6.4,266.29
3,14.75,-394.26,-5.61,1927.86,Low,UK,7.33,-13.42,-149.86,-1972.04,-145.52,6,10,33.99,20883.9,21.98,598.77
4,10.25,-365.46,-10.33,3187.98,High,USA,4.37,-32.66,-224.58,-2022.36,-153.94,12,10,4.29,12095.91,2167.48,396.9


In [None]:
# Print each column's dtype and non-null count for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 17 columns):
F1        1500 non-null float64
F2        1500 non-null float64
F3        1500 non-null float64
F4        1500 non-null float64
F5        1500 non-null object
F6        1500 non-null object
F7        1500 non-null float64
F8        1500 non-null float64
F9        1500 non-null float64
F10       1500 non-null float64
F11       1500 non-null float64
F12       1500 non-null int64
F13       1500 non-null int64
F14       1500 non-null float64
F15       1500 non-null float64
F16       1500 non-null float64
Target    1500 non-null float64
dtypes: float64(13), int64(2), object(2)
memory usage: 199.3+ KB


In [None]:
# Summary statistics for every column; include='all' also reports the
# object (text) columns via count / unique / top / freq.
df.describe(include='all')

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,Target
count,1500.0,1500.0,1500.0,1500.0,1500,1500,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
unique,,,,,5,4,,,,,,,,,,,
top,,,,,Very low,USA,,,,,,,,,,,
freq,,,,,312,407,,,,,,,,,,,
mean,10.757127,-336.28788,-7.285153,2671.68444,,,4.054593,-19.753333,-213.443933,-1334.077107,-158.306867,11.842,7.885333,12.12498,7073.88498,127.205547,922.082533
std,3.02425,87.920436,3.040026,889.404292,,,2.906426,8.366284,59.715228,602.841367,42.241393,5.510107,3.620072,8.681726,13329.106617,1548.308592,1092.331874
min,1.33,-643.65,-16.99,-310.65,,,0.04,-52.26,-450.88,-3435.84,-348.81,0.0,0.0,0.12,-53986.65,0.0,0.0
25%,8.64,-394.38,-9.265,2063.7975,,,1.9075,-23.71,-253.54,-1752.225,-179.38,9.0,6.0,5.835,365.31,0.26,0.0
50%,10.76,-333.87,-7.39,2663.205,,,3.37,-19.79,-213.35,-1326.91,-158.245,12.0,8.0,10.245,7070.715,2.02,429.57
75%,12.88,-277.35,-5.33,3286.7025,,,5.4725,-15.97,-171.71,-932.38,-138.3075,15.0,10.0,16.4775,13207.14,13.61,1530.205


In [None]:
# Replace every object (text) column of the training frame with integer
# category codes, in place.
# NOTE: the codes follow the sorted label order, not any natural ranking of
# the labels (in the preview: High -> 0, Low -> 1, Medium -> 2).
col_categorical = df.select_dtypes(include=['object']).columns
df[col_categorical] = df[col_categorical].apply(
    lambda text_column: text_column.astype('category').cat.codes
)
df.head(5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,Target
0,11.85,-258.09,-6.95,2557.29,1,0,7.59,-28.22,-259.22,-1578.12,-141.76,9,6,20.19,169.26,1.64,999.41
1,11.99,-169.71,-16.06,2862.87,1,1,2.57,-13.76,-321.8,-1914.16,-165.83,6,6,9.51,6259.05,0.08,0.0
2,3.48,-317.76,-8.14,2175.81,2,2,1.6,-32.76,-253.54,457.12,-203.87,15,10,7.02,38504.82,6.4,266.29
3,14.75,-394.26,-5.61,1927.86,1,2,7.33,-13.42,-149.86,-1972.04,-145.52,6,10,33.99,20883.9,21.98,598.77
4,10.25,-365.46,-10.33,3187.98,0,3,4.37,-32.66,-224.58,-2022.36,-153.94,12,10,4.29,12095.91,2167.48,396.9


In [None]:
# Build the model inputs: y is the label column, X is every other column
# standardised column-wise by preprocessing.scale.
# NOTE(review): the scaling statistics are computed on all rows, i.e. before
# the train/validation split that follows.
y = df['Target']
X = preprocessing.scale(df.drop('Target', axis='columns'))

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state keeps
# the partition identical across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: fit on the training split, predict the validation split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Print the validation metrics in the order MAE, MSE, R-squared.
for metric_name, metric_fn in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("R2_score", r2_score),
):
    print(metric_name, metric_fn(y_test, y_pred))

MAE 482.87746368596197
MSE 375523.8664282802
R2_score 0.7170938382699268


# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the bootstrap samples and feature choices -- and hence the
# metrics printed below -- are the same on every run. The value matches the
# seed used for the train/validation split.
rf_regressor = RandomForestRegressor(random_state=2)

rf_regressor.fit(X_train, y_train)  # fit random forest on training data
y_pred = rf_regressor.predict(X_test)  # make prediction on validation data

print("MAE", mean_absolute_error(y_test, y_pred))  # mean absolute error
print("MSE", mean_squared_error(y_test, y_pred))  # mean squared error
print("R2_score", r2_score(y_test, y_pred))  # R-square score

MAE 480.71706699999993
MSE 441339.99633027276
R2_score 0.6675103354486573


# XGBoost Regression

In [None]:
import xgboost as xgb

# Gradient-boosted trees with depth capped at 3 and row / column subsampling
# (subsample=0.8, colsample_bytree=0.5).
xgb_params = dict(max_depth=3, subsample=0.8, colsample_bytree=0.5)
xgb_regressor = xgb.XGBRegressor(**xgb_params)
xgb_regressor.fit(X_train, y_train)

# Score the validation split and print MAE, MSE and R-squared.
y_pred = xgb_regressor.predict(X_test)
for metric_name, metric_fn in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("R2_score", r2_score),
):
    print(metric_name, metric_fn(y_test, y_pred))

MAE 381.3126948908488
MSE 247785.69882372185
R2_score 0.8133271750406542


# Prediction on a hold-out test set

In [None]:
# Read the hold-out set (same columns as the training file; Target is blank
# in the previewed rows) and show its first five rows.
df_test = pd.read_csv('CE802_P3_Test.csv')
df_test.head(5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,Target
0,11.23,-195.54,-1.19,1468.56,Very high,USA,8.97,-23.62,-249.36,-854.18,-155.2,12,10,12.39,-3480.87,0.04,
1,14.89,-426.24,-1.18,3049.08,Very high,USA,6.33,-39.26,-226.26,-2126.68,-159.42,9,8,5.19,8831.19,43.68,
2,6.76,-493.47,-13.55,3197.13,Very low,USA,1.77,-25.84,-238.3,-2270.78,-212.73,12,10,3.3,-4468.44,0.52,
3,15.12,-320.04,-12.17,2436.0,Very low,Rest,5.42,-17.32,-203.64,-304.24,-100.34,18,12,6.51,22851.6,758.54,
4,10.12,-387.99,-7.11,2800.89,Very low,Europe,1.39,-12.78,-265.16,-1419.76,-137.49,0,14,14.22,24396.09,0.68,


In [None]:
#this cell converts object columns to encodings
# Encode the hold-out categoricals with the category levels found in the
# training file rather than the levels present in the hold-out file. A code is
# the position of a label in the sorted level list, so deriving the levels from
# df_test alone would shift the integers whenever a label is missing from (or
# new in) this file, and the models would silently receive wrong categories.
col_categorical = df_test.select_dtypes(include=['object']).columns
if len(col_categorical) > 0:  # nothing left to encode if the cell is re-run
    # df already holds integer codes at this point, so re-read the raw labels.
    train_labels = pd.read_csv('CE802_P3_Data.csv', usecols=list(col_categorical))
    for col in col_categorical:
        train_levels = train_labels[col].astype('category').cat.categories
        # categorical values ==> numeric values
        # (a label that never occurs in the training file becomes -1)
        df_test[col] = pd.Categorical(df_test[col], categories=train_levels).codes
df_test.head(5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,Target
0,11.23,-195.54,-1.19,1468.56,3,3,8.97,-23.62,-249.36,-854.18,-155.2,12,10,12.39,-3480.87,0.04,
1,14.89,-426.24,-1.18,3049.08,3,3,6.33,-39.26,-226.26,-2126.68,-159.42,9,8,5.19,8831.19,43.68,
2,6.76,-493.47,-13.55,3197.13,4,3,1.77,-25.84,-238.3,-2270.78,-212.73,12,10,3.3,-4468.44,0.52,
3,15.12,-320.04,-12.17,2436.0,4,1,5.42,-17.32,-203.64,-304.24,-100.34,18,12,6.51,22851.6,758.54,
4,10.12,-387.99,-7.11,2800.89,4,0,1.39,-12.78,-265.16,-1419.76,-137.49,0,14,14.22,24396.09,0.68,


In [None]:
# Standardise the hold-out features with the mean / standard deviation of the
# training features, i.e. the same transformation that was applied to the data
# the models were fitted on. Scaling the hold-out file with its own statistics
# would put its columns on a different scale from the one the models learned.
# NOTE: this rebinds X_test, which until now held the validation split.
holdout_scaler = preprocessing.StandardScaler().fit(df.drop('Target', axis=1))
X_test = holdout_scaler.transform(df_test.drop('Target', axis=1))

In [None]:
# Predict the hold-out rows with the fitted XGBoost model, then store the
# predictions in df_test's Target column.
df_test_pred = xgb_regressor.predict(X_test)
df_test['Target'] = df_test_pred

In [None]:
# Write the hold-out frame, now including the predicted Target, without the index.
# NOTE(review): the path is the same file the hold-out data was read from, so
# the original input is replaced, and the categorical columns are saved as the
# integer codes assigned earlier rather than their text labels. Confirm this
# is the intended deliverable.
df_test.to_csv('CE802_P3_Test.csv',index=False) #export data to csv