In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...,...
193568,193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [4]:
df = df.dropna()

In [5]:
x = df.iloc[:, 1:-1].values
x

array([[1.52, 'Premium', 'F', ..., 7.27, 7.33, 4.55],
       [2.03, 'Very Good', 'J', ..., 8.06, 8.12, 5.05],
       [0.7, 'Ideal', 'G', ..., 5.69, 5.73, 3.5],
       ...,
       [0.73, 'Very Good', 'F', ..., 5.72, 5.75, 3.62],
       [0.34, 'Very Good', 'D', ..., 4.45, 4.49, 2.81],
       [0.71, 'Good', 'E', ..., 5.73, 5.71, 3.48]], dtype=object)

In [6]:
y = df.iloc[:, -1].values
y

array([13619, 13387,  2772, ...,  3036,   681,  2258], dtype=int64)

## Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [8]:
df.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [8]:
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [1,2,3])], remainder="passthrough")

In [9]:
x = np.array(ct.fit_transform(x))

In [10]:
x

array([[0.0, 0.0, 0.0, ..., 7.27, 7.33, 4.55],
       [0.0, 0.0, 0.0, ..., 8.06, 8.12, 5.05],
       [0.0, 0.0, 1.0, ..., 5.69, 5.73, 3.5],
       ...,
       [0.0, 0.0, 0.0, ..., 5.72, 5.75, 3.62],
       [0.0, 0.0, 0.0, ..., 4.45, 4.49, 2.81],
       [0.0, 1.0, 0.0, ..., 5.73, 5.71, 3.48]], dtype=object)

## Train test split

In [11]:
scores = {}

## Decision Tree Regression

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
decision_tree = DecisionTreeRegressor()

In [14]:
decision_tree.fit(x, y)

In [15]:
y_pred = decision_tree.predict(x)

In [16]:
from sklearn.metrics import r2_score

In [17]:
scores.update({"decision tree": r2_score(y_true=y, y_pred=y_pred)})
r2_score(y_true=y, y_pred=y_pred)

0.9999843782555866

## Random Forest Regression

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
random_forest = RandomForestRegressor(n_estimators=10)

In [20]:
x = pd.DataFrame(x).dropna().values

In [21]:
random_forest.fit(x, y)

In [22]:
y_pred_2 = random_forest.predict(x)

In [23]:
scores.update({"random forest": r2_score(y_true=y, y_pred=y_pred_2)})
r2_score(y_true=y, y_pred=y_pred_2)

0.9956263944814805

## Support Vector Regression

In [25]:
from sklearn.svm import SVR

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
scale_x = StandardScaler()
scale_y = StandardScaler()

In [28]:
scaled_x = scale_x.fit_transform(x)
scaled_y = scale_y.fit_transform(y.reshape(-1,1))

In [29]:
scaled_x

array([[-0.10271641, -0.25273373, -0.956195  , ...,  1.40135286,
         1.4604562 ,  1.47441384],
       [-0.10271641, -0.25273373, -0.956195  , ...,  2.11343711,
         2.17711958,  2.20018717],
       [-0.10271641, -0.25273373,  1.04581178, ..., -0.02281563,
         0.00898607, -0.04971015],
       ...,
       [-0.10271641, -0.25273373, -0.956195  , ...,  0.00422554,
         0.02712944,  0.12447544],
       [-0.10271641, -0.25273373, -0.956195  , ..., -1.14051749,
        -1.11590329, -1.05127735],
       [-0.10271641,  3.95673339, -0.956195  , ...,  0.01323926,
        -0.00915731, -0.07874109]])

In [30]:
scaled_y

array([[ 2.3919124 ],
       [ 2.33440643],
       [-0.29673958],
       ...,
       [-0.23130176],
       [-0.81503693],
       [-0.42414505]])

In [31]:
svr = SVR(kernel="rbf")

In [None]:
svr.fit(scaled_x, scaled_y)

  y = column_or_1d(y, warn=True)


In [None]:
y_pred_3 = scale_y.inverse_transform(
    svr.predict(
        scale_x.fit_transform(
            x)).reshape(-1,1))

In [None]:
scores.update({"SVR": r2_score(y_pred=y_pred_3, y_true=scale_y.inverse_transform(y))})
r2_score(y_pred=y_pred_3, y_true=scale_y.inverse_transform(y))

In [None]:
plt.plot(range(1, len(y) + 1), scale_y.inverse_transform(y), color="blue")
plt.plot(range(1, len(y) + 1), y_pred_3, color="orange")

## Polynomial Regression

In [24]:
from sklearn.preprocessing import PolynomialFeatures

In [25]:
poly = PolynomialFeatures(degree=2)

In [26]:
x_poly = poly.fit_transform(x)

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
poly_regressor = LinearRegression()

In [29]:
poly_regressor.fit(x,y)

In [30]:
y_pred_4 = poly_regressor.predict(x)

In [31]:
scores.update({"Polynomial regression": r2_score(y_true=y, y_pred=y_pred_4)})
r2_score(y_true=y, y_pred=y_pred_4)

0.9445187495282248

## Multiple Linear Regression

In [32]:
multiple_regression = LinearRegression()

In [33]:
multiple_regression.fit(x, y)

In [34]:
y_pred_5 = multiple_regression.predict(x)

In [35]:
scores.update({"multiple regression": r2_score(y_pred=y_pred_5, y_true=y)})
r2_score(y_pred=y_pred_5, y_true=y)

0.9445187495282248

## Verdict

In [36]:
scores

{'decision tree': 0.9999843782555866,
 'random forest': 0.9956263944814805,
 'Polynomial regression': 0.9445187495282248,
 'multiple regression': 0.9445187495282248}

In [37]:
max = 0
name = ""
for key in scores:
    if(scores[key] > max):
        max = scores[key]
        name = key
print(f"name : {name}, score : {max}")

name : decision tree, score : 0.9999843782555866


## Test dataset

In [57]:
test_df = pd.read_csv("test.csv")

In [47]:
test_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,193573,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54,2.82
1,193574,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87,3.68
2,193575,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74,3.55
3,193576,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42,2.73
4,193577,1.20,Very Good,I,VS2,62.7,56.0,6.75,6.79,4.24
...,...,...,...,...,...,...,...,...,...,...
129045,322618,0.72,Ideal,D,VVS2,62.0,56.0,5.75,5.78,3.57
129046,322619,0.70,Premium,D,SI1,59.6,62.0,5.77,5.74,3.43
129047,322620,1.01,Premium,G,VVS2,62.3,58.0,6.44,6.41,4.01
129048,322621,1.35,Ideal,D,I1,62.0,56.0,7.05,7.08,4.38


In [58]:
test_df = test_df.iloc[:,1:].values
test_df

array([[0.35, 'Ideal', 'D', ..., 4.51, 4.54, 2.82],
       [0.77, 'Very Good', 'F', ..., 5.83, 5.87, 3.68],
       [0.71, 'Ideal', 'I', ..., 5.77, 5.74, 3.55],
       ...,
       [1.01, 'Premium', 'G', ..., 6.44, 6.41, 4.01],
       [1.35, 'Ideal', 'D', ..., 7.05, 7.08, 4.38],
       [1.07, 'Premium', 'H', ..., 6.49, 6.45, 4.06]], dtype=object)

In [62]:
test_df = ct.fit_transform(test_df)

In [63]:
pred = decision_tree.predict(test_df)

In [69]:
id = pd.read_csv("test.csv").iloc[:,0]

In [70]:
id

0         193573
1         193574
2         193575
3         193576
4         193577
           ...  
129045    322618
129046    322619
129047    322620
129048    322621
129049    322622
Name: id, Length: 129050, dtype: int64

In [84]:
answer = pd.DataFrame([id,pred])

In [85]:
answer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,129040,129041,129042,129043,129044,129045,129046,129047,129048,129049
id,193573.0,193574.0,193575.0,193576.0,193577.0,193578.0,193579.0,193580.0,193581.0,193582.0,...,322613.0,322614.0,322615.0,322616.0,322617.0,322618.0,322619.0,322620.0,322621.0,322622.0
Unnamed 0,868.0,2375.0,2217.0,730.0,5645.0,605.0,14103.0,3480.0,15287.0,1550.0,...,2705.0,1002.0,3390.0,731.0,4516.0,3729.0,2559.0,7839.0,3328.0,4284.0
