In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train_set=pd.read_csv('train_set.csv')
test_set=pd.read_csv('test_set.csv')

In [3]:
train_set.isna().sum()

Id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [4]:
diamond= train_set.drop("price", axis=1)
diamond_labels = train_set["price"].copy()

In [5]:
from sklearn.preprocessing import OrdinalEncoder


from sklearn.preprocessing import StandardScaler

In [6]:
from sklearn.compose import ColumnTransformer
cat_col =list(diamond[['cut','color','clarity']])
num_col =list(diamond[['table','carat','depth','x','y','z']])


In [7]:
from sklearn.pipeline import Pipeline


In [8]:
num_pipeline=Pipeline([("standardize", StandardScaler()),])
cat_pipeline=Pipeline([('Ordinally',OrdinalEncoder()),])

In [9]:
full_pipeline=ColumnTransformer([
 ("num", num_pipeline, num_col),
 ("cat", cat_pipeline, cat_col),])

In [10]:
diamond=full_pipeline.fit_transform(diamond)

In [11]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(diamond,diamond_labels)

LinearRegression()

In [12]:
from sklearn.metrics import mean_squared_error

diamond_predictions = lin_reg.predict(diamond)
lin_mse = mean_squared_error(diamond_labels, diamond_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

1345.287570044896


In [13]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state = 42)
tree_reg.fit(diamond,diamond_labels)

DecisionTreeRegressor(random_state=42)

In [14]:
diamond_predictions = tree_reg.predict(diamond)
tree_mse = mean_squared_error(diamond_labels,diamond_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

8.613423979024963

In [15]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, diamond, diamond_labels, scoring ="neg_mean_squared_error",cv = 10)
tree_rmse_scores = np.sqrt(-scores)

In [16]:
print("Scores: ", tree_rmse_scores)
print("Mean: ", tree_rmse_scores.mean())
print("Standard Deviation: ", tree_rmse_scores.std())

Scores:  [767.54666895 754.68510368 742.70414464 762.28119038 766.30711042
 758.0903989  733.1326257  731.11509216 716.50216499 799.80211537]
Mean:  753.2166615192614
Standard Deviation:  22.372536573913717


In [17]:
scores = cross_val_score(lin_reg, diamond, diamond_labels, scoring ="neg_mean_squared_error",cv = 10)
lin_rmse_scores = np.sqrt(-scores)

In [18]:
print("Scores: ", lin_rmse_scores)
print("Mean: ", lin_rmse_scores.mean())
print("Standard Deviation: ", lin_rmse_scores.std())

Scores:  [1331.50344025 1338.97749839 1374.8139444  1300.05128573 1378.51403463
 1331.29673722 1399.63831318 1313.68770208 1351.5557633  1354.82560624]
Mean:  1347.4864325401645
Standard Deviation:  29.124144870859627


In [19]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg,diamond, diamond_labels,scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-forest_scores)

In [20]:
print("Scores: ", forest_rmse_scores)
print("Mean: ", forest_rmse_scores.mean())
print("Standard Deviation: ", forest_rmse_scores.std())

Scores:  [571.37041187 544.36800889 568.80157762 530.72955671 571.31531491
 572.50655343 548.9909854  547.28175163 520.90684232 567.53932827]
Mean:  554.3810331061188
Standard Deviation:  17.73319786238745


In [21]:
from sklearn.model_selection import GridSearchCV

param_grid =[
              {'n_estimators': [3,10,30], 'max_features':[2,4,6,8]},
              {'bootstrap':[False], 'max_features':[2,3,4],'n_estimators':[3,10]}
]
forest_reg = RandomForestRegressor(random_state = 42)

grid_search = GridSearchCV(forest_reg,param_grid,cv = 5, scoring = 'neg_mean_squared_error',return_train_score = True)
grid_search.fit(diamond, diamond_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [22]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [23]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30, random_state=42)

In [24]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
  print(np.sqrt(-mean_score), params)

830.4861857690995 {'max_features': 2, 'n_estimators': 3}
712.5106275085241 {'max_features': 2, 'n_estimators': 10}
671.4995998324283 {'max_features': 2, 'n_estimators': 30}
698.9760454845787 {'max_features': 4, 'n_estimators': 3}
614.8209844947058 {'max_features': 4, 'n_estimators': 10}
593.389021593397 {'max_features': 4, 'n_estimators': 30}
658.1156995208488 {'max_features': 6, 'n_estimators': 3}
588.5853980646136 {'max_features': 6, 'n_estimators': 10}
570.6115276008732 {'max_features': 6, 'n_estimators': 30}
653.109073758558 {'max_features': 8, 'n_estimators': 3}
591.8720522819708 {'max_features': 8, 'n_estimators': 10}
572.810032867276 {'max_features': 8, 'n_estimators': 30}
791.7789206700502 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
691.1083601410246 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
715.4098898223756 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
626.4257802991623 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
6

In [25]:
tcat_col =list(test_set[['cut','color','clarity']])
tnum_col =list(test_set[['table','carat','depth','x','y','z']])


In [26]:
tnum_pipeline=Pipeline([("standardize", StandardScaler()),])
tcat_pipeline=Pipeline([('Ordinally',OrdinalEncoder()),])

In [27]:
tfull_pipeline=ColumnTransformer([
 ("num", num_pipeline, num_col),
 ("cat", cat_pipeline, cat_col),])

In [28]:
test_set=tfull_pipeline.fit_transform(test_set)

In [29]:
forest_reg.fit(diamond,diamond_labels)
predications=forest_reg.predict(test_set)

In [30]:
test_set=pd.DataFrame(test_set)

In [36]:

test=pd.read_csv('test_set.csv')
test

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,x,y,z
0,1,0.34,Ideal,G,VVS2,61.1,57.0,4.52,4.48,2.75
1,2,0.71,Premium,E,VS2,62.7,58.0,5.74,5.68,3.58
2,3,0.44,Very Good,I,VS1,62.8,56.0,4.83,4.88,3.05
3,4,0.81,Premium,E,SI2,60.1,59.0,6.09,6.03,3.65
4,5,0.40,Ideal,G,VVS1,61.2,56.0,4.74,4.80,2.92
...,...,...,...,...,...,...,...,...,...,...
10783,10784,0.57,Ideal,H,VS1,60.9,56.0,5.34,5.36,3.26
10784,10785,1.05,Ideal,G,VS2,60.8,57.0,6.65,6.58,4.02
10785,10786,0.71,Ideal,E,VVS1,62.3,55.0,5.68,5.72,3.55
10786,10787,1.11,Premium,E,SI2,61.0,60.0,6.68,6.66,4.07


In [38]:
submission=test[['Id']]
submission

Unnamed: 0,Id
0,1
1,2
2,3
3,4
4,5
...,...
10783,10784
10784,10785
10785,10786
10786,10787


In [40]:
submission['price']=predications

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['price']=predications


In [42]:
submission.to_csv('submission.csv',index=None)