In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline





In [None]:
# Print the full path of every file found under the Kaggle input directory.
import os
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Load the diamonds dataset from a path relative to the notebook's working directory.
data = pd.read_csv('../input/diamonds/diamonds.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Remove the 'Unnamed: 0' column. `errors='ignore'` keeps this cell re-runnable:
# a second execution no longer raises KeyError once the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')
# Assign descriptive names to all ten columns; the last three become
# length / width / height (presumably x / y / z in the raw file — confirm).
data.columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'length', 'width', 'height']
data.columns

In [None]:
# (rows, columns) of the frame at this point.
data.shape

In [None]:
# Preview the first rows again to check the current column names.
data.head()

Data Preprocessing

In [None]:

# The column names were already assigned right after the index column was
# dropped, so re-assigning the same list here was redundant; just display them.
data.columns

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
data.info()

In [None]:
# Descriptive statistics for every column; include='all' also covers the non-numeric ones.
data.describe(include='all')

In [None]:
# Discard every row whose length, width or height is exactly zero, one column at a time.
for dimension in ("length", "width", "height"):
    zero_rows = data.index[data[dimension] == 0]
    data = data.drop(zero_rows)
data.shape

In [None]:
# Make 'coolwarm' the default seaborn palette for the plots that follow.
sns.set_palette('coolwarm')

In [None]:
# Carat: histogram with KDE in the left panel, box plot in the right panel.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
hist_panel, box_panel = ax
carat_values = data['carat']
sns.histplot(carat_values, kde=True, ax=hist_panel, color='#21cc62')
sns.boxplot(y=carat_values, ax=box_panel, color='#21cc62')
hist_panel.set_title('Distribution of carat')
box_panel.set_title('Distribution of carat')
plt.show()

In [None]:
# One count plot per categorical column, each shown as its own figure.
for column, title in (
    ('cut', 'Distribution of cut'),
    ('color', 'Distribution of color'),
):
    fig = sns.countplot(x=data[column])
    plt.title(title)
    plt.show()

In [None]:
# Height: histogram with KDE in the left panel, box plot in the right panel.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
hist_panel, box_panel = ax
height_values = data['height']
sns.histplot(height_values, kde=True, ax=hist_panel, color='#b51cd4')
sns.boxplot(y=height_values, ax=box_panel, color='#b51cd4')
hist_panel.set_title('Distribution of height')
box_panel.set_title('Distribution of height')
plt.show()

In [None]:
# Length: histogram with KDE in the left panel, box plot in the right panel.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
hist_panel, box_panel = ax
length_values = data['length']
sns.histplot(length_values, kde=True, ax=hist_panel, color='#b51cd4')
sns.boxplot(y=length_values, ax=box_panel, color='#b51cd4')
hist_panel.set_title('Distribution of length')
box_panel.set_title('Distribution of length')
plt.show()

* There are clear outliers in the depth, table, length, width and height columns.

In [None]:
# Numeric columns that will be checked for outliers.
outlier_cols = ['depth', 'table', 'length', 'width', 'height']

In [None]:
# Remove IQR outliers column by column. Each column's bounds are computed on the
# rows that survived the previous columns' filtering, so the order of
# `outlier_cols` influences the result.
total_outliers = 0

for col in outlier_cols:
    print(f'\nColumn Name: {col}')

    # Series.quantile is used instead of np.percentile(..., interpolation=...),
    # whose `interpolation` keyword NumPy has deprecated in favour of `method`.
    Q1 = data[col].quantile(0.25, interpolation='midpoint')
    Q3 = data[col].quantile(0.75, interpolation='midpoint')
    IQR = Q3 - Q1

    # Label fixed: the second value printed is the third quartile, not Q2.
    print(f'Q1: {Q1} | Q3: {Q3} | IQR: {IQR}')

    upper_bound = Q3 + 1.5 * IQR
    lower_bound = Q1 - 1.5 * IQR
    print(f'upper bound: {upper_bound}')
    print(f'lower bound: {lower_bound}')

    # Rows on or beyond either bound are counted as outliers.
    upper = data[data[col] >= upper_bound]
    lower = data[data[col] <= lower_bound]

    no_of_outliers = len(upper) + len(lower)
    print(f"Outliers Count :  {no_of_outliers}\n")
    total_outliers += no_of_outliers

    # Keep only the rows strictly inside the bounds.
    data = data[(data[col] < upper_bound) & (data[col] > lower_bound)]

    print(f'{no_of_outliers} Outliers removed from {col} column.\n')

# Report the grand total once, after every column has been processed
# (previously this ran inside the loop and printed a running total each pass).
print(f'\n\nTotal outliers removed are {total_outliers}.')

In [None]:
# Shape of the data after the outlier-removal step.
data.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
import pickle

# Encode the three categorical columns as integers on a copy, leaving `data` untouched.
label_data = data.copy()

# Each fitted encoder is pickled so the same label -> integer mapping can be
# reloaded later. `with` guarantees the file is closed even if pickling raises.
cut_label_encoder = LabelEncoder()
label_data['cut'] = cut_label_encoder.fit_transform(label_data['cut'])
with open('cut_encoder.pkl', 'wb') as cut_encoder:
    pickle.dump(cut_label_encoder, cut_encoder)

color_label_encoder = LabelEncoder()
label_data['color'] = color_label_encoder.fit_transform(label_data['color'])
with open('color_encoder.pkl', 'wb') as color_encoder:
    pickle.dump(color_label_encoder, color_encoder)

clarity_label_encoder = LabelEncoder()
label_data['clarity'] = clarity_label_encoder.fit_transform(label_data['clarity'])
with open('clarity_encoder.pkl', 'wb') as clarity_encoder:
    pickle.dump(clarity_label_encoder, clarity_encoder)

label_data.head()


# Splitting the Data

In [None]:
# Target is the price column; every remaining column is a feature.
y = label_data['price']
X = label_data.drop(columns=['price'])

Linear Regression


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state is fixed at 0 for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# Registry of trained models and their scores: one independent list per field.
models = {field: [] for field in ('model_name', 'model', 'cv_score', 'accuracy')}


In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split and predict the held-out rows.
linear_regression = LinearRegression()
#training
linear_regression.fit(X_train, y_train)
#prediction
predict_y = linear_regression.predict(X_test)

# 15-fold cross-validation on the training split. The scorer returns negative
# RMSE, so the sign is flipped below to report a positive error.
cv_score = cross_val_score(linear_regression, X_train, y_train, scoring='neg_root_mean_squared_error', cv=15).mean()

# "accuracy" here is the R^2 score on the test split, expressed as a percentage.
accuracy = round(r2_score(y_test, predict_y)*100,2)

# Append exactly one entry to each list so the four lists in `models` stay
# index-aligned. The model was previously appended twice, which shifted every
# later model against its name when the lists are zipped together.
models['model_name'].append('Linear Regression')
models['model'].append(linear_regression)
models['cv_score'].append(-cv_score)
models['accuracy'].append(accuracy)

print('Model: Linear Regression')
print('Cross Validation Score: ', -cv_score)
print('Accuracy', accuracy)

Decision Tree Regressor 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree (and therefore the reported
# scores) reproducible across runs, matching the seeded train/test split.
decision_tree_regression = DecisionTreeRegressor(random_state=0)
decision_tree_regression.fit(X_train, y_train)

predict_y = decision_tree_regression.predict(X_test)
# 15-fold cross-validation on the training split; the scorer returns negative
# RMSE, so the sign is flipped when stored and printed.
cv_score = cross_val_score(decision_tree_regression, X_train, y_train, scoring='neg_root_mean_squared_error', cv=15).mean()
# "accuracy" here is the R^2 score on the test split, expressed as a percentage.
accuracy = round(r2_score(y_test, predict_y)*100,2)


models['model_name'].append('Decision Tree Regression')
models['model'].append(decision_tree_regression)
models['cv_score'].append(-cv_score)
models['accuracy'].append(accuracy)

# Spelling of the model name fixed in the printed header.
print('Model: Decision Tree Regression')
print('Cross Validation Score: ', -cv_score)
print('Accuracy', accuracy)


SVM

Support Vector Regressor 

In [None]:
from sklearn.svm import SVR

# Fit a default-parameter SVR and predict the held-out rows.
support_vector_regression = SVR()
support_vector_regression.fit(X_train, y_train)
predict_y = support_vector_regression.predict(X_test)

# 15-fold cross-validation on the training split; the scorer yields negative
# RMSE per fold, which is averaged here and sign-flipped when reported.
fold_scores = cross_val_score(
    support_vector_regression,
    X_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=15,
)
cv_score = fold_scores.mean()

# R^2 on the test split as a percentage (stored under the 'accuracy' key).
accuracy = round(100 * r2_score(y_test, predict_y), 2)

# Record this model's entry in the shared registry, one value per field.
svr_entry = {
    'model_name': 'Support Vector Regression',
    'model': support_vector_regression,
    'cv_score': -cv_score,
    'accuracy': accuracy,
}
for field, value in svr_entry.items():
    models[field].append(value)

print('Model: Support Vector Regression')
print('Cross Validation Score: ', -cv_score)
print('Accuracy', accuracy)


Gradient Boosting Regressor 

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

#model
# A fixed random_state makes the fitted ensemble (and therefore the reported
# scores) reproducible across runs, matching the seeded train/test split.
gradient_boosting_regression = GradientBoostingRegressor(random_state=0)
#training
gradient_boosting_regression.fit(X_train, y_train)
#prediction
predict_y = gradient_boosting_regression.predict(X_test)

# 15-fold cross-validation on the training split; the scorer returns negative
# RMSE, so the sign is flipped when stored and printed.
cv_score = cross_val_score(gradient_boosting_regression, X_train, y_train, scoring='neg_root_mean_squared_error', cv=15).mean()

# "accuracy" here is the R^2 score on the test split, expressed as a percentage.
accuracy = round(r2_score(y_test, predict_y)*100,2)


models['model_name'].append('Gradient Boosting Regression')
models['model'].append(gradient_boosting_regression)
models['cv_score'].append(-cv_score)
models['accuracy'].append(accuracy)

print('Model: Gradient Boosting Regression')
print('Cross Validation Score: ', -cv_score)
print('Accuracy', accuracy)



In [None]:
# Compare the models by the value stored under 'accuracy', i.e. the test-set
# R^2 score in percent. Title and axis labels let the figure stand on its own.
fig = plt.figure(figsize=(12,5))
plots = sns.barplot(y=models['model_name'], x=models['accuracy'])
plots.set(title='Model comparison on the test set', xlabel='R2 score (%)', ylabel='Model')
plt.show()

In [None]:
def predict_output(input_val):
    """Predict a price with every model stored in `models` and plot the results.

    Parameters
    ----------
    input_val : list
        One row of feature values. Positions 1, 2 and 3 hold the raw cut,
        color and clarity labels; they are encoded with the pickled label
        encoders before prediction. The caller's list is not modified.

    Returns
    -------
    None
        Each model's prediction is printed and all of them are drawn as a
        horizontal bar chart.
    """
    # Position in the feature row -> pickle file holding that column's encoder.
    # NOTE: pickle.load executes arbitrary code; these files are the ones this
    # notebook writes itself, so never point the paths at untrusted input.
    encoder_files = {1: 'cut_encoder.pkl', 2: 'color_encoder.pkl', 3: 'clarity_encoder.pkl'}

    # Work on a copy: encoding in place would overwrite the caller's labels
    # with integers, and a second call with the same list would then hand
    # those integers to the encoders as if they were labels.
    features = list(input_val)
    for position, path in encoder_files.items():
        with open(path, 'rb') as pkl_file:
            encoder = pickle.load(pkl_file)
        features[position] = encoder.transform([features[position]])[0]

    output_val = []
    for output_model_name, output_model in zip(models['model_name'], models['model']):
        output_prediction = round(output_model.predict([features])[0], 2)
        print(f'{output_model_name} : {output_prediction}')
        output_val.append(output_prediction)

    plt.figure(figsize=(12,5))
    sns.barplot(y=models['model_name'], x=output_val)

In [None]:
# Example feature row in the order: carat, cut, color, clarity, depth, table, length, width, height.
input_val = [0.5, 'Fair', 'D', 'IF', 62.9, 56.8, 7.8, 45.8, 26.8]
    
predict_output(input_val)
