In [4]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


df=sns.load_dataset('diamonds')
X=df.drop(columns=['price'])

Y=df['price']

numeric_features=['carat','depth','table','x','y','z']
categorical_features=['cut','color','clarity']

preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numeric_features),
        ('cat',OneHotEncoder(),categorical_features)
    ]
)


X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)


pipline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',LinearRegression())
])

pipline.fit(X_train,Y_train)

y_pred=pipline.predict(X_test)

print("Mean Absolute Error:",mean_absolute_error(Y_test,y_pred))
print("Mean Squared Error:",mean_squared_error(Y_test,y_pred))
print("Root Mean Squared Error:",np.sqrt(mean_squared_error(Y_test,y_pred)))
print("R2 Score:",r2_score(Y_test,y_pred))



Mean Absolute Error: 737.1513665933285
Mean Squared Error: 1288705.4778516756
Root Mean Squared Error: 1135.2116445190632
R2 Score: 0.9189331350419387
CPU times: total: 562 ms
Wall time: 699 ms


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [5]:
df['price'].describe()

count    53940.000000
mean      3932.799722
std       3989.439738
min        326.000000
25%        950.000000
50%       2401.000000
75%       5324.250000
max      18823.000000
Name: price, dtype: float64

### Improving the Model and Enhancing Visualization

To improve the model's performance and make it more robust, consider the following steps:

#### 1. **Feature Engineering**
    - **Interaction Features**: Create new features by combining existing ones (e.g., `carat * depth`).
    - **Polynomial Features**: Add polynomial terms to capture non-linear relationships.
    - **Log Transformation**: Apply log transformation to skewed features like `price` to normalize the data.

#### 2. **Hyperparameter Tuning**
    - Use techniques like Grid Search or Randomized Search to find the best hyperparameters for the model.
    - Experiment with different regression models (e.g., Ridge, Lasso, or Decision Trees).

#### 3. **Cross-Validation**
    - Implement k-fold cross-validation to ensure the model generalizes well to unseen data.

#### 4. **Handling Outliers**
    - Use visualization tools like boxplots to identify outliers and decide whether to remove or transform them.

#### 5. **Feature Selection**
    - Use techniques like Recursive Feature Elimination (RFE) to select the most important features.

#### 6. **Regularization**
    - Apply regularization techniques (e.g., L1 or L2) to prevent overfitting.

---

### Enhancing Visualization

To make the visualizations more appealing and informative, follow these tips:

#### 1. **Use Color Palettes**
    - Utilize Seaborn's built-in color palettes (e.g., `sns.color_palette("coolwarm")`) to make plots visually appealing.

#### 2. **Add Titles and Labels**
    - Ensure every plot has a descriptive title, axis labels, and a legend if applicable.

#### 3. **Annotations**
    - Annotate key points in the plot to highlight important insights.

#### 4. **Gridlines**
    - Add gridlines to improve readability.

#### 5. **Customizing Styles**
    - Use Seaborn themes like `sns.set_style("whitegrid")` or `sns.set_context("talk")` for a polished look.

#### Example Code for Beautiful Visualization:
```python
sns.set_style("whitegrid")
sns.set_palette("coolwarm")

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='carat', y='price', hue='cut', alpha=0.7)
plt.title("Carat vs Price by Cut", fontsize=16)
plt.xlabel("Carat", fontsize=12)
plt.ylabel("Price", fontsize=12)
plt.legend(title="Cut", fontsize=10)
plt.show()
```

In [2]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


df=sns.load_dataset('diamonds')
X=df.drop(columns=['price'])

Y=df['price']

numeric_features=['carat','depth','table','x','y','z']
categorical_features=['cut','color','clarity']

preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numeric_features),
        ('cat',OneHotEncoder(),categorical_features)
    ]
)


X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)


pipline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',DecisionTreeRegressor())
])

pipline.fit(X_train,Y_train)

y_pred=pipline.predict(X_test)

print("Mean Absolute Error:",mean_absolute_error(Y_test,y_pred))
print("Mean Squared Error:",mean_squared_error(Y_test,y_pred))
print("Root Mean Squared Error:",np.sqrt(mean_squared_error(Y_test,y_pred)))
print("R2 Score:",r2_score(Y_test,y_pred))



Mean Absolute Error: 358.96069707081944
Mean Squared Error: 547269.430524657
Root Mean Squared Error: 739.77660852764
R2 Score: 0.9655736568343168
CPU times: total: 656 ms
Wall time: 681 ms


In [None]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


df=sns.load_dataset('diamonds')
X=df.drop(columns=['price'])

Y=df['price']

numeric_features=['carat','depth','table','x','y','z']
categorical_features=['cut','color','clarity']

preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numeric_features),
        ('cat',OneHotEncoder(),categorical_features)
    ]
)


X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)


pipline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',RandomForestRegressor())
])

pipline.fit(X_train,Y_train)

y_pred=pipline.predict(X_test)

print("Mean Absolute Error:",mean_absolute_error(Y_test,y_pred))
print("Mean Squared Error:",mean_squared_error(Y_test,y_pred))
print("Root Mean Squared Error:",np.sqrt(mean_squared_error(Y_test,y_pred)))
print("R2 Score:",r2_score(Y_test,y_pred))



Mean Absolute Error: 270.3982772436746
Mean Squared Error: 303496.1328643975
Root Mean Squared Error: 550.9048310410769
R2 Score: 0.9809083763194466
CPU times: total: 30.3 s
Wall time: 34.9 s


## Decision Tree

- Mean Absolute Error: 358.96069707081944
- Mean Squared Error: 547269.430524657
- Root Mean Squared Error: 739.77660852764
- R2 Score: 0.9655736568343168
- CPU times: total: 656 ms
- Wall time: 681 ms

## Multinear Regression

- Mean Absolute Error: 737.1513665933285
- Mean Squared Error: 1288705.4778516756
- Root Mean Squared Error: 1135.2116445190632
- R2 Score: 0.9189331350419387
- CPU times: total: 562 ms
- Wall time: 699 ms

In [4]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


df=sns.load_dataset('diamonds')
X=df.drop(columns=['price'])

Y=df['price']

numeric_features=['carat','depth','table','x','y','z']
categorical_features=['cut','color','clarity']

preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numeric_features),
        ('cat',OneHotEncoder(),categorical_features)
    ]
)


X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)


pipline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('model',XGBRegressor())
])

pipline.fit(X_train,Y_train)

y_pred=pipline.predict(X_test)

print("Mean Absolute Error:",mean_absolute_error(Y_test,y_pred))
print("Mean Squared Error:",mean_squared_error(Y_test,y_pred))
print("Root Mean Squared Error:",np.sqrt(mean_squared_error(Y_test,y_pred)))
print("R2 Score:",r2_score(Y_test,y_pred))



Mean Absolute Error: 285.61346435546875
Mean Squared Error: 318286.3125
Root Mean Squared Error: 564.1686915276316
R2 Score: 0.9799779653549194
CPU times: total: 2.62 s
Wall time: 2.37 s
