In [None]:
# Importing the libraries

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import metrics
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 
# NOTE(review): the "ignore" action below is applied without a category or
# module filter, so it silences every warning for the rest of the session,
# including deprecation notices from the libraries used later — confirm this
# is intended.

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset
# (None removes the column limit pandas applies when rendering a DataFrame)

pd.set_option("display.max_columns", None)

In [None]:
# Plotly interfaces. NOTE(review): `px` and `go` are not referenced by any of
# the cells visible in this part of the notebook — confirm they are used later.
import plotly.express as px 
import plotly.graph_objects as go
import plotly.io as pio
# Last expression of the cell, so the notebook displays the templates object.
pio.templates

The problem that we are going to solve here is that given a set of features that describe a house in Boston, our machine learning model must predict the house price. To train our machine learning model with boston housing data, we will be using scikit-learn’s boston dataset.

In this dataset, each row describes a boston town or suburb. There are 506 rows and 13 attributes (features) with a target column (price).
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names

### The Boston Housing Dataset

The Boston Housing Dataset is derived from information collected by the U.S. Census Service concerning housing in the area of Boston, MA. The following describes the dataset columns:

CRIM per capita crime rate by town <br>
ZN proportion of residential land zoned for lots over 25,000 sq.ft. <br>
INDUS proportion of non-retail business acres per town <br>
CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) <br>
NOX nitric oxides concentration (parts per 10 million) <br>
RM average number of rooms per dwelling <br>
AGE proportion of owner-occupied units built prior to 1940 <br>
DIS weighted distances to five Boston employment centres <br>
RAD index of accessibility to radial highways <br>
TAX full-value property-tax rate per 10,000usd <br>
PTRATIO pupil-teacher ratio by town <br>
B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town <br>
LSTAT % lower status of the population <br>
MEDV - Median value of owner-occupied homes in $1000's

Each record in the database describes a Boston suburb or town.

In [None]:
# Load the raw housing file with the target under its original name (MEDV).
# The file has no header row, so the column labels are supplied explicitly.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
housing_check_path = r"C:\Users\sriha\OneDrive\Documents\Desktop\JOB\AI & ML\ML COURSE FOR BEGINNERS FREECODECAMP YT CHANNEL\Notebooks Used in the course\02 My Notebooks\housing.csv"

# Fields are separated by runs of whitespace rather than commas.
data_check = pd.read_csv(
    housing_check_path,
    header=None,
    delimiter=r"\s+",
    names=column_names,
)

data_check.head(5)

In [None]:
# Show five random rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
data_check.sample(5, random_state=42)

In [None]:
# Load the working copy of the dataset.
# The target column (median value of owner-occupied homes in $1000s) is
# labelled 'PRICE' here instead of its original name.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'PRICE',
]

# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
housing_path = r"C:\Users\sriha\01 ML Projects\Untitled Folder\housing.csv"

# Header-less, whitespace-delimited file.
data = pd.read_csv(
    housing_path,
    header=None,
    delimiter=r"\s+",
    names=column_names,
)

data.head(5)

In [None]:
# Keep an independent snapshot of the freshly loaded frame
# (deep=True is the default of DataFrame.copy, spelled out for clarity).
data_bk = data.copy(deep=True)

In [None]:
# Display the column labels of the loaded frame.
data.columns

In [None]:
# Display the dtype pandas assigned to each column.
data.dtypes

In [None]:
# Display (number of rows, number of columns).
data.shape

In [None]:
# Display the information of the dataset
# (info() prints its summary directly instead of returning a value)

data.info()

In [None]:
# Summary statistics, transposed so that each row describes one column.
data.describe().transpose()

# EDA

In [None]:
# Number of distinct values in each column of the dataset
data.nunique()

In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Show every row that has at least one missing value.
has_missing = data.isna().any(axis="columns")
data.loc[has_missing]

In [None]:
# Scatter-plot matrix over the columns of `data`; each panel is 2.5 inches tall.
pair_grid = sns.pairplot(data=data, height=2.5)
# Re-space the panels of the current figure.
plt.tight_layout()


1. **Seaborn Pair Plot:**
   ```python
   sns.pairplot(data, height=2.5)
   ```
   - `sns.pairplot()` is a Seaborn function that creates a grid of scatterplots for all pairs of numerical columns in the DataFrame `data`.
   - The `height=2.5` parameter adjusts the height of each subplot in the grid.

2. **Matplotlib Tight Layout:**
   ```python
   plt.tight_layout()
   ```
   - `plt.tight_layout()` is a Matplotlib function that adjusts the spacing between subplots to improve the layout.

In summary, this code generates a pair plot, which is a grid of scatterplots showing the relationships between pairs of numerical variables in the dataset. Each subplot in the grid represents the relationship between two variables, and the diagonal subplots display histograms for each individual variable.

In [None]:
# Distribution plot of the target column; the trailing ';' hides the Axes repr.
# NOTE(review): sns.distplot is, as far as I know, deprecated in newer seaborn
# releases (histplot/displot are the successors) — confirm the installed
# version still provides it.
sns.distplot(data['PRICE']);

1. **Seaborn Distribution Plot:**
   ```python
   sns.distplot(data['PRICE'])
   ```
   - `sns.distplot()` is a Seaborn function that combines a histogram with a kernel density estimate. It visualizes the distribution of a single variable.
   - `data['PRICE']` selects the 'PRICE' column from the DataFrame `data`.

The resulting plot provides a visual representation of the distribution of sale prices in the dataset. The histogram illustrates the frequency or density of different sale price ranges, and the kernel density estimate provides a smoothed representation of the distribution.

In [None]:
# Shape statistics of the target column, computed first and then reported.
price_skewness = data['PRICE'].skew()
price_kurtosis = data['PRICE'].kurt()

print("Skewness: %f" % price_skewness)
print("Kurtosis: %f" % price_kurtosis)


1. **Skewness Calculation:**
   ```python
   print("Skewness: %f" % data['PRICE'].skew())
   ```
   - The `data['PRICE'].skew()` method calculates the skewness of the 'PRICE' column. Skewness measures the asymmetry of the distribution of values. 
   - The result is then printed using the `print` statement.

2. **Kurtosis Calculation:**
   ```python
   print("Kurtosis: %f" % data['PRICE'].kurt())
   ```
   - The `data['PRICE'].kurt()` method calculates the kurtosis of the 'PRICE' column. Kurtosis measures the "tailedness" of the distribution, indicating whether the data has heavy tails or is more peaked than a normal distribution.
   - The result is then printed using the `print` statement.

In summary, these lines of code provide insights into the shape of the distribution of PRICE. A skewness close to zero suggests a relatively symmetric distribution, while positive or negative skewness indicates skew to the right or left, respectively. Note that pandas' `.kurt()` reports *excess* kurtosis (Fisher's definition), for which a normal distribution scores 0 rather than 3 – positive values indicate heavier tails than a normal distribution, and negative values indicate lighter tails.


3. **Skewness:**
   - Skewness is a measure of the asymmetry of a distribution. 
   - If the skewness is close to 0, it indicates that the distribution is approximately symmetric.
   - A positive skewness (greater than 0) suggests that the distribution has a longer right tail, meaning it is skewed to the right.
   - A negative skewness (less than 0) suggests that the distribution has a longer left tail, meaning it is skewed to the left.

4. **Kurtosis:**
   - Kurtosis measures the tails and the peakedness of a distribution.
   - Under Pearson's definition, a kurtosis value of 3 is considered normal (mesokurtic) and is the kurtosis of a normal distribution. The value returned by pandas' `.kurt()` is the excess kurtosis (kurtosis minus 3), so a normal distribution scores 0 there.
   - Kurtosis greater than 3 (positive excess kurtosis) indicates heavier tails and a more peaked distribution (leptokurtic).
   - Kurtosis less than 3 (negative excess kurtosis) indicates lighter tails and a flatter distribution (platykurtic).

Interpreting the results:
- If skewness is close to 0 and the excess kurtosis printed by `.kurt()` is close to 0 (i.e. kurtosis close to 3), the distribution of 'PRICE' is approximately normal.
- Positive skewness suggests a long right tail: most houses sit at lower prices, with a smaller number of unusually expensive ones.
- Positive excess kurtosis might suggest that the tails of the distribution are heavier, indicating more extreme values.



In [None]:
# PRICE against CRIM, drawn and labelled through the Axes object.
fig, ax = plt.subplots()
ax.scatter(data['CRIM'], data['PRICE'])
ax.set_xlabel('CRIM', fontsize=13)
ax.set_ylabel('PRICE', fontsize=13)
plt.show()

In [None]:
# PRICE against AGE.
fig, ax = plt.subplots()
ax.scatter(x = data['AGE'], y = data['PRICE'])
plt.ylabel('PRICE', fontsize=13)
# The x-axis shows AGE; it was previously labelled 'CRIM' (copy-paste slip).
plt.xlabel('AGE', fontsize=13)
plt.show()

In [None]:
from scipy import stats
from scipy.stats import norm, skew #for some statistics

# Distribution plot of PRICE with a fitted normal curve overlaid.
sns.distplot(data['PRICE'] , fit=norm);

# Parameters of the normal distribution fitted to PRICE.
(mu, sigma) = norm.fit(data['PRICE'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: `\m` and `\s` are not valid Python escape sequences, so the
# mathtext backslashes must not go through normal string escaping.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('PRICE distribution')

#Get also the QQ-plot
# A new figure keeps the probability plot separate from the plot above.
fig = plt.figure()
res = stats.probplot(data['PRICE'], plot=plt)
plt.show()

This code performs statistical analysis and generates visualizations to examine the distribution of the 'PRICE' column in the Pandas DataFrame named `data`.

1. **Importing Libraries:**
   ```python
   from scipy import stats
   from scipy.stats import norm, skew
   ```
   - This imports the necessary functions from the SciPy library, including statistical tools (`stats`), normal distribution (`norm`), and skewness (`skew`).

2. **Distribution Plot with Fitted Normal Distribution:**
   ```python
   sns.distplot(data['PRICE'], fit=norm);
   ```
   - The Seaborn `distplot` function is used to create a histogram of the 'PRICE' distribution.
   - The `fit=norm` parameter fits a normal distribution to the data and overlays it on the histogram.

3. **Calculating Mean and Standard Deviation:**
   ```python
   (mu, sigma) = norm.fit(data['PRICE'])
   print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
   ```
   - The mean (`mu`) and standard deviation (`sigma`) of the 'PRICE' distribution are calculated using the `norm.fit` function.
   - These values are then printed to the console.

4. **Legend and Plot Customization:**
   ```python
   plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
   plt.ylabel('Frequency')
   plt.title('PRICE distribution')
   ```
   - A legend is added to the plot, indicating the parameters of the fitted normal distribution.
   - The y-axis is labeled as 'Frequency,' and the title of the plot is set to 'PRICE distribution.'

5. **QQ-Plot (Quantile-Quantile Plot):**
   ```python
   fig = plt.figure()
   res = stats.probplot(data['PRICE'], plot=plt)
   plt.show()
   ```
   - A Quantile-Quantile plot (QQ-plot) is created using the `stats.probplot` function from SciPy.
   - The QQ-plot compares the quantiles of the 'PRICE' distribution to the quantiles of a theoretical normal distribution.

In summary, this code aims to analyze and visualize the distribution of 'PRICE,' checking whether it follows a normal distribution. The histogram with a fitted normal distribution provides a visual comparison, and the QQ-plot further assesses the normality assumption. The mean and standard deviation are also calculated and displayed.



6. **Histogram with Fitted Normal Distribution:**
   - The `sns.distplot()` function generates a histogram of the 'PRICE' distribution. The `fit=norm` parameter overlays a fitted normal distribution on the histogram.
   - This visualization helps in assessing how closely the actual distribution aligns with a normal distribution.

7. **Mean and Standard Deviation Calculation:**
   - The mean (`mu`) and standard deviation (`sigma`) of the 'PRICE' distribution are calculated using the `norm.fit` function.
   - These statistical measures provide key summary statistics for understanding the central tendency and spread of the data.

8. **Legend and Plot Customization:**
   - A legend is added to the plot, providing information about the parameters of the fitted normal distribution (mean and standard deviation).
   - The y-axis is labeled as 'Frequency,' and the title of the plot is set to 'PRICE distribution.'
   - These elements enhance the interpretability of the plot.

9. **Quantile-Quantile (QQ) Plot:**
   - The QQ-plot is created using the `stats.probplot` function. It compares the quantiles of the observed 'PRICE' distribution to the quantiles of a theoretical normal distribution.
   - A straight line in the QQ-plot suggests that the data follows a normal distribution. Deviations from the line indicate departures from normality.

The combined use of the histogram, fitted normal distribution, and QQ-plot allows for a comprehensive examination of the 'PRICE' distribution. Deviations from normality might suggest the need for data transformation or consideration of alternative statistical approaches.

Performing statistical analysis and visualizations on the target variable, such as 'PRICE' in this case, is a crucial step in the machine learning (ML) process.

1. **Understanding Data Distribution:**
   - Analyzing the distribution of the target variable helps you understand its underlying patterns and characteristics. This understanding is essential for making informed decisions throughout the ML process.

2. **Normality Assumption:**
   - Many machine learning algorithms assume that the target variable follows a normal distribution. By visualizing the distribution and comparing it to a normal distribution, you can assess whether this assumption holds.

3. **Identifying Skewness:**
   - Skewness, a measure of asymmetry in the distribution, can impact the performance of certain algorithms. Identifying and addressing skewness (if present) through transformations or other techniques can improve model accuracy.

4. **Outlier Detection:**
   - Visualizations, such as the QQ-plot, help in identifying outliers in the target variable. Outliers can have a significant impact on the model, and their detection allows for consideration of appropriate handling strategies.

5. **Feature Engineering:**
   - Understanding the statistical properties of the target variable may guide feature engineering decisions. For example, transformations like log transformations might be applied to achieve a more symmetric distribution.

6. **Model Performance:**
   - The distribution and statistical properties of the target variable can influence the choice of appropriate modeling techniques. Some algorithms work well with normally distributed data, while others are more robust to deviations from normality.

7. **Interpretability and Communication:**
   - Visualizations, such as the histogram and QQ-plot, provide interpretable insights into the target variable's behavior. Communicating these insights to stakeholders is crucial for collaborative decision-making.

8. **Data Preprocessing Decisions:**
   - Findings from the analysis may drive preprocessing decisions, such as handling missing values, imputing outliers, or selecting appropriate transformation techniques.

In summary, the analysis and visualizations performed on the target variable contribute to making informed decisions at various stages of the ML process. They guide preprocessing steps, model selection, and help ensure that the chosen algorithms align with the characteristics of the data. This, in turn, contributes to the development of accurate and robust machine learning models.

In [None]:
# Log-transform the target. The transform is computed from the untouched
# backup frame (`data_bk`) rather than from `data` itself, so re-running this
# cell gives the same result instead of applying log1p a second time.
data["PRICE"] = np.log1p(data_bk["PRICE"])

# Distribution plot of the transformed PRICE with a fitted normal curve.
sns.distplot(data['PRICE'] , fit=norm);

# Parameters of the normal distribution fitted to the transformed PRICE.
(mu, sigma) = norm.fit(data['PRICE'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: `\m` and `\s` are not valid Python escape sequences, so the
# mathtext backslashes must not go through normal string escaping.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('PRICE distribution')

# Probability (QQ) plot of the transformed PRICE on its own figure.
fig = plt.figure()
res = stats.probplot(data['PRICE'], plot=plt)
plt.show()

This code performs a log transformation on the 'PRICE' column in the Pandas DataFrame named `data` and then visualizes the transformed distribution through statistical analysis.

1. **Log Transformation:**
   ```python
   data["PRICE"] = np.log1p(data["PRICE"])
   ```
   - This line applies a log transformation to the 'PRICE' column using `np.log1p`, which computes log(1 + x). Log transformations are often used to address skewness in the data and stabilize variances.

2. **Distribution Plot with Fitted Normal Distribution (After Transformation):**
   ```python
   sns.distplot(data['PRICE'], fit=norm);
   ```
   - The Seaborn `distplot` function creates a histogram of the log-transformed 'PRICE' distribution.
   - The `fit=norm` parameter fits a normal distribution to the transformed data and overlays it on the histogram.

3. **Calculating Mean and Standard Deviation (After Transformation):**
   ```python
   (mu, sigma) = norm.fit(data['PRICE'])
   print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
   ```
   - The mean (`mu`) and standard deviation (`sigma`) of the log-transformed 'PRICE' distribution are calculated using the `norm.fit` function.
   - These values are then printed to the console.

4. **Legend and Plot Customization (After Transformation):**
   ```python
   plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
   plt.ylabel('Frequency')
   plt.title('PRICE distribution')
   ```
   - A legend is added to the plot, indicating the parameters of the fitted normal distribution for the log-transformed data.
   - The y-axis is labeled as 'Frequency,' and the title of the plot is set to 'PRICE distribution.'

5. **QQ-Plot (Quantile-Quantile Plot) After Transformation:**
   ```python
   fig = plt.figure()
   res = stats.probplot(data['PRICE'], plot=plt)
   plt.show()
   ```
   - A QQ-plot is created using the `stats.probplot` function for the log-transformed 'PRICE' distribution.
   - This plot assesses how well the transformed data aligns with a theoretical normal distribution.

In summary, this code performs a log transformation on the 'PRICE' column and then visualizes the distribution of the transformed data. The log transformation is applied to address skewness, and the subsequent analysis checks for improvements in normality and provides insights into the statistical properties of the transformed variable.

The log transformation applied to the 'PRICE' column in the machine learning (ML) process serves several purposes and can bring benefits to the analysis:

1. **Skewness Correction:**
   - The log transformation is often used to mitigate skewness in the distribution of a variable. Skewed distributions can negatively impact the performance of some machine learning algorithms that assume normality or work better with symmetric data. By applying the log transformation, the distribution becomes more symmetrical.

2. **Homoscedasticity Improvement:**
   - Homoscedasticity, which refers to constant variance across the range of the target variable, is an assumption in many regression models. The log transformation can stabilize the variance, particularly when the variance of the variable increases with its level. This can lead to more consistent model performance.

3. **Model Sensitivity Reduction:**
   - Some machine learning models, such as linear regression, are sensitive to the scale and distribution of the target variable. Transformations like the log can reduce the impact of extreme values and outliers, making the model more robust.

4. **Improving Linearity:**
   - Linear models assume a linear relationship between predictors and the target variable. The log transformation can help in achieving a more linear relationship, especially when the target variable exhibits exponential growth.

5. **Handling Multiplicative Effects:**
   - In certain situations where the relationship between predictors and the target variable is multiplicative rather than additive, the log transformation can convert the multiplicative relationship into an additive one, making it more suitable for linear models.

6. **Interpretability Enhancement:**
   - Log transformations can improve the interpretability of the model coefficients. For example, in the context of house prices, a log transformation may correspond to a percentage change in price, which can be more interpretable than a raw price change.

7. **Normality Assumption:**
   - Some algorithms assume that the target variable follows a normal distribution. While the log transformation doesn't guarantee normality, it often helps in making the distribution more normal or approximately normal.

It's important to note that the decision to perform a log transformation depends on the characteristics of the data and the specific requirements of the modeling task. Experimentation and validation are key to determining whether such transformations contribute to the overall improvement of the machine learning model.



# Data Correlation

In [None]:
# Pairwise correlation matrix of all columns (Pearson is pandas' default
# method, named explicitly here); the cell displays the matrix dimensions.
corr = data.corr(method="pearson")
corr.shape

In [None]:
# Annotated heatmap of the correlation matrix on a 10x10 inch figure,
# drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, cmap=plt.cm.PuBuGn, ax=ax)
plt.show()

# Reference - 1

The provided code generates a heatmap of the correlation between features in the Pandas DataFrame named `data` using the Seaborn and Matplotlib libraries.

```python
# Plotting the heatmap of correlation between features
plt.figure(figsize=(20,20))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='Greens')
```

- This code creates a heatmap using Seaborn's `heatmap` function. Here's a breakdown of the parameters:

  - `plt.figure(figsize=(20,20))`: Sets the size of the Matplotlib figure to 20x20 inches.

  - `sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='Greens')`:
    - `corr`: The correlation matrix calculated earlier.
    - `cbar=True`: Displays the colorbar on the side of the heatmap.
    - `square=True`: Ensures that the heatmap is square-shaped.
    - `fmt='.1f'`: Formats the numbers in the heatmap to have one decimal place.
    - `annot=True`: Displays the correlation values in each cell of the heatmap.
    - `annot_kws={'size':15}`: Adjusts the font size of the annotation text to 15.
    - `cmap='Greens'`: Specifies the color map to be used for the heatmap (in this case, shades of green).

The resulting heatmap visually represents the correlation matrix, where each cell's color intensity corresponds to the strength and direction of the correlation between the corresponding pair of features. This visualization is useful for identifying patterns and relationships within the dataset.

The heatmap of correlation between features plays a significant role in the machine learning (ML) process, especially during the exploratory data analysis (EDA) phase and feature selection. Here's how it is relevant:

1. **Feature Relationships:**
   - The heatmap visually represents the correlation between different features in the dataset. It helps identify which features have strong positive or negative correlations, providing insights into potential relationships between variables.

2. **Multicollinearity Detection:**
   - High correlations between features may indicate multicollinearity, where two or more features are highly correlated with each other. Multicollinearity can affect the performance of certain ML models, especially linear regression, as it assumes independence between features.

3. **Feature Selection:**
   - Understanding feature correlations is crucial for feature selection. If two features are highly correlated, one of them may be redundant, and removing one can simplify the model without sacrificing much information. This is especially relevant in cases where having too many features can lead to overfitting.

4. **Model Performance:**
   - Correlation analysis can provide insights into which features might be more influential in predicting the target variable. ML models benefit from relevant features that are not highly correlated with each other, leading to better generalization on new data.

5. **Visualization for Interpretability:**
   - Heatmaps offer an intuitive and visual representation of correlations, making it easier for analysts, data scientists, and stakeholders to interpret the relationships within the dataset.

6. **Identifying Patterns:**
   - Patterns in the correlation matrix can reveal interesting insights. For example, a strong negative correlation between two features might indicate an inverse relationship, providing valuable information for understanding the data.

7. **Preprocessing Decisions:**
   - Correlation analysis can influence preprocessing decisions. For instance, if there's a high correlation between two features, you might choose to keep only one of them to simplify the model and reduce the risk of overfitting.

In summary, the heatmap of correlation is a valuable tool in the ML process for understanding feature relationships, detecting multicollinearity, aiding in feature selection, and making informed decisions during data preprocessing. It contributes to building more effective and interpretable machine learning models.


# Reference-2

```python
import matplotlib.pyplot as plt
import numpy as np

cmaps = [('Perceptually Uniform Sequential', [
            'viridis', 'plasma', 'inferno', 'magma', 'cividis']),
         ('Sequential', [
            'Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
            'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
            'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn']),
         ('Sequential (2)', [
            'binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',
            'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',
            'hot', 'afmhot', 'gist_heat', 'copper']),
         ('Diverging', [
            'PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',
            'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic']),
         ('Cyclic', ['twilight', 'twilight_shifted', 'hsv']),
         ('Qualitative', [
            'Pastel1', 'Pastel2', 'Paired', 'Accent',
            'Dark2', 'Set1', 'Set2', 'Set3',
            'tab10', 'tab20', 'tab20b', 'tab20c']),
         ('Miscellaneous', [
            'flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',
            'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg',
            'gist_rainbow', 'rainbow', 'jet', 'turbo', 'nipy_spectral',
            'gist_ncar'])]

gradient = np.linspace(0, 1, 256)
gradient = np.vstack((gradient, gradient))


def plot_color_gradients(cmap_category, cmap_list):
    # Create figure and adjust figure height to number of colormaps
    nrows = len(cmap_list)
    figh = 0.35 + 0.15 + (nrows + (nrows-1)*0.1)*0.22
    fig, axs = plt.subplots(nrows=nrows, figsize=(6.4, figh))
    fig.subplots_adjust(top=1-.35/figh, bottom=.15/figh, left=0.2, right=0.99)

    axs[0].set_title(f"{cmap_category} colormaps", fontsize=14)

    for ax, cmap_name in zip(axs, cmap_list):
        ax.imshow(gradient, aspect='auto', cmap=cmap_name)
        ax.text(-.01, .5, cmap_name, va='center', ha='right', fontsize=10,
                transform=ax.transAxes)

    # Turn off *all* ticks & spines, not just the ones with colormaps.
    for ax in axs:
        ax.set_axis_off()


for cmap_category, cmap_list in cmaps:
    plot_color_gradients(cmap_category, cmap_list)
```

In [None]:

# Colormap names grouped by family: a list of (family label, [names]) pairs.
cmaps = [
    ('Perceptually Uniform Sequential',
     ['viridis', 'plasma', 'inferno', 'magma', 'cividis']),
    ('Sequential',
     ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
      'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
      'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn']),
    ('Sequential (2)',
     ['binary', 'gist_yarg', 'gist_gray', 'gray', 'bone', 'pink',
      'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia',
      'hot', 'afmhot', 'gist_heat', 'copper']),
    ('Diverging',
     ['PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',
      'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic']),
    ('Cyclic',
     ['twilight', 'twilight_shifted', 'hsv']),
    ('Qualitative',
     ['Pastel1', 'Pastel2', 'Paired', 'Accent',
      'Dark2', 'Set1', 'Set2', 'Set3',
      'tab10', 'tab20', 'tab20b', 'tab20c']),
    ('Miscellaneous',
     ['flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',
      'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg',
      'gist_rainbow', 'rainbow', 'jet', 'turbo', 'nipy_spectral',
      'gist_ncar']),
]

# Two identical rows of 256 evenly spaced values from 0 to 1: the image that
# is rendered once per colormap.
gradient = np.tile(np.linspace(0, 1, 256), (2, 1))


def plot_color_gradients(cmap_category, cmap_list):
    """Draw one horizontal gradient strip per colormap in ``cmap_list``.

    Parameters
    ----------
    cmap_category : str
        Label used in the figure title ("<category> colormaps").
    cmap_list : list of str
        Colormap names to render, one row each. An empty list draws nothing.

    Notes
    -----
    Reads the module-level ``gradient`` array as the image shown in each row.
    """
    nrows = len(cmap_list)
    if nrows == 0:
        # plt.subplots cannot build a zero-row grid; nothing to draw.
        return

    # Create figure and adjust figure height to number of colormaps
    figh = 0.35 + 0.15 + (nrows + (nrows-1)*0.1)*0.22
    # squeeze=False always returns a 2-D array of Axes, so a category with a
    # single colormap no longer fails on `axs[0]` / iteration.
    fig, axs = plt.subplots(nrows=nrows, figsize=(6.4, figh), squeeze=False)
    axs = axs.ravel()
    fig.subplots_adjust(top=1-.35/figh, bottom=.15/figh, left=0.2, right=0.99)

    axs[0].set_title(f"{cmap_category} colormaps", fontsize=14)

    for ax, cmap_name in zip(axs, cmap_list):
        ax.imshow(gradient, aspect='auto', cmap=cmap_name)
        # Colormap name, right-aligned just left of the strip (axes coordinates).
        ax.text(-.01, .5, cmap_name, va='center', ha='right', fontsize=10,
                transform=ax.transAxes)

    # Turn off *all* ticks & spines, not just the ones with colormaps.
    for ax in axs:
        ax.set_axis_off()


# One figure per colormap family.
for cmap_category, cmap_list in cmaps:
    plot_color_gradients(cmap_category, cmap_list)


In [None]:
cor_target = abs(corr["PRICE"]) # absolute value of the correlation 

# Keep the features whose |correlation| with PRICE exceeds 0.2
# (a permissive cut-off: it drops only the weakest relationships).
relevant_features = cor_target[cor_target>0.2]

# Feature names are the index labels of the filtered Series.
# (Series.iteritems() was removed in pandas 2.0, so the index is read directly.)
names = relevant_features.index.tolist()

names.remove('PRICE') # removing target feature 

print(names) # printing the features 
print(len(names))

This code performs feature selection based on the absolute correlation coefficients between the features and the target variable ('PRICE').

1. **Calculating Absolute Correlation:**
   ```python
   cor_target = abs(corr["PRICE"])
   ```
   - This line calculates the absolute correlation coefficients between each feature and the target variable ('PRICE').

2. **Selecting Highly Correlated Features:**
   ```python
   relevant_features = cor_target[cor_target > 0.2]
   ```
   - The code selects features that have an absolute correlation coefficient greater than 0.2 with the target variable. The threshold of 0.2 is chosen to identify features that have a relatively strong correlation with the target.

3. **Getting Feature Names:**
   ```python
   names = [index for index, value in relevant_features.iteritems()]
   ```
   - This line extracts the names of the features that meet the correlation threshold. It uses a list comprehension to iterate over the items in the `relevant_features` series and retrieves the feature names.

4. **Removing Target Feature:**
   ```python
   names.remove('PRICE')
   ```
   - The code removes the target feature ('PRICE') from the list of selected feature names since the target itself is not considered as a predictor.

5. **Printing Selected Features and Count:**
   ```python
   print(names)
   print(len(names))
   ```
   - Finally, the code prints the names of the selected features and the count of features selected based on the correlation threshold.

In summary, this code is a feature selection step that identifies features with a relatively strong absolute correlation with the target variable ('PRICE'). The selected features are printed, and the count of selected features is also displayed.

# Model Building

In [None]:
# Separate the target column from the predictors.

TargetVar = 'PRICE'

# Every column except the target, in the frame's original column order.
IndepVar = [col for col in data.columns if col != 'PRICE']

x = data[IndepVar]
y = data[TargetVar]

In [None]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

# Shapes of the four resulting pieces, shown as the cell output.

(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# Linear Regression

In [None]:
# Load the table that collects the training-set scores of each model.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.

results_train_path = r"C:\Users\sriha\OneDrive\Documents\Desktop\JOB\AI & ML\ML COURSE FOR BEGINNERS FREECODECAMP YT CHANNEL\Notebooks Used in the course\02 My Notebooks\RGRResults_train.csv"

# The first line of the file holds the column names.
RGRResults_train = pd.read_csv(results_train_path, header=0)

RGRResults_train.head()

# For train dataset

In [None]:
# Multiple linear regression: fit on the training split and score it on the
# same split.

from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Create and train the estimator.
ModelMLR = LinearRegression()
ModelMLR.fit(x_train, y_train)

# In-sample predictions (the model is evaluated on the data it was fitted on).
y_pred = ModelMLR.predict(x_train)

# Compute each score once, then report it.
train_mae = metrics.mean_absolute_error(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = metrics.r2_score(y_train, y_pred)

print('Mean Absolute Error (MAE):', round(train_mae, 3))
print('Mean Squared Error (MSE):', round(train_mse, 3))
print('Root Mean Squared Error (RMSE):', round(train_rmse, 3))
print('R2_score:', round(train_r2, 6))
#print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_train, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_train, y_pred): 
    y_train, y_pred = np.array(y_train), np.array(y_pred)
    return np.mean(np.abs((y_train - y_pred) / y_train)) * 100

# Evaluation of MAPE 

result = MAPE(y_train, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_train, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)
print('------------------------------------------------------------------------------------------------------------')
#-------------------------------------------------------------------------------------------
new_row = {'Model Name' : ModelMLR,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_train, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_train, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_train, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_train, y_pred))),
               'R2_score' : metrics.r2_score(y_train, y_pred)}
RGRResults_train = RGRResults_train.append(new_row, ignore_index=True)
#-------------------------------------------------------------------------------------------

In [None]:
# Show the train-set results table accumulated so far
RGRResults_train

In [None]:
# Put each actual training price next to the price predicted for it
Results = pd.DataFrame({'PRICE_A': y_train, 'PRICE_P': y_pred})

# Attach the actual/predicted pair to the original rows, matching on the row index

ResultsFinal = pd.merge(data, Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

In [None]:
# Signed percentage error of every prediction, relative to the actual price

actual_price = ResultsFinal['PRICE_A']
predicted_price = ResultsFinal['PRICE_P']
ResultsFinal['%Error'] = ((actual_price - predicted_price) / actual_price * 100).round(3)

In [None]:
# Display the first 10 rows of the merged results (original columns plus
# PRICE_A, PRICE_P and %Error)

ResultsFinal.head(10)

In [None]:
# Actual vs predicted prices on the training data
fig, ax = plt.subplots()
ax.scatter(y_train, y_pred)
ax.set(xlabel="Prices", ylabel="Predicted prices", title="Prices vs Predicted prices")
plt.show()

In [None]:
# Residuals (actual - predicted) plotted against the predicted value
fig, ax = plt.subplots()
ax.scatter(y_pred, y_train - y_pred)
ax.set(title="Predicted vs residuals", xlabel="Predicted", ylabel="Residuals")
plt.show()

In [None]:
# Checking Normality of errors
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay replaces it.
# histplot draws counts by default, which is what the "Frequency" label describes.
residuals = y_train - y_pred
sns.histplot(residuals, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Build the Regression / Regressor models and score each one on the training data


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()
ModelBRR = BayesianRidge()

ModelSVR = SVR()
modelXGR = xgb.XGBRegressor()
ModelKNN = KNeighborsRegressor(n_neighbors=5)
# Second name for the same BayesianRidge model: the name stays available, but
# the model is listed only once below so it does not produce a duplicate row.
modelBRR = ModelBRR
modelBGR = BaggingRegressor()
modelGBR = GradientBoostingRegressor(loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)


# Evaluation matrix for all the algorithms

MM = [ModelDCR, ModelRFR, ModelETR, ModelBRR, ModelSVR, modelXGR, ModelKNN, modelBGR, modelGBR]

# Define the function to calculate the MAPE - Mean Absolute Percentage Error
# (defined once here instead of on every loop iteration)

def MAPE (y_train, y_pred):
    """Return the mean absolute percentage error of `y_pred`, in percent.

    Both arguments are converted to NumPy arrays and compared element-wise;
    each absolute error is divided by the matching value of `y_train`, so a
    zero in `y_train` leads to a division by zero.
    """
    y_train, y_pred = np.array(y_train), np.array(y_pred)
    return np.mean(np.abs((y_train - y_pred) / y_train)) * 100

# Adjusted R squared needs the size of the data that is actually scored
# (the training rows), not the size of the full dataset.
n_samples, n_features = x_train.shape

# One metrics row per model; all rows are added to the results table after the loop
train_rows = []

for models in MM:

    # Fit the model with train data

    models.fit(x_train, y_train)

    # Predict the model with train dataset

    y_pred = models.predict(x_train)

    # Print the model name

    print('Model Name: ', models)

    # Evaluation metrics for Regression analysis (`metrics` is imported in the first cell)

    mae = metrics.mean_absolute_error(y_train, y_pred)
    mse = metrics.mean_squared_error(y_train, y_pred)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_train, y_pred)

    print('Mean Absolute Error (MAE):', round(mae,3))
    print('Mean Squared Error (MSE):', round(mse,3))
    print('Root Mean Squared Error (RMSE):', round(rmse,3))
    print('R2_score:', round(r2,6))

    # Evaluation of MAPE

    result = MAPE(y_train, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

    # Calculate Adjusted R squared values

    r_squared = round(r2,6)
    adjusted_r_squared = round(1 - (1-r_squared)*(n_samples-1)/(n_samples-n_features-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    # NOTE(review): the RMSLE entry stores log(RMSE), which is not the usual
    # definition of root mean squared log error - confirm this is intended.
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : mae,
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : rmse,
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : mse,
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(rmse),
               'R2_score' : r2}
    train_rows.append(new_row)
    #-------------------------------------------------------------------------------------------

# DataFrame.append was removed in pandas 2.0; add all collected rows with a single concat
RGRResults_train = pd.concat([RGRResults_train, pd.DataFrame(train_rows)], ignore_index=True)

In [None]:
# Show the train-set results table, now including the models from the loop above
RGRResults_train

# For Test Data

In [None]:
# Load the table that accumulates the test-set metrics of every model

from pathlib import Path

# Columns of one result row; they match the keys of the rows added to this
# table by the modelling cells further down.
_test_result_columns = ['Model Name',
                        'Mean_Absolute_Error_MAE',
                        'Adj_R_Square',
                        'Root_Mean_Squared_Error_RMSE',
                        'Mean_Absolute_Percentage_Error_MAPE',
                        'Mean_Squared_Error_MSE',
                        'Root_Mean_Squared_Log_Error_RMSLE',
                        'R2_score']

_test_results_path = Path(r"C:\Users\sriha\OneDrive\Documents\Desktop\JOB\AI & ML\ML COURSE FOR BEGINNERS FREECODECAMP YT CHANNEL\Notebooks Used in the course\02 My Notebooks\RGRResults_test.csv")

# The path above only exists on the original author's machine. Use the saved
# table when it is there; otherwise start from an empty table with the same
# columns so the notebook still runs end to end elsewhere.
if _test_results_path.is_file():
    RGRResults_test = pd.read_csv(_test_results_path, header=0)
else:
    RGRResults_test = pd.DataFrame(columns=_test_result_columns)

RGRResults_test.head()

In [None]:
# Build the multiple linear regression model and score it on the held-out test data

from sklearn.linear_model import LinearRegression

# Create object for the model

ModelMLR = LinearRegression()

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)

# Evaluation metrics for Regression analysis.
# `metrics` (sklearn) is already imported in the notebook's first cell; each
# score is computed once and reused for printing and for the results row.

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

print('Mean Absolute Error (MAE):', round(mae,3))
print('Mean Squared Error (MSE):', round(mse,3))
print('Root Mean Squared Error (RMSE):', round(rmse,3))
print('R2_score:', round(r2,6))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred):
    """Return the mean absolute percentage error of `y_pred`, in percent.

    Both arguments are converted to NumPy arrays and compared element-wise;
    each absolute error is divided by the matching value of `y_test`, so a
    zero in `y_test` leads to a division by zero.
    """
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE (printed once; the extra sklearn-based print of the same
# metric was redundant)

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values.
# The sample size must be the number of rows the R2 score was computed on
# (the test rows), not the size of the full dataset.

r_squared = round(r2,6)
n_samples, n_features = x_test.shape
adjusted_r_squared = round(1 - (1-r_squared)*(n_samples-1)/(n_samples-n_features-1),6)
print('Adj R Square: ', adjusted_r_squared)
print('------------------------------------------------------------------------------------------------------------')
#-------------------------------------------------------------------------------------------
# NOTE(review): the RMSLE entry stores log(RMSE), which is not the usual
# definition of root mean squared log error - confirm this is intended.
new_row = {'Model Name' : ModelMLR,
           'Mean_Absolute_Error_MAE' : mae,
           'Adj_R_Square' : adjusted_r_squared,
           'Root_Mean_Squared_Error_RMSE' : rmse,
           'Mean_Absolute_Percentage_Error_MAPE' : result,
           'Mean_Squared_Error_MSE' : mse,
           'Root_Mean_Squared_Log_Error_RMSLE': np.log(rmse),
           'R2_score' : r2}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row
RGRResults_test = pd.concat([RGRResults_test, pd.DataFrame([new_row])], ignore_index=True)
#-------------------------------------------------------------------------------------------

In [None]:
# Show the test-set results table accumulated so far
RGRResults_test

In [None]:
# Put each actual test price next to the price predicted for it
Results = pd.DataFrame({'PRICE_A': y_test, 'PRICE_P': y_pred})

# Attach the actual/predicted pair to the original rows, matching on the row index

ResultsFinal = pd.merge(data, Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

In [None]:
# Number of rows and columns in the merged test-set results
ResultsFinal.shape

In [None]:
# Signed percentage error of every prediction, relative to the actual price

actual_price = ResultsFinal['PRICE_A']
predicted_price = ResultsFinal['PRICE_P']
ResultsFinal['%Error'] = ((actual_price - predicted_price) / actual_price * 100).round(3)

In [None]:
# Display the first 10 rows of the merged results (original columns plus
# PRICE_A, PRICE_P and %Error)

ResultsFinal.head(10)

In [None]:
# Actual vs predicted prices on the test data
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel="Prices", ylabel="Predicted prices", title="Prices vs Predicted prices")
plt.show()

In [None]:
# Residuals (actual - predicted) plotted against the predicted value
fig, ax = plt.subplots()
ax.scatter(y_pred, y_test - y_pred)
ax.set(title="Predicted vs residuals", xlabel="Predicted", ylabel="Residuals")
plt.show()

In [None]:
# Checking Normality of errors
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay replaces it.
# histplot draws counts by default, which is what the "Frequency" label describes.
residuals = y_test - y_pred
sns.histplot(residuals, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

### Comparison with different Algorithms

In [None]:
# Build the Regression / Regressor models and score each one on the held-out test data

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()
ModelBRR = BayesianRidge()

ModelSVR = SVR()
modelXGR = xgb.XGBRegressor()
ModelKNN = KNeighborsRegressor(n_neighbors=5)
# Second name for the same BayesianRidge model: the name stays available, but
# the model is listed only once below so it does not produce a duplicate row.
modelBRR = ModelBRR
modelBGR = BaggingRegressor()
modelGBR = GradientBoostingRegressor(loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)


# Evaluation matrix for all the algorithms

MM = [ModelDCR, ModelRFR, ModelETR, ModelBRR, ModelSVR, modelXGR, ModelKNN, modelBGR, modelGBR]

# Define the function to calculate the MAPE - Mean Absolute Percentage Error
# (defined once here instead of on every loop iteration)

def MAPE (y_test, y_pred):
    """Return the mean absolute percentage error of `y_pred`, in percent.

    Both arguments are converted to NumPy arrays and compared element-wise;
    each absolute error is divided by the matching value of `y_test`, so a
    zero in `y_test` leads to a division by zero.
    """
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Adjusted R squared needs the size of the data that is actually scored
# (the test rows), not the size of the full dataset.
n_samples, n_features = x_test.shape

# One metrics row per model; all rows are added to the results table after the loop
test_rows = []

for models in MM:

    # Fit the model with train data

    models.fit(x_train, y_train)

    # Predict the model with test data

    y_pred = models.predict(x_test)

    # Print the model name

    print('Model Name: ', models)

    # Evaluation metrics for Regression analysis (`metrics` is imported in the first cell)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_test, y_pred)

    print('Mean Absolute Error (MAE):', round(mae,3))
    print('Mean Squared Error (MSE):', round(mse,3))
    print('Root Mean Squared Error (RMSE):', round(rmse,3))
    print('R2_score:', round(r2,6))
    # NOTE(review): this prints log(RMSE), which is not the usual definition
    # of root mean squared log error - confirm this is intended.
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(rmse),3))

    # Evaluation of MAPE

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')

    # Calculate Adjusted R squared values

    r_squared = round(r2,6)
    adjusted_r_squared = round(1 - (1-r_squared)*(n_samples-1)/(n_samples-n_features-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : mae,
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : rmse,
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : mse,
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(rmse),
               'R2_score' : r2}
    test_rows.append(new_row)
    #-------------------------------------------------------------------------------------------

# DataFrame.append was removed in pandas 2.0; add all collected rows with a single concat
RGRResults_test = pd.concat([RGRResults_test, pd.DataFrame(test_rows)], ignore_index=True)

In [None]:
# Show the test-set results table, now including the models from the loop above
RGRResults_test