### Import required libraries

In [None]:
# Libraries to process file and visualization for EDA
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Outlier detection 
from scipy.stats import zscore

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Use seaborn's 'white' style and 'notebook' plotting context for the plots that follow
sns.set(style='white', context='notebook')

In [None]:
# Seaborn theme options applied in the cell above: (style='white', context='notebook')

In [None]:
# Get current working directory
# print(os.getcwd())
# Change current working directory pointing to dataset
# os.chdir(r"../data/project/regression")

### Import Dataset

In [None]:
# Load the housing dataset from CSV into a pandas DataFrame.
dataset = pd.read_csv(r"USA_Housing.csv")
# Keep an independent snapshot of the raw data for future reference.
# A plain assignment would only create a second name for the same object, so
# any in-place change to `dataset` would silently change the "original" too.
dataset_original = dataset.copy()

In [None]:
# Preview the first five rows to get a feel for the columns and their values
dataset.head(n=5)

In [None]:
# List the column labels available in the dataset
dataset.columns

In [None]:
# Print every column's dtype together with its non-null count
dataset.info()

# Observation: There are 5000 records in total.
# Every column other than 'Address' is numerical with dtype float64.
# The columns 'Avg. Area Income', 'Avg. Area Number of Rooms' and
# 'Avg. Area Number of Bedrooms' contain some missing values.

In [None]:
# Remove the feature(s) that are not needed for the numerical analysis
# (positional alternative: dataset = dataset.iloc[:, 0:6])
dataset = dataset.drop(columns='Address')

### Checking missing values

In [None]:
# Count the missing values per column and express them as a percentage of all rows
total_missing = dataset.isna().sum()
percentage_missing = total_missing.mul(100).div(dataset.shape[0])
missing_value_df = pd.DataFrame(
    [total_missing, percentage_missing],
    index=["Total", "%"],
).transpose()
missing_value_df

# Observation: The dataset does contain missing values, but no column is missing
# 25% or more of its values, so no column needs to be dropped; the gaps are imputed instead.

# Imputation strategy: a character variable is filled with its mode. A numeric
# variable is first checked for outliers: if outliers are present it is filled
# with the median, otherwise with the mean.

In [None]:
# Summary statistics, one row per column.
# Rule of thumb: a difference of more than 10% between MEAN and MEDIAN hints at outliers.
dataset.describe().transpose()


# Observation: MEAN and MEDIAN are close for every column, so there are few or no outliers

### Univariate Analysis

In [None]:
### Scatter plot of every numerical column against the row index, coloured by Price,
### to inspect how the values are spread
for feature in dataset:
    figure, axes = plt.subplots()
    sns.scatterplot(x=feature, y=dataset.index, hue=dataset['Price'], data=dataset, ax=axes)

In [None]:
### Draw a KDE plot for each numerical column and analyse its distribution.
# sns.displot is a figure-level function: it always creates its own figure, so
# calling plt.figure() first would only leave one empty figure per column.
for column in dataset.columns:
    sns.displot(dataset[column], color='darkorange', kind='kde')
    # Histogram alternative:
    # sns.displot(dataset[column], color='darkorange', kind='hist')

# Observation: Apart from the column 'Avg. Area Number of Bedrooms',
# all columns appear to be normally (Gaussian / symmetrically) distributed.

In [None]:
### Draw a box plot for each numerical column.
# A box plot shows the five-point summary and makes outliers visible.
# No `hue` is passed: using the plotted continuous column as hue would split the
# data into one group per distinct value (or raise on seaborn versions that
# require both x and y with hue) instead of drawing a single box per column.
for column in dataset.columns:
    plt.figure()
    sns.boxplot(x=dataset[column])

# Observation: There are outliers in the columns 'Avg. Area Income', 'Avg. Area House Age',
# 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population' and 'Price'

### Bivariate analysis

In [None]:
# Plot the pairwise relationships between all variables (independent and dependent)
sns.pairplot(dataset)

In [None]:
# Compute the pairwise Pearson correlation between all columns
corr = dataset.corr(method='pearson')
corr

In [None]:
# Show the correlation matrix as an annotated heat map to check for multicollinearity between features
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(data=corr, annot=True, cmap='RdYlGn', ax=heatmap_ax)
plt.show()

### Pre-processing

In [None]:
# Inspect skewness and kurtosis to decide whether the data needs scaling/transformation.

skew = dataset.skew()
kurt = dataset.kurtosis()

# Each header is followed by the per-column values on the next line
print("\nSkew\n", skew, sep="\n")
print("\nKurt\n", kurt, sep="\n")

# Observation: Skew and Kurt values are within the range -3 to 3, so no transformation is needed.

In [None]:
# Outlier removal
# The univariate analysis showed that every column except
# 'Avg. Area Number of Bedrooms' is approximately normally distributed, so the
# z-score rule is applied to those columns and the IQR rule to the bedrooms column.
print('Before outlier removal : ', dataset.shape)

# Column(s) handled by the IQR rule; every other column is handled by the z-score rule.
cols = ['Avg. Area Number of Bedrooms']
zscore_cols = [name for name in dataset.columns if name not in cols]

# Z-score method.
# Scores are computed per column (axis=0). Scoring along axis=1 would standardise
# the handful of features inside each row, which can never exceed the threshold
# and therefore never removes anything.
# nan_policy='omit' keeps missing values out of the mean/std; their scores stay NaN.
z = pd.DataFrame(
    np.abs(zscore(dataset[zscore_cols].to_numpy(), axis=0, nan_policy='omit')),
    index=dataset.index,
    columns=zscore_cols,
)
threshold = 3
# print(np.where(z > threshold))
# A missing value is not an outlier: keep those rows so they can be imputed later.
dataset = dataset[((z < threshold) | z.isna()).all(axis=1)]
print('After Z-Score approach : ', dataset.shape)

# IQR method for the column(s) that are not normally distributed.
Q1 = dataset[cols].quantile(0.25)  # first quartile
Q3 = dataset[cols].quantile(0.75)  # third quartile
IQR = Q3 - Q1
is_outlier = (dataset[cols] < (Q1 - 1.5 * IQR)) | (dataset[cols] > (Q3 + 1.5 * IQR))
# Filter with a row-wise boolean Series: indexing with a boolean DataFrame would
# only mask the offending cells to NaN and leave the number of rows unchanged.
# .copy() gives an independent frame so later column assignments are unambiguous.
dataset = dataset[~is_outlier.any(axis=1)].copy()
print('After IQR approach : ', dataset.shape)

In [None]:
# Handle missing values
# Impute the columns that contain gaps: 'Avg. Area Income' with its median,
# 'Avg. Area Number of Rooms' and 'Avg. Area Number of Bedrooms' with their mean.

imputation_plan = {
    'Avg. Area Income': 'median',
    'Avg. Area Number of Rooms': 'mean',
    'Avg. Area Number of Bedrooms': 'mean',
}
for feature, statistic in imputation_plan.items():
    fill_value = dataset[feature].agg(statistic)
    dataset[feature] = dataset[feature].fillna(fill_value)

# Not imputed (kept for reference):
# dataset['Avg. Area House Age'] = dataset['Avg. Area House Age'].fillna(dataset['Avg. Area House Age'].mean())
# dataset['Area Population'] = dataset['Area Population'].fillna(dataset['Area Population'].mean())
# dataset['Price'] = dataset['Price'].fillna(dataset['Price'].mean())


In [None]:
# Verify the imputation: count the remaining missing values per column
dataset.isna().sum()

In [None]:
# Build the feature matrix for train/test by leaving out the dependent variable
x = dataset.drop('Price', axis=1)
x

In [None]:
# Extract the dependent (target) variable from the dataframe
y = dataset.loc[:, 'Price']
y

###                                                                    ------------ END ------------