# Data Exploration:
## Here:
- Handle missing values and outliers.
- Encode categorical variables.
- Normalize/standardize numerical features.
- Split the data into training and testing sets.


- Script: scripts/data_preprocessing.py   (This file contains the reusable code and functions that we use below.)

#### Settings to make scripts in scripts/ accessible.  
- This creates a `.vscode` folder in the project root that includes the root path, as well as a `__pycache__` folder in the `scripts` directory.

In [6]:
import sys
import os

# Make the helper modules in ../scripts importable from this notebook.
# Note: this adds the scripts directory itself (not the project root) to sys.path.
# The membership check keeps re-running this cell from appending duplicate entries.
scripts_dir = os.path.abspath(os.path.join("..", "scripts"))
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

#### Necessary imports

In [7]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import data_preprocessing as dp  # This is a script file. 

#### Load data

In [8]:
# Load the Boston house-prices CSV through the project helper dp.read_csv
# (from scripts/data_preprocessing.py), then preview the first 10 rows.
df = dp.read_csv(path='../data/boston_house_prices.csv')
df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


#### Handle missing values and outliers

In [9]:
# Handle missing values with dp.handle_missing_values, the helper coded in
# scripts/data_preprocessing.py (per the original note, it drops them), and
# rebind df to the result before previewing the first 10 rows.

df = dp.handle_missing_values(df)
df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [10]:
# Check for outliers (and handle any that are found) using IQR statistics.
# dp.handling_outliers returns two values: outliers_info, which prints below as a
# dictionary of per-column counts, and df_filterd, the frame after outlier removal.

outliers_info , df_filterd = dp.handling_outliers(df)

print(f"Outliers information:  {outliers_info}")

Outliers information:  {'CRIM': 66, 'ZN': 45, 'INDUS': 0, 'CHAS': 34, 'NOX': 12, 'RM': 21, 'AGE': 0, 'DIS': 0, 'RAD': 55, 'TAX': 5, 'PTRATIO': 8, 'B': 25, 'LSTAT': 5, 'MEDV': 16}


In [11]:
# Display the data frame returned by dp.handling_outliers, i.e. after removing outliers.
df_filterd

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15,27.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


#### Observation: Using the IQR check (lower and upper bounds), multiple outliers were detected and removed so that the ML models can work more effectively.

#### Encode categorical variables

###### No categorical variables are present in the dataset. All features are numeric, so there is nothing to map to a fixed set of encoded values.

#### Normalize/Standardize numerical features.

In [12]:
# Separate the target from the features.
# Y is the target (MEDV, i.e. the price); X is every column except the target.
Y = df_filterd['MEDV']
X = df_filterd.drop(columns='MEDV')

In [21]:
# Normalize the feature matrix X with the project helper dp.normalize_data
# (presumably min-max scaling -- confirm in scripts/data_preprocessing.py).

X_normalised = dp.normalize_data(X)


In [27]:
# Standardize the data. dp.standardize_data returns the scaled values (displayed
# below as a NumPy array) together with a second value bound to `scaler`.
# NOTE(review): the input here is the already-normalised X_normalised, and the
# split cell further down passes the raw X rather than X_scaled -- confirm that
# applying both scalings, and leaving X_scaled unused there, is intended.
X_scaled, scaler = dp.standardize_data(X_normalised)

X_scaled

array([[-0.7851603 ,  0.89460434, -1.20901013, ..., -2.02620598,
         0.80946141, -1.36074317],
       [-0.71097775, -0.57240967, -0.29807632, ..., -0.52707695,
         0.80946141, -0.34769907],
       [-0.70200091, -0.57240967, -1.23388857, ...,  0.0126095 ,
         0.39213921, -1.30473352],
       ...,
       [-0.59275923, -0.57240967,  0.63199476, ...,  1.3918082 ,
         0.80946141, -1.20001983],
       [-0.42018495, -0.57240967,  0.63199476, ...,  1.3918082 ,
         0.29156155, -0.99546285],
       [-0.63994061, -0.57240967,  0.63199476, ...,  1.3918082 ,
         0.80946141, -0.65453454]])

#### Split the data into training and testing sets.

#### Split into 80% training and 20% testing data

In [31]:
# Split features and target into train/test sets via dp.split_data; 0.2 is
# presumably the test share, matching the 80%-20% heading above.
# NOTE(review): this splits the raw X, not X_scaled from the standardization
# step, so the printed (and later saved) features are unscaled -- confirm intent.
X_train, X_test, y_train, y_test = dp.split_data(X, Y, 0.2)
print(X_train, X_test)

        CRIM   ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
104  0.13960  0.0   8.56     0  0.520  6.167  90.0  2.4210    5  384     20.9   
297  0.14103  0.0  13.92     0  0.437  5.790  58.0  6.3200    4  289     16.0   
139  0.54452  0.0  21.89     0  0.624  6.151  97.9  1.6687    4  437     21.2   
213  0.14052  0.0  10.59     0  0.489  6.375  32.3  3.9454    4  277     18.6   
122  0.09299  0.0  25.65     0  0.581  5.961  92.9  2.0869    2  188     19.1   
..       ...  ...    ...   ...    ...    ...   ...     ...  ...  ...      ...   
136  0.32264  0.0  21.89     0  0.624  5.942  93.5  1.9669    4  437     21.2   
19   0.72580  0.0   8.14     0  0.538  5.727  69.5  3.7965    4  307     21.0   
119  0.14476  0.0  10.01     0  0.547  5.731  65.2  2.7592    6  432     17.8   
322  0.35114  0.0   7.38     0  0.493  6.041  49.9  4.7211    5  287     19.6   
131  1.19294  0.0  21.89     0  0.624  6.326  97.7  2.2710    4  437     21.2   

          B  LSTAT  
104  3

In [32]:
# Print the target values (MEDV) of the training split followed by the testing split.
print(y_train, y_test)

104    20.1
297    20.3
139    17.8
213    28.1
122    20.5
       ... 
136    17.4
19     18.2
119    19.3
322    20.4
131    19.6
Name: MEDV, Length: 171, dtype: float64 13     20.4
342    16.5
86     22.5
336    19.5
178    29.9
172    23.1
20     13.6
111    22.8
96     21.4
245    18.5
325    24.6
313    21.6
242    22.2
93     25.0
321    23.1
309    20.3
335    21.1
23     14.5
63     25.0
327    22.2
127    16.2
501    22.4
503    23.9
340    18.7
249    26.2
133    18.4
43     24.7
36     20.0
318    23.1
21     19.6
75     21.4
239    23.3
294    21.7
120    22.0
89     28.7
495    23.1
314    23.8
270    21.1
125    21.4
109    19.4
130    19.2
350    22.9
80     28.0
Name: MEDV, dtype: float64


#### In order to use the training and testing data explicitly across the project, let us save them in CSV format (.csv)

In [33]:
# Wrap both feature splits in pandas DataFrames so they can be written out with
# DataFrame.to_csv.
# NOTE(review): the printed X_train/X_test above already look like DataFrames,
# so this wrapping may be redundant -- confirm.
X_train_df, X_test_df = (pd.DataFrame(split) for split in (X_train, X_test))

In [34]:
# Write the feature splits to ../data/ as CSV; index=False leaves the row index out of the files.
X_train_df.to_csv('../data/X_train.csv', index= False)
X_test_df.to_csv('../data/X_test.csv', index= False)

In [41]:
# Also save the full (unsplit) feature set X and target Y, again without the row index.
X.to_csv('../data/only_x.csv', index= False)
Y.to_csv('../data/only_y.csv', index= False)

In [35]:
# Save the target splits with np.savetxt instead of converting them to DataFrames.
# NOTE(review): the printed y_train/y_test above look like pandas Series rather
# than NumPy arrays; np.savetxt writes no header row, unlike the to_csv files.

np.savetxt('../data/y_train.csv',y_train, delimiter=',' )
np.savetxt('../data/y_test.csv',y_test, delimiter=',' )

#### 