In [84]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

In [None]:
# Location of the melb_data.csv dataset on GitHub; pd.read_csv downloads it,
# so this cell needs network access.
url = 'https://raw.githubusercontent.com/Totemi1324/neural-networks-demo/main/datasets/melb_data.csv'
data = pd.read_csv(url)

def preprocess_dataset(raw):
    """Split a raw table into non-object feature columns and the Price target.

    Args:
        raw: DataFrame that contains a ``Price`` column.

    Returns:
        A ``(X, y)`` tuple: ``X`` holds every column except ``Price`` whose
        dtype is not ``object``; ``y`` is the ``Price`` column.
    """
    target = raw.Price
    predictors = raw.drop(columns=["Price"]).select_dtypes(exclude=["object"])
    return predictors, target

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Train a 10-tree random forest and print its validation MAE.

    The forest is fitted on ``X_train``/``y_train`` with a fixed random
    state, then used to predict ``X_valid``; the mean absolute error against
    ``y_valid`` is printed. Nothing is returned.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
    mae = mean_absolute_error(y_valid, forest.predict(X_valid))
    print(f"MAE: {mae}")

# Preview the feature table (non-object columns, Price removed) produced from the raw data.
X_display, y_display = preprocess_dataset(data)
display(X_display)

### Exercise 1: Handling NaN values

Option 1: Drop all columns with NaN values

![Drop NaN columns](https://raw.githubusercontent.com/Totemi1324/neural-networks-demo/main/assets/drop_nan_columns.PNG)

In [None]:
# Option 1: discard every column that contains at least one NaN, then split
# the remaining table into features and target.
data1 = data.dropna(axis="columns")
X_processed, y = preprocess_dataset(data1)

display(X_processed)

Option 2: Drop all rows with NaN values

In [None]:
# Option 2: discard every row that contains at least one NaN, then split the
# remaining table into features and target.
data2 = data.dropna(axis="index", how="any")
X_processed, y = preprocess_dataset(data2)

display(X_processed)

Option 3: Imputation; fill NaN fields with the mean of their column

![Imputation](https://raw.githubusercontent.com/Totemi1324/neural-networks-demo/main/assets/imputation.PNG)

In [None]:
X, y = preprocess_dataset(data)

# Replace each NaN with the mean of its column. The strategy is spelled out
# (it is also SimpleImputer's default) so the code matches the description.
imputer = SimpleImputer(strategy="mean")

# fit_transform hands back an unlabeled array: restore the column names AND
# the original row index, otherwise X_processed gets a fresh 0..n-1 index and
# is only aligned with y as long as the data happens to use that index too.
X_processed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

display(X_processed)

Option 4: Extend imputation by an additional column

![Extended imputation](https://raw.githubusercontent.com/Totemi1324/neural-networks-demo/main/assets/imputation_extended.PNG)

In [None]:
X, y = preprocess_dataset(data)

# Columns that contain at least one NaN before imputation.
columns_with_missing = [column for column in X.columns if X[column].isnull().any()]

# Record where values were missing in an extra indicator column per affected
# feature, so that information survives the imputation below.
for column in columns_with_missing:
    X[f"{column}_was_missing"] = X[column].isnull()

# Replace each NaN with the mean of its column (SimpleImputer's default
# strategy, written out so the code matches the description).
imputer = SimpleImputer(strategy="mean")

# fit_transform hands back an unlabeled array: restore the column names AND
# the original row index, otherwise X_processed gets a fresh 0..n-1 index and
# is only aligned with y as long as the data happens to use that index too.
X_processed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

display(X_processed)

Choose one of the options above. To evaluate, execute the code block below:

In [None]:
# Hold out 20 % of the rows for validation; the fixed random_state keeps the
# split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_processed,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
score_dataset(X_train, X_valid, y_train, y_valid)

# Put the target back next to the training features so Price can be plotted
# grouped by the number of rooms.
train_combined = X_train.assign(Price=y_train)

train_combined.boxplot(column=["Price"], by="Rooms")

### Exercise 2: Changing train/test split

Choose the fractions for the train and test sets:

In [120]:
# Fractions of the rows handed to train_test_split as train_size / test_size
# in the evaluation cell below.
train_percentage = 0.85
test_percentage = 0.15

To evaluate, execute the code block below:

In [None]:
# Re-split the processed data with the fractions chosen above (same fixed
# seed as before) and report the validation error.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_processed,
    y,
    train_size=train_percentage,
    test_size=test_percentage,
    random_state=0,
)

score_dataset(X_train, X_valid, y_train, y_valid)

### Exercise 3: Normalizing numerical data

Turn on/off normalization:

In [122]:
# Toggle read by the evaluation cell below: True normalizes the features,
# False passes unmodified copies of the train/validation sets through.
should_normalize = False

To evaluate, execute the code block below:

In [None]:
if should_normalize:
    # Min-max scale each feature (column) using statistics of the TRAINING set
    # only, and apply the same transformation to the validation set.
    # Dividing every row by its own sum (the previous approach) mixes
    # unrelated features and yields inf/NaN whenever a row sums to 0.
    feature_min = X_train.min()
    # A constant column has range 0; divide by 1 instead so it maps to 0.
    feature_range = (X_train.max() - feature_min).replace(0, 1)
    X_train_norm = (X_train - feature_min) / feature_range
    # Validation values outside the training range land outside [0, 1].
    X_valid_norm = (X_valid - feature_min) / feature_range
else:
    X_train_norm = X_train.copy()
    X_valid_norm = X_valid.copy()

display(X_train_norm)

score_dataset(X_train_norm, X_valid_norm, y_train, y_valid)

# Plot whichever of the requested features are actually present: a column may
# have been removed by the chosen NaN-handling option, and matching names
# case-insensitively avoids a KeyError over a capitalization difference.
pairplot_wanted = {'rooms', 'buildingarea', 'propertycount'}
pairplot_columns = [column for column in X_train_norm.columns if str(column).lower() in pairplot_wanted]
sns.pairplot(X_train_norm[pairplot_columns], diag_kind='kde')

### Useful places to get data from (for practice or real-world applications):

- [Kaggle](https://www.kaggle.com/): Largest repository of publicly available datasets (over 50 000) for data science applications
- [Data.gov](https://data.gov/): US government's open data repository. Topics: agriculture and climate, energy, marine transportation, and more.
- [NASA Open Data Portal](https://data.nasa.gov/): Catalog of publicly available NASA datasets. Topics: national aeronautics and space data, physical oceanography, ocean biology data, earth resources observations, social-economic data, and more.
- [Earthdata by NASA](https://earthdata.nasa.gov/): NASA dataset catalog partition for earth-related collections. Topics: atmosphere, land, ocean, cryosphere, and more.
- [NASDAQ Data Link](https://data.nasdaq.com/): Data source from the world's largest technology fund. Topics: finance and economics, stock prices, trading activity, interest rate dynamics, and more.

There are many projects that start by first scouting for interesting datasets, so feel free to browse and let these inspire you!