# Homework Wet Assignment 3 - Regression

In [1]:
# imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from LinearRegressor import LinearRegressor as LR
from test_lr import test_lr
from verify_gradients import compare_gradients

## Preliminary: Updated Data Loading

### Load data

In [2]:
# load data set
def load_data(filename) -> np.ndarray:
    with open(f'data/{filename}') as file:
        data = pd.read_csv(file)
        return data

dataset = load_data('virus_data.csv')

# divide to train and test using the ID's (same method as in Wet HW 1)
id1 = 21
id2 = 9
train = dataset.sample(frac=0.8, random_state=id1 + id2)
test = dataset.drop(train.index)

In [3]:
# make sure data is loaded correctly (commented in submission)
# assignment 2 data used

# train_hw2_path = '../wet_hw2/data/train_prepared.csv'
# test_hw2_path = '../wet_hw2/data/test_prepared.csv'
#
# def equals(l1, l2):
#     return len(l1) == len(l2) and all(v == u for v, u in zip(l1, l2))
#
#
# with open(train_hw2_path) as file:
#     train_hw2_data = pd.read_csv(file, index_col=0)
#     assert equals(train_hw2_data['patient_id'], train['patient_id'])
#
# with open(test_hw2_path) as file:
#     test_hw2_data = pd.read_csv(file, index_col=0)
#     assert equals(test_hw2_data['patient_id'], test['patient_id'])

### Preprocessing
Exactly as done in Wet HW 1.

In [8]:
def get_missing_data_stats(ds: pd.DataFrame) -> dict:
    """
    Go over the columns of train and test and check for each column how many missing data is there
    Arg: ds - the dataset
    returns: a list of the names
    """

    has_nan_columns = {}
    for col_name in ds.columns:
        column = ds[col_name]

        nan_mask = pd.isnull(column) # mark for each line if NaN or not
        if nan_mask.any():
            has_nan_columns[col_name] = nan_mask.value_counts()[True]
    return has_nan_columns


print(f'{" ":10}> Train set - missing data')

missing_stats_train = get_missing_data_stats(train)
for col_name, missing in missing_stats_train.items():
    print(f'in column {col_name} - missing: {missing}')
print('-' * 70)

print(f'{" ":10}> Test set - missing data')

missing_stats_test = get_missing_data_stats(test)
for col_name, missing in missing_stats_test.items():
    print(f'in column {col_name} - missing: {missing}')
print('-' * 70)

for col_name in missing_stats_train.keys():
    print(f'The mean of {col_name} is - \t{train[col_name].mean()}')
    print(f'The median of {col_name} is - \t{train[col_name].median()}')

          > Train set - missing data
in column PCR_03 - missing: 53
----------------------------------------------------------------------
          > Test set - missing data
in column PCR_03 - missing: 21
----------------------------------------------------------------------
The mean of PCR_03 is - 	0.521187504661258
The median of PCR_03 is - 	-0.0070285297453213


In [11]:
for col_name in missing_stats_train.keys():
    train[col_name]=train[col_name].fillna('median')

for col_name in missing_stats_test.keys():
    test[col_name]=test[col_name].fillna('median')

missing_stats_train = get_missing_data_stats(train)
for col_name, missing in missing_stats_train.items():
    print(f'in column {col_name} - missing: {missing}')
print('-' * 70)

print(f'{" ":10}> Test set - missing data')

missing_stats_test = get_missing_data_stats(test)
for col_name, missing in missing_stats_test.items():
    print(f'in column {col_name} - missing: {missing}')
print('-' * 70)

----------------------------------------------------------------------
          > Test set - missing data
----------------------------------------------------------------------


In [5]:
def normalize(
        test: pd.DataFrame,
        train: pd.DataFrame,
        mm_scale_columns: list,
        z_scale_columns: list
) -> (pd.DataFrame, pd.DataFrame):
    scaler_mm = MinMaxScaler(feature_range=(-1, 1))
    scaler_z = StandardScaler()

    scaler_mm.fit(train[mm_scale_columns])
    train[mm_scale_columns] = scaler_mm.transform(train[mm_scale_columns])
    scaler_mm.fit(test[mm_scale_columns])
    test[mm_scale_columns] = scaler_mm.transform(test[mm_scale_columns])

    scaler_z.fit(train[z_scale_columns])
    train[z_scale_columns] = scaler_z.transform(train[z_scale_columns])
    scaler_z.fit(test[z_scale_columns])
    test[z_scale_columns] = scaler_z.transform(test[z_scale_columns])
    return train, test

def fill_nan_with_median(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()  # to avoid modifying original DataFrame
    for col_name in get_missing_data_stats(df).keys():
        if df[col_name].dtype.kind in 'biufc':  # only numeric types
            df[col_name] = df[col_name].fillna('median')
    return df

# train = fill_nan_with_median(train)

# for col_name in get_missing_data_stats(train).keys():
#     print(f'The mean of {col_name} is - \t{train[col_name].mean()}')
#     print(f'The median of {col_name} is - \t{train[col_name].median()}')
#
for col_name in missing_stats_train.keys():
     train[col_name].fillna('median')
for col_name in missing_stats_test.keys():
     test[col_name].fillna('median')

# test = fill_nan_with_median(test)
# print(get_missing_data_stats(train))
assert get_missing_data_stats(train) == {}
assert get_missing_data_stats(test) == {}
print('assert success')


mm_scale_columns = ['PCR_01', 'PCR_02', 'PCR_04', 'PCR_06', 'PCR_08']
z_scale_columns = ['PCR_03', 'PCR_05', 'PCR_07', 'PCR_09', 'PCR_10']
train, test = normalize(train, test, mm_scale_columns, z_scale_columns)

AssertionError: 

## Section 1: Linear regression implementation

### load new division for the dataset

In [11]:
# divide to train and test using the ID's (same method as in Wet HW 1)
train_new = dataset.sample(frac=0.8)
test_new = dataset.drop(train.index)

train_new = fill_nan_with_median(train_new)
test_new = fill_nan_with_median(test_new)
train_new, test_new = normalize(train_new, test_new, mm_scale_columns, z_scale_columns)
X_train = train_new.drop(columns=['contamination_level']).to_numpy()
Y_train = train_new['contamination_level'].to_numpy()


assert np.all(np.isfinite(X_train)), "X_train contains NaN or Inf"
assert np.all(np.isfinite(Y_train)), "Y_train contains NaN or Inf"
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)

compare_gradients(X_train, Y_train, deltas=np.logspace(-7, -2, 9))

NameError: name 'fill_nan_with_median' is not defined

### Q2