<a href="https://colab.research.google.com/github/SauravMaheshkar/Regression-Study/blob/main/notebooks/Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Author: [@SauravMaheshkar](https://twitter.com/MaheshkarSaurav)

# Packages 📦 and Basic Setup
---

## Install Packages

In [1]:
%%capture
!pip install -q --upgrade wandb

import os
import pandas as pd

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")

# Paste your api key here
os.environ["WANDB_API_KEY"] = '...'

## Project Configuration using **`wandb`**

In [2]:
import wandb

# Open the Weights & Biases run; later cells log to it and eventually finish it.
run = wandb.init(
    project='linear-regression-sklearn',
    entity='sauravmaheshkar',
    job_type="dataset",
)

# Read the raw train / test CSV files directly from the GitHub repository.
train_dataset = pd.read_csv(
    "https://raw.githubusercontent.com/SauravMaheshkar/Regression-Study/main/data/raw/train.csv"
)
test_dataset = pd.read_csv(
    "https://raw.githubusercontent.com/SauravMaheshkar/Regression-Study/main/data/raw/test.csv"
)

# Log each raw frame as its own W&B table: one log call per table, train first.
for table_key, frame in (
    ("Train Dataset", train_dataset),
    ("Test Dataset", test_dataset),
):
    run.log({table_key: wandb.Table(dataframe=frame)})

[34m[1mwandb[0m: Currently logged in as: [33msauravmaheshkar[0m (use `wandb login --relogin` to force relogin)


# 🧼 Data Pre-Processing
---

In [3]:
# Regression target: the SalePrice column of the training frame.
target = train_dataset["SalePrice"]

# Names of every column that contains at least one missing value.
has_missing = train_dataset.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()

# Candidate predictors: everything except the row id, the target and the
# columns with missing values.
candidate_train_predictors = train_dataset.drop(
    columns=['Id', 'SalePrice'] + cols_with_missing
)

# Object-typed columns with fewer than 10 distinct values (these get one-hot encoded).
low_cardinality_cols = [
    name
    for name, dtype in candidate_train_predictors.dtypes.items()
    if dtype == "object" and candidate_train_predictors[name].nunique() < 10
]

# Columns stored as int64 or float64.
numeric_cols = [
    name
    for name, dtype in candidate_train_predictors.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Keep the low-cardinality columns first, then the numeric ones.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]

# One-hot encode the selected predictors.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

# Log the encoded design matrix to the active W&B run as a table.
encoded_table = wandb.Table(dataframe=one_hot_encoded_training_predictors)
run.log({"One Hot Encoded Dataset" : encoded_table})

# 💪🏻 Model + Training 
---

In [5]:
from sklearn.linear_model import LinearRegression, Ridge

# Build a LinearRegression estimator with default settings, then fit it on the
# one-hot encoded predictors against the SalePrice target. `fit` returns the
# estimator itself, so `reg` is the fitted model.
reg = LinearRegression()
reg.fit(X=one_hot_encoded_training_predictors, y=target)

In [6]:
# Hand the fitted model, the encoded predictors and the target to W&B's
# sklearn learning-curve helper.
wandb.sklearn.plot_learning_curve(reg, one_hot_encoded_training_predictors, target)
# Close the W&B run opened earlier in the notebook; nothing is logged to it after this.
run.finish()