In [1]:
""" import libraries """
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
import io
import requests

In [2]:
""" Load dataset """
def load_dataset(url: str, sep: str = '\t', decode: str = 'utf-8'):
    """Load the csv datset

    Parameters
    ----------
    url : str
        target url of the dataset to be loaded
    sep : str
        sep argument from `pd.read_csv`
    decode : str
        decode method to be used in the dataframe

    Returns
    -------
    dataframe : pd.core.frame.DataFrame 
    """
    def requests_csv(url, decode):
        return (requests.get(url).content).decode(decode)
    return pd.read_csv(io.StringIO(requests_csv(url, decode)), sep=sep, error_bad_lines=True)


In [3]:
url_test, url_train = "https://raw.githubusercontent.com/Y4rd13/ILIAS/main/datasets/W05_test.txt", "https://raw.githubusercontent.com/Y4rd13/ILIAS/main/datasets/W05_train.txt"
train_df, test_df = load_dataset(url_train), load_dataset(url_test)

In [4]:
""" EDA"""
def analyze_data(train_df, test_df):
  print("number of columns: ", len(train_df.columns))
  print("columns names: ", train_df.columns)
  print("info: \n", train_df.info)
  print("head : \n", train_df.head)

In [5]:
""" cleaning """
def clean_dataset(train_df, test_df):
  train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
  test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
  train_df = train_df.dropna()
  test_df = test_df.dropna()
  return train_df, test_df

In [6]:
""" create dataset """
def split_dataset(train_df, test_df, num_cols, single_feature=0):
  if single_feature:
    X_train = train_df.iloc[ : , single_feature-1:single_feature]
    X_test = test_df.iloc[ : , single_feature-1:single_feature]
  else:
    X_train = train_df.iloc[ : , 0:num_cols]
    X_test = test_df.iloc[ : , 0:num_cols]
  
  Y_train = train_df["relative_humidity_3pm"]
  Y_test = test_df["relative_humidity_3pm"]
  
  # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

  return X_train, X_test, Y_train, Y_test, test_df 

In [7]:
def train_model(X_train, X_test, Y_train, Y_test, test_df, model_type="Linear", alpha=0):
  if model_type == "Linear":
    model = LinearRegression()
  elif model_type == "Ridge":
    model = Ridge(alpha=alpha) 
  elif model_type == "Lasso":
    model = Lasso(alpha=alpha)

  model.fit(X_train, Y_train)
  train_acc = model.score(X_train, Y_train)
  print('training acc :', train_acc)

  Y_pred = model.predict(X_test)
  print("r2_score: ",r2_score(Y_test, Y_pred))
  


In [8]:
train_df, test_df = clean_dataset(train_df, test_df)
""" train using 9,6,7 features """
for num_of_features in [9, 6, 7]:
  model_type="Linear"
  print("\n model_type : ", model_type)
  print("\n number of features : ", num_of_features)
  X_train, X_test, Y_train, Y_test, test_df = split_dataset(train_df, test_df, 8)
  train_model(X_train, X_test, Y_train, Y_test, test_df)
  print("==="*24)

""" train using 7th features """
print("\n 7th feature ")
model_type="Linear"
print("\n model_type : ", model_type)
X_train, X_test, Y_train, Y_test, test_df = split_dataset(train_df, test_df, 0, single_feature=7)
train_model(X_train, X_test, Y_train, Y_test, test_df)
print("==="*24)

""" with ridge """
print("\n with ridge ")
X_train, X_test, Y_train, Y_test, test_df = split_dataset(train_df, test_df, 9)
for alpha in [10,100]:
  model_type="Ridge"
  print("\n model_type : ", model_type)
  print("\n alpha : ", alpha)
  train_model(X_train, X_test, Y_train, Y_test, test_df, model_type=model_type, alpha=alpha)
  print("==="*24)

""" with lasso """
X_train, X_test, Y_train, Y_test, test_df = split_dataset(train_df, test_df, 9)
for alpha in [10]:
  model_type="Lasso"
  print("\n model_type : ", model_type)
  print("\n alpha : ", alpha)
  train_model(X_train, X_test, Y_train, Y_test, test_df, model_type=model_type, alpha=alpha)
  print("==="*24)


 model_type :  Linear

 number of features :  9
training acc : 0.7836718496784403
r2_score:  0.7274624557985638

 model_type :  Linear

 number of features :  6
training acc : 0.7836718496784403
r2_score:  0.7274624557985638

 model_type :  Linear

 number of features :  7
training acc : 0.7836718496784403
r2_score:  0.7274624557985638

 7th feature 

 model_type :  Linear
training acc : 0.7205504432516265
r2_score:  0.7479772044517505

 with ridge 

 model_type :  Ridge

 alpha :  10
training acc : 0.7859826056814028
r2_score:  0.7217155552690024

 model_type :  Ridge

 alpha :  100
training acc : 0.7661884367393521
r2_score:  0.7402975339860983

 model_type :  Lasso

 alpha :  10
training acc : 0.7361773012054045
r2_score:  0.74694688048163
