# Lasso

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso

In [2]:
# Pre-processed splits, read from the notebook's working directory.
TRAIN_CSV = "processed_training.csv"
TEST_CSV = "processed_test.csv"

data_train = pd.read_csv(TRAIN_CSV)
data_test = pd.read_csv(TEST_CSV)

### Data Preparation

##### ensure the column order is the same in train and test

In [3]:
# Reorder the test frame so its columns follow the training frame's order.
# Only the training columns are kept; the result is an independent copy.
train_cols = list(data_train.columns)
data_test = data_test.loc[:, train_cols].copy()

##### drop unnecessary columns; they won't be useful in analysis or prediction

In [4]:
# Remove the columns that are not used for modelling from both splits.
unused_cols = ['venue', 'start_hour']

data_train = data_train.drop(columns=unused_cols)
data_test = data_test.drop(columns=unused_cols)

##### get dummies for team names and season type

In [5]:
# One-hot encode the categorical columns. Each split is encoded on its own,
# so the dummy columns produced depend on the categories present in that split.
categorical_cols = ['team1_name', 'team2_name', 'season_type']

data_train = pd.get_dummies(data_train, columns=categorical_cols)
data_test = pd.get_dummies(data_test, columns=categorical_cols)

In [6]:
# Ensure the train and test data have the same columns after one-hot encoding.

train_cols = set(data_train.columns)
test_cols = set(data_test.columns)

# Columns present in only one of the splits (both sets should be empty).
print(train_cols - test_cols)
print(test_cols - train_cols)

# The frames are later converted to positional NumPy arrays, so comparing sets is
# not sufficient: the column names AND their order must match, otherwise features
# would be silently misaligned between training and testing. Fail loudly here.
assert list(data_train.columns) == list(data_test.columns), (
    "train/test columns differ in name or order"
)

set()
set()


##### shuffle the training data

In [7]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled row order is identical on every
# Restart & Run All; without it each run produces a different order.
SHUFFLE_SEED = 42

data_train = shuffle(data_train, random_state=SHUFFLE_SEED)

##### prepare numpy arrays for training and testing

In [9]:
# Split each frame into the feature matrix (every column except the target)
# and the target vector, both as NumPy arrays.
target_col = 'attendance'

X_train = data_train.drop(columns=target_col).to_numpy()
Y_train = data_train[target_col].to_numpy()

X_test = data_test.drop(columns=target_col).to_numpy()
Y_test = data_test[target_col].to_numpy()

# print shape of data
for label, array in (
    ("X_train: ", X_train),
    ("Y_train", Y_train),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(label, array.shape)

X_train:  (12270, 273)
Y_train (12270,)
X_test (2445, 273)
Y_test (2445,)
