**Initialize**

Import packages

In [81]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.decomposition import PCA, TruncatedSVD, KernelPCA, FastICA
from sklearn.random_projection import SparseRandomProjection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import time

Read in train and test datasets

In [82]:
# read in the train dataset
train_data = pd.read_csv(
    'train.dat', 
     sep=',')

# read in the test dataset
test_data = pd.read_csv(
    'test.dat', 
     sep=',')

  train_data = pd.read_csv(
  test_data = pd.read_csv(


Split train dataset into train and target datasets

In [83]:
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

X_test = test_data.iloc[:, :].values

Create dataframes for train and test datasets

In [84]:
# convert to dataframes
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

y_train_df = pd.DataFrame(y_train)

**Data Cleaning**

Fill missing values in train and test datasets with 0 to prepare for feature selection, dimensionality reduction, and model training

In [85]:
X_train_df = X_train_df.fillna(0)
X_test_df = X_test_df.fillna(0)

Identify mixed datatype columns for train and test datasets and make them all string type

In [86]:
# identify mixed-type columns 
for column in X_train_df.columns:
    types = X_train_df[column].apply(type).unique()
    if len(types) > 1:
        print(f"{column} has mixed types: {types}")
        X_train_df[column] = X_train_df[column].astype(str) # convert to string type

# identify mixed-type columns 
for column in X_test_df.columns:
    types = X_test_df[column].apply(type).unique()
    if len(types) > 1:
        print(f"{column} has mixed types: {types}")
        X_test_df[column] = X_test_df[column].astype(str)   # convert to string type

8 has mixed types: [<class 'str'> <class 'int'>]
478 has mixed types: [<class 'str'> <class 'int'>]
8 has mixed types: [<class 'str'> <class 'int'>]
478 has mixed types: [<class 'str'> <class 'int'>]


**Label Encoding**

Use LabelEncoder on train and test datasets to convert string type columns to numeric type to prepare for feature selection, dimensionality reduction, and model training

In [87]:
le = LabelEncoder()
# select columns with object dtype
string_cols = X_train_df.select_dtypes(include=['object']).columns

# label encode each column
for col in string_cols:
    X_train_df[col] = X_train_df[col].fillna('NaN') # fill in missing values with a placeholder
    X_train_df[col] = le.fit_transform(X_train_df[col])

In [88]:
# select columns with object dtype
string_cols = X_test_df.select_dtypes(include=['object']).columns

# label encode each column
for col in string_cols:
    X_test_df[col] = X_test_df[col].fillna('NaN') # fill in missing values with a placeholder
    X_test_df[col] = le.fit_transform(X_test_df[col])

**Validation Tests**

Find best K, n_components for feature selection and dimensionality reduction using the train dataset (split)

In [89]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train_df, y_train_df, test_size=0.2, random_state=42)

best_k = 0
best_n = 0
best_rmse = float('inf')
best_model = None
best_dim_red = None
best_feat_sel = None

models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    DecisionTreeRegressor(),
    RandomForestRegressor()
]

dim_reds = [
    PCA(), 
    TruncatedSVD(),
    FastICA(),
]

feat_sels = [
    SelectKBest(f_regression),
]

for k in range(2, 20, 1):
    for feat_sel in feat_sels:
        feat_sel.k = k
        X_tr_selected = feat_sel.fit_transform(X_tr, y_tr)
        X_val_selected = feat_sel.transform(X_val)

        for n in range(2, k+1, 1):
            for dim_red in dim_reds:
                # fit dimensionality reduction model on training set
                dim_red.fit(X_tr_selected, y_tr)

                # transform training and validation sets
                X_tr_red = dim_red.transform(X_tr_selected)
                X_val_red = dim_red.transform(X_val_selected)
            
                for model in models:
                    cur = time.time()
                    # fit model on training set
                    model.fit(X_tr_red, y_tr)

                    # predict on validation set
                    y_pred_val = model.predict(X_val_red)
                    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

                    # print("Model: ", model, rmse, time.time() - cur)

                    # update best rmse and model
                    if rmse < best_rmse:
                        best_k = k
                        best_n = n
                        best_rmse = rmse
                        best_model = model
                        best_dim_red = dim_red
                        best_feat_sel = feat_sel

# print best k, n, and rmse
print("Best k for feature selection:", best_k)
print("Best n for dimensionality reduction:", best_n)
print("Best Feature Selection:", best_feat_sel)
print("Best Dimensionality Reduction:", best_dim_red)
print("Best model:", best_model)
print("Best RMSE:", best_rmse)

  y = column_or_1d(y, warn=True)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  y = column_or_1d(y, warn=True)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  y = column_or_1d(y, warn=True)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  y = column_or_1d(y, warn=True)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  model.fit(X_tr_red, y_tr)
  y = column_or_1d(y, warn=T

Best k for feature selection: 8
Best n for dimensionality reduction: 2
Best Feature Selection: SelectKBest(k=19, score_func=<function f_regression at 0x00000228FF8AC550>)
Best Dimensionality Reduction: PCA()
Best model: LinearRegression()
Best RMSE: 4.733717363589612


**Feature Selection**

Perform feature selection on train dataset using SelectKBest to select top k features

In [90]:
# select k best features
best_feat_sel.k = best_k

X_train_selected = best_feat_sel.fit_transform(X_train_df, y_train_df)

selected_indices = best_feat_sel.get_support(indices=True)

selected_features = X_train_df.columns[selected_indices]

X_test_selected = best_feat_sel.transform(X_test_df)

  y = column_or_1d(y, warn=True)


**Dimensionality Reduction**

Perform dimensionality reduction on train dataset using PCA to reduce dimensionality

In [91]:
# perform PCA
best_dim_red.n_components = best_n

X_train_red = best_dim_red.fit_transform(X_train_selected)

X_test_red = best_dim_red.transform(X_test_selected)

**Predict**

Make predictions on test dataset using trained model

In [92]:
# fit model
best_model.fit(X_train_red, y_train_df)

# predict on test set
y_pred = best_model.predict(X_test_red)

# convert to dataframe
y_pred_df = pd.DataFrame(y_pred)

**Output**

Save predictions to csv file

In [93]:
y_pred_df.to_csv('output.dat', index=False, header=False)