**Initialize**

Import packages

In [27]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

Read in train and test datasets

In [28]:
# read in the train dataset
train_data = pd.read_csv(
    'train.dat', 
     sep=',')

# read in the test dataset
test_data = pd.read_csv(
    'test.dat', 
     sep=',')

  train_data = pd.read_csv(
  test_data = pd.read_csv(


Split train dataset into train and target datasets

In [29]:
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

X_test = test_data.iloc[:, :].values

Create dataframes for train and test datasets

In [30]:
# convert to dataframes
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

y_train_df = pd.DataFrame(y_train)

**Data Cleaning**

Fill missing values in train and test datasets with 0 to prepare for feature selection, dimensionality reduction, and model training

In [31]:
X_train_df = X_train_df.fillna(0)
X_test_df = X_test_df.fillna(0)

Identify mixed datatype columns for train and test datasets and make them all string type

In [32]:
# identify mixed-type columns 
for column in X_train_df.columns:
    types = X_train_df[column].apply(type).unique()
    if len(types) > 1:
        print(f"{column} has mixed types: {types}")
        X_train_df[column] = X_train_df[column].astype(str) # convert to string type

# identify mixed-type columns 
for column in X_test_df.columns:
    types = X_test_df[column].apply(type).unique()
    if len(types) > 1:
        print(f"{column} has mixed types: {types}")
        X_test_df[column] = X_test_df[column].astype(str)   # convert to string type

8 has mixed types: [<class 'str'> <class 'int'>]
478 has mixed types: [<class 'str'> <class 'int'>]
8 has mixed types: [<class 'str'> <class 'int'>]
478 has mixed types: [<class 'str'> <class 'int'>]


**Label Encoding**

Use LabelEncoder on train and test datasets to convert string type columns to numeric type to prepare for feature selection, dimensionality reduction, and model training

In [33]:
le = LabelEncoder()
# select columns with object dtype
string_cols = X_train_df.select_dtypes(include=['object']).columns

# label encode each column
for col in string_cols:
    X_train_df[col] = X_train_df[col].fillna('NaN') # fill in missing values with a placeholder
    X_train_df[col] = le.fit_transform(X_train_df[col])

In [34]:
# select columns with object dtype
string_cols = X_test_df.select_dtypes(include=['object']).columns

# label encode each column
for col in string_cols:
    X_test_df[col] = X_test_df[col].fillna('NaN') # fill in missing values with a placeholder
    X_test_df[col] = le.fit_transform(X_test_df[col])

**Validation Tests**

Find best K, n_components for feature selection and dimensionality reduction using the train dataset (split)

In [35]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train_df, y_train_df, test_size=0.35, random_state=42)

best_k = 0
best_n = 0
best_rmse = float('inf')

for k in range(1, 40, 1):
    # select k best features
    selector = SelectKBest(f_regression, k=k)
    X_tr_selected = selector.fit_transform(X_tr, y_tr)
    X_val_selected = selector.transform(X_val)
    for n in range(1, k, 1):
        # perform PCA
        pca = PCA(n_components=n)
        X_tr_pca = pca.fit_transform(X_tr_selected)
        X_val_pca = pca.transform(X_val_selected)
        lr = LinearRegression()
        lr.fit(X_tr_pca, y_tr)

        # predict on validation set
        y_pred_val = lr.predict(X_val_pca)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

        # update best rmse
        if rmse < best_rmse:
            best_k = k
            best_n = n
            best_rmse = rmse

# print best k, n, and rmse
print("Best k for SelectKBest:", best_k)
print("Best n for PCA:", best_n)
print("Best RMSE:", best_rmse)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

**Feature Selection**

Perform feature selection on train dataset using SelectKBest to select top k features

In [36]:
# select k best features
selector = SelectKBest(f_regression, k=best_k)

X_train_selected = selector.fit_transform(X_train_df, y_train_df)

selected_indices = selector.get_support(indices=True)

selected_features = X_train_df.columns[selected_indices]

X_test_selected = selector.transform(X_test_df)

  y = column_or_1d(y, warn=True)


**Dimensionality Reduction**

Perform dimensionality reduction on train dataset using PCA to reduce dimensionality

In [37]:
# perform PCA
pca = PCA(n_components=best_n)
X_train_pca = pca.fit_transform(X_train_selected)

X_test_pca = pca.transform(X_test_selected)

**Predict**

Make predictions on test dataset using trained model

In [38]:
lr = LinearRegression()

# fit model
lr.fit(X_train_pca, y_train_df)

# predict on test set
y_pred = lr.predict(X_test_pca)

# convert to dataframe
y_pred_df = pd.DataFrame(y_pred)

**Output**

Save predictions to csv file

In [39]:
y_pred_df.to_csv('output.dat', index=False, header=False)