In [98]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [99]:
def mean_squared_error(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError("Input shapes do not match.")

    print(y_true, y_pred)

    return np.mean((y_true - y_pred) ** 2)

## Dataset extraction

In [100]:
train_file_path = 'train.csv'
test_file_path  = 'test.csv'
test_target_file_path = "sample2.csv"

train_data = np.loadtxt(train_file_path, delimiter=',', skiprows=1)
test_data  = np.loadtxt(test_file_path,  delimiter=',', skiprows=1)

y_train = train_data[:, 1]

X_train = train_data[:, 2:]
X_test  = test_data [:, 1:]

# Display the shape of the target variable and features
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:",  X_test.shape)

Shape of X_train: (9000, 100)
Shape of X_test: (1000, 100)


## Training algorithms

### Linear regression
Just use it for fun (to compare with more advanced algorithms).

In [101]:
model = LinearRegression()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred  = model.predict(X_test)

### Lasso regression
Just add L1-regularization to linear regression.

In [102]:
alpha = 0.01 # actually doesn't influence the MSE :(
model = Lasso(alpha=alpha)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred  = model.predict(X_test)

### Decision tree regression

In [103]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred  = model.predict(X_test)

### Random forest regression

In [104]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train.ravel())
y_train_pred = model.predict(X_train)
y_test_pred  = model.predict(X_test)

## Usage of trained model

In [105]:
mse_value = mean_squared_error(y_train, y_train_pred)
print(f"Mean Squared Error: {mse_value}")

[ 459.50946855  681.8726992  2087.12559972 ... 1163.12855946 3520.92142374
 1284.87716401] [ 923.96585181 1174.0796176  2172.21640755 ... 1391.9003466  2626.82266237
 1635.25060767]
Mean Squared Error: 457406.1246489475


## Generating output file in valid form

In [106]:
ids = np.array(list(range(len(y_test_pred))))
df_out = pd.DataFrame({'Id': ids, 'Category': y_test_pred})

df_out.to_csv(test_target_file_path, index=False)

print(f"CSV file '{test_target_file_path}' created successfully.")

CSV file 'sample2.csv' created successfully.
