In [5]:
import numpy as np
import pandas as pd
import scipy.optimize as op
from pathlib import Path
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
import warnings
warnings.filterwarnings('ignore') # 実行に影響のない　warninig　を非表示にします. 非推奨.

In [6]:
#リッジ回帰を実装
class MyRegression(BaseEstimator, RegressorMixin):
    def __init__(self, lam=0):
        """
        Initialize a coefficient and an intercept.
        """
        self.lam = lam

    def fit(self, X, y):
        """
        X: array-like, shape (n_samples, n_features)
        y: array, shape (n_samples,)
        Estimate a coefficient and an intercept from data.
        """
        X, y = check_X_y(X, y, y_numeric=True)

        #adds a dummy intercept
        X_ = np.append(X, [[1] for i in range(len(X))], axis=1)
        #coef to tweak
        coef = np.array([1 for i in range(len(X_[0]))])

        #function to minimize
        def f(params):
            return ((y-np.matmul(X_, params)) ** 2).sum()+self.lam*(params**2).sum()

        #tweak coef, minimize function
        res = op.minimize(f, coef)

        self.coef_ = res.x[:-1]
        self.intercept_ = res.x[-1]
        return self

    def predict(self, X):
        """
        Calc y from X
        """
        check_is_fitted(self, "coef_", "intercept_")  # 学習済みかチェックする(推奨)
        X = check_array(X)
        X_ = np.append(X, [[1] for i in range(len(X))], axis=1)
        y = np.matmul(X_, np.append(self.coef_,self.intercept_))
        return y


In [7]:
def prep_x(X_df):
    #change 最多風向 to each own column.0-1
    X=pd.concat([X_df,pd.get_dummies(X_df["最多風向"])],axis=1)
    X=X_df.drop("最多風向",axis=1).values
    return X

In [8]:
data_dir=Path("../data/")
X_train=prep_x(pd.read_csv(data_dir / "X_train.csv"))
X_test=prep_x(pd.read_csv(data_dir / "X_test.csv"))
y_train=np.ravel(pd.read_csv(data_dir / "y_train.csv").values)

In [9]:
#一番下のセルgridsearchにより、最適なlamは1820
clf=MyRegression(lam=1820)
clf.fit(X_train,y_train)
print(clf.coef_)
print(clf.intercept_)

[ 0.04900464  0.07584589  0.02897993 -0.00439623  0.00011427  0.00662473
  0.0140017   0.04390732 -0.06508506 -0.10261269  0.00647107  0.00623762
 -0.00295842]
6.905452859658953e-05


In [10]:
y_predict=np.round(clf.predict(X_test),1)
ans_dict={"placeholder":y_predict}
ans_df=pd.DataFrame(ans_dict)
ans_df.to_csv(data_dir/"y_pred.csv",index=False)


In [11]:
#ここで以上。以下はグリッドサーチ等。グリッドサーチはryzen3950xで30分。

In [6]:
# Grid search of Ridge()
parameters = {'alpha':np.arange(-5000,5000)}
print(parameters)
reg = GridSearchCV(Ridge(),parameters,cv=5)
reg.fit(X_train,y_train)
best = reg.best_estimator_
# 決定係数
print("決定係数: ", best.score(X_train, y_train)) # BaseEstimatorを継承しているため使える
# lambda
print("lam: ", best.alpha)

{'alpha': array([-5000, -4999, -4998, ...,  4997,  4998,  4999])}
決定係数:  0.6360663087395901
lam:  2096


In [39]:
# Grid search of MyLinearRegression
parameters = {'lam':np.arange(-5000,5000)}
print(parameters)
reg = GridSearchCV(MyRegression(),parameters,cv=5)
reg.fit(X_train,y_train)
best = reg.best_estimator_
# 決定係数
print("決定係数: ", best.score(X_train, y_train)) # BaseEstimatorを継承しているため使える
# lambda 1820
print("lam: ", best.lam)

{'lam': array([-5000, -4999, -4998, ...,  4997,  4998,  4999])}
決定係数:  0.610779339360711
lam:  1820


In [8]:
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X_pca=pca.fit_transform(X_train)
print(X_pca.shape)
print(pca.explained_variance_ratio_)
print(pca.components_)

(28, 2)
[0.58387849 0.27085845]
[[ 0.01676688  0.03082961  0.0143759   0.93738159  0.01972845  0.06350653
   0.14646301  0.0549713   0.10304277  0.02650805 -0.11122719 -0.11168056
  -0.23413121]
 [ 0.07622884  0.11850467  0.04265574  0.03057731  0.01751759  0.05344412
   0.10303293 -0.05285351 -0.55808295 -0.79890643 -0.0221221  -0.02302409
  -0.10045567]]
