In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import metrics

import pickle

In [2]:
# Load the cleaned dataset from the sibling data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")

In [3]:
# Separate the regression target from the remaining feature columns.
target_column = "Cycle length(days)"

y = df[target_column].values
X = df.drop(columns=[target_column])

In [4]:
# Hold out 33% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=13,
)

In [5]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split.
sc = StandardScaler()

X_train = sc.fit(X_train).transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Reduce the scaled features to 10 principal components. The projection is
# learned from the training split and then reused on the test split.
pca = PCA(n_components=10)

# Right-hand side is evaluated left to right, so the fit happens before the
# test-split transform.
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [7]:
# Fit a linear regression model on the PCA-reduced training data.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Serialize the fitted linear regression model to disk with pickle, then load
# it back and predict with the restored copy to check the round trip.
filename = 'finalized_linear_regression.sav'

# The context manager closes (and therefore flushes) the file before it is
# reopened for reading below.
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

y_pred = loaded_model.predict(X_test)

In [9]:
# Convert each prediction to an integer number of days, then score the result
# against the held-out targets with RMSE and R^2.
# NOTE(review): int() truncates toward zero instead of rounding to the nearest
# integer - confirm that truncation is the intended behaviour.
y_pred = list(map(int, y_pred))

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print(rmse, r2, sep="\n")

1.710471948239516
-0.257437970135848


In [10]:
# Repeat the scale -> PCA -> linear regression pipeline over 542 different
# random train/test splits (seeds 0..541) and accumulate the per-split RMSE.
# NOTE(review): each iteration overwrites X_train, X_test, y_train, y_test, sc,
# pca, lr, y_pred and rmse, so the values from the cells above are lost.
# NOTE(review): PCA keeps a single component here - confirm that this is
# intentional.
average_rmse = 0
for seed in range(542):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=seed
    )

    # Scaling is fitted on the training split only.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    pca = PCA(n_components=1)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # int() truncates each prediction toward zero before scoring.
    y_pred = list(map(int, lr.predict(X_test)))
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    average_rmse = average_rmse + rmse

In [11]:
# Divide the accumulated RMSE by the 542 splits to report the mean RMSE.
mean_rmse = average_rmse / 542
print(mean_rmse)

1.5995378693133153
