-
Notifications
You must be signed in to change notification settings - Fork 1
/
train_score.py
79 lines (60 loc) · 2.37 KB
/
train_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
import json
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from future_encoders import ColumnTransformer
from future_encoders import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
# SELECTOR
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of columns from its input.

    The column labels are supplied at construction time; ``transform``
    indexes the input with them and returns the ``.values`` of the result.
    """

    def __init__(self, attribute_names):
        # Column labels used to index the input in ``transform``.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Do nothing and return this instance; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        subset = X[self.attribute_names]
        return subset.values
def file_exists(file):
    """Report whether ``file`` refers to an existing path.

    Thin wrapper around ``os.path.exists``; returns its result unchanged.
    """
    present = os.path.exists(file)
    return present
def load_data(file, cache_path="Data/data.pkl"):
    """Load activity entries from a JSON file into a flat DataFrame.

    The JSON document is read as a mapping whose values are sequences of
    "day" mappings. Each day maps an activity name to a mapping holding
    parallel lists under the keys ``"score"``, ``"time"``, ``"mood"``,
    ``"weather"`` and ``"temperature"``. Every list position becomes one
    row with the columns ``activity, score, time, mood, weather,
    temperature``.

    Args:
        file: Path of the JSON file to parse. It is only opened when no
            cached pickle is available.
        cache_path: Path of the pickle cache. If a file exists there it is
            returned as-is and ``file`` is not read; otherwise the freshly
            built frame is written to it. Pass ``None`` to disable caching.
            Defaults to ``"Data/data.pkl"``.

    Returns:
        A ``pandas.DataFrame`` with one row per recorded entry. The column
        labels are always set, even when no entries are found.

    Raises:
        IndexError: If one of the per-activity lists is shorter than the
            ``"time"`` list.
        KeyError: If an activity mapping lacks one of the expected keys.
    """
    columns = ["activity", "score", "time", "mood", "weather", "temperature"]
    value_fields = columns[1:]

    if cache_path is not None:
        try:
            # NOTE(security): unpickling can execute arbitrary code. Only
            # point cache_path at files this program wrote itself.
            return pd.read_pickle(cache_path)
        except FileNotFoundError:
            # No cache yet: fall through and build it from the JSON source.
            pass

    # Context manager so the handle is closed even if parsing fails.
    with open(file) as handle:
        data = json.load(handle)

    rows = []
    for user in data.values():
        for day in user:
            for activity, entries in day.items():
                # The "time" list defines how many entries exist. The other
                # lists are indexed (not zipped) so that a shorter list
                # raises instead of silently truncating the data.
                for i in range(len(entries["time"])):
                    rows.append(
                        [activity] + [entries[name][i] for name in value_fields]
                    )

    # Plain row lists avoid allocating one pandas Series per entry.
    df = pd.DataFrame(rows, columns=columns)

    if cache_path is not None:
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            # Make sure the target directory exists before writing.
            os.makedirs(cache_dir, exist_ok=True)
        df.to_pickle(cache_path)
    return df
if __name__ == "__main__":
    # Build the flat table of entries (or reuse the pickled copy).
    df = load_data("Data/data.json")

    # "score" is the regression target; every other column is a feature.
    y = df["score"].copy()
    features = df.drop("score", axis=1)

    # Categorical columns are named explicitly; whatever is left after
    # dropping them is treated as numeric.
    cat_attribs = ["mood", "weather", "activity"]
    numeric_values = features.drop(cat_attribs, axis=1)  # drop() returns a new frame
    num_attribs = list(numeric_values)

    # Numeric branch: column selection, median imputation, standard scaling.
    numeric_steps = [
        ('selector', DataFrameSelector(num_attribs)),  # project-defined transformer
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
    num_pipeline = Pipeline(numeric_steps)

    # Route numeric columns through the pipeline above and one-hot encode
    # the categorical ones.
    column_branches = [
        ("num_pipline", num_pipeline, num_attribs),
        ("cat_pipline", OneHotEncoder(), cat_attribs),
    ]
    full_pipeline = ColumnTransformer(column_branches)

    # Fit the preprocessing on the features and produce the model input.
    X = full_pipeline.fit_transform(features)

    # Train a random forest with default hyperparameters on the result.
    forest_reg = RandomForestRegressor()
    forest_reg.fit(X, y)