In [14]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [15]:
# Load the cleaned dataset from disk, then show each column's dtype.
source_csv = "cleaned_data.csv"
df = pd.read_csv(source_csv)
df.dtypes

time                    object
Dishwasher             float64
Home office            float64
Fridge                 float64
Wine cellar            float64
Garage door            float64
Barn                   float64
Well                   float64
Microwave              float64
Living room            float64
temperature            float64
humidity               float64
visibility             float64
apparentTemperature    float64
pressure               float64
windSpeed              float64
cloudCover             float64
windBearing            float64
precipIntensity        float64
dewPoint               float64
precipProbability      float64
Furnace                float64
Kitchen                float64
year                     int64
month                    int64
day                      int64
weekday                 object
weekofyear               int64
hour                     int64
minute                   int64
timing                  object
use                    float64
gen                    float64

In [16]:
# Build a two-column table (column name, dtype) and print the rows from
# position 22 onward.
column_names = df.columns
column_dtypes = df.dtypes.values
df_types = pd.DataFrame({'Column': column_names, 'Type': column_dtypes})
print(df_types.iloc[22:])

        Column     Type
22     Kitchen  float64
23        year    int64
24       month    int64
25         day    int64
26     weekday   object
27  weekofyear    int64
28        hour    int64
29      minute    int64
30      timing   object
31         use  float64
32         gen  float64


In [17]:
# Show the full list of column labels in the loaded frame.
df.columns

Index(['time', 'Dishwasher', 'Home office', 'Fridge', 'Wine cellar',
       'Garage door', 'Barn', 'Well', 'Microwave', 'Living room',
       'temperature', 'humidity', 'visibility', 'apparentTemperature',
       'pressure', 'windSpeed', 'cloudCover', 'windBearing', 'precipIntensity',
       'dewPoint', 'precipProbability', 'Furnace', 'Kitchen', 'year', 'month',
       'day', 'weekday', 'weekofyear', 'hour', 'minute', 'timing', 'use',
       'gen'],
      dtype='object')

In [18]:
# Count the distinct values in every column.
df.nunique()

time                   503910
Dishwasher               9595
Home office             23272
Fridge                  13986
Wine cellar             10738
Garage door              4567
Barn                    19906
Well                     6329
Microwave                4386
Living room              7342
temperature              5063
humidity                   86
visibility                705
apparentTemperature      5295
pressure                 3017
windSpeed                1671
cloudCover                 77
windBearing               360
precipIntensity           433
dewPoint                 4931
precipProbability          85
Furnace                 81837
Kitchen                 13492
year                        1
month                      12
day                        31
weekday                     7
weekofyear                 51
hour                       24
minute                     60
timing                      4
use                    130103
gen                     29904
dtype: int64

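Feature selection: rank the numeric columns by LightGBM feature importance for predicting `use`, then keep the highest-ranked ones.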

In [19]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503910 entries, 0 to 503909
Data columns (total 33 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   time                 503910 non-null  object 
 1   Dishwasher           503910 non-null  float64
 2   Home office          503910 non-null  float64
 3   Fridge               503910 non-null  float64
 4   Wine cellar          503910 non-null  float64
 5   Garage door          503910 non-null  float64
 6   Barn                 503910 non-null  float64
 7   Well                 503910 non-null  float64
 8   Microwave            503910 non-null  float64
 9   Living room          503910 non-null  float64
 10  temperature          503910 non-null  float64
 11  humidity             503910 non-null  float64
 12  visibility           503910 non-null  float64
 13  apparentTemperature  503910 non-null  float64
 14  pressure             503910 non-null  float64
 15  windSpeed        

In [20]:
# Keep only the int64/float64 columns; these are the ones passed to LightGBM.
numeric_dtypes = ['int64', 'float64']
df_numeric = df.select_dtypes(include=numeric_dtypes)


In [21]:
# Choose the prediction target: 'use' (total energy usage). Every other
# numeric column becomes a model feature.
target_column = 'use'
y = df_numeric[target_column]
X = df_numeric.drop(columns=[target_column])

In [22]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split),
# then fit a LightGBM regressor with default hyperparameters on the rest.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 403128, number of used features: 28
[LightGBM] [Info] Start training from score 0.859949




In [23]:
# Pair each feature name with the fitted model's importance score and rank
# them from most to least important. The top entries are picked in a later cell.
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})
importance_df = feature_scores.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
4,Garage door,333
6,Well,260
20,Furnace,257
21,Kitchen,208
5,Barn,205
8,Living room,192
1,Home office,169
28,gen,163
7,Microwave,146
26,hour,133


In [24]:
# Take the names of the 8 highest-ranked features as a plain Python list.
ranked_feature_names = importance_df['Feature']
top8_features = ranked_feature_names.head(8).tolist()
print("En önemli özellikler:", top8_features)

En önemli özellikler: ['Garage door', 'Well', 'Furnace', 'Kitchen', 'Barn', 'Living room', 'Home office', 'gen']


In [25]:
# Keep the selected features plus the target ('use') and the timestamp
# column ('time'), then write the reduced table to CSV without the index.
selected_columns = [*top8_features, 'use', 'time']
df_reduced = df[selected_columns]
df_reduced.to_csv("reduced_data.csv", index=False)

In [26]:
# Display the reduced frame (selected features, 'use' and 'time') as the cell output.
df_reduced

Unnamed: 0,Garage door,Well,Furnace,Kitchen,Barn,Living room,Home office,gen,use,time
0,0.013083,0.001017,0.082617,0.000567,0.031350,0.001517,0.442633,0.003483,0.932833,2016-01-01 05:00:00
1,0.013117,0.001017,0.084533,0.000567,0.031500,0.001650,0.444067,0.003467,0.934333,2016-01-01 05:01:00
2,0.013083,0.001000,0.083017,0.000617,0.031517,0.001650,0.446067,0.003467,0.931817,2016-01-01 05:02:00
3,0.013000,0.001017,0.175417,0.000650,0.031500,0.001617,0.446583,0.003483,1.022050,2016-01-01 05:03:00
4,0.012783,0.001017,0.300917,0.000783,0.031500,0.001583,0.446533,0.003467,1.139400,2016-01-01 05:04:00
...,...,...,...,...,...,...,...,...,...,...
503905,0.013483,0.000983,0.727683,0.000633,0.032283,0.000967,0.041783,0.003183,1.601233,2016-12-16 03:25:00
503906,0.013433,0.000950,0.729050,0.000600,0.032200,0.000933,0.041750,0.003233,1.599333,2016-12-16 03:26:00
503907,0.012933,0.001000,1.060117,0.000600,0.032283,0.001000,0.042033,0.003217,1.924267,2016-12-16 03:27:00
503908,0.012817,0.000950,1.116033,0.000650,0.032183,0.000950,0.042100,0.003217,1.978200,2016-12-16 03:28:00
