In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Step 1: データの読み込み
status_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/status.csv')
trip_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/trip.csv')
weather_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/weather.csv')

# Step 2: 必要なカラムの抽出
# 天気データの日付と時間を抽出
weather_data['date'] = pd.to_datetime(weather_data['date'])
weather_data['year'] = weather_data['date'].dt.year
weather_data['month'] = weather_data['date'].dt.month
weather_data['day'] = weather_data['date'].dt.day

# trip_data の特徴量を作成
trip_data['start_date'] = pd.to_datetime(trip_data['start_date'])
trip_data['year'] = trip_data['start_date'].dt.year
trip_data['month'] = trip_data['start_date'].dt.month
trip_data['day'] = trip_data['start_date'].dt.day
trip_data['hour'] = trip_data['start_date'].dt.hour

# 出発数と到着数のカウント
start_count = trip_data.groupby(['year', 'month', 'day', 'hour', 'start_station_id']).size().reset_index(name='start_number')
end_count = trip_data.groupby(['year', 'month', 'day', 'hour', 'end_station_id']).size().reset_index(name='end_number')

# Step 3: データのマージ
# status_data と weather_data のマージ
merged_data = pd.merge(status_data, weather_data, on=['year', 'month', 'day'], how='left')

# start_count と end_count を merged_data にマージ
merged_data = pd.merge(merged_data, start_count, left_on=['year', 'month', 'day', 'hour', 'station_id'], right_on=['year', 'month', 'day', 'hour', 'start_station_id'], how='left')
merged_data = pd.merge(merged_data, end_count, left_on=['year', 'month', 'day', 'hour', 'station_id'], right_on=['year', 'month', 'day', 'hour', 'end_station_id'], how='left')

# 欠損値を0で埋める
merged_data.fillna(0, inplace=True)

# Step 4: モデル訓練のためのデータ作成
# predict が 0 のデータを使って学習データを作成
train_data = merged_data[merged_data['predict'] == 0]

# 使用する特徴量
features = ['start_number', 'end_number', 'mean_temperature', 'mean_humidity', 'mean_wind_speed']

# 入力データとターゲットを分ける
X = train_data[features]
y = train_data['bikes_available']

# Step 5: モデルの訓練
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# モデルの評価
y_pred = model.predict(X_test)
print('RMSE:', mean_squared_error(y_test, y_pred, squared=False))

# Step 6: 予測の実行
# predict が 1 のデータに対して bikes_available を予測
test_data = merged_data[merged_data['predict'] == 1]
X_test_predict = test_data[features]

test_data['predicted_bikes_available'] = model.predict(X_test_predict)

# 予測結果を確認
print(test_data[['year', 'month', 'day', 'hour', 'station_id', 'predicted_bikes_available']])

# 必要であれば結果を保存
test_data[['year', 'month', 'day', 'hour', 'station_id', 'predicted_bikes_available']].to_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/predicted_bikes_available.csv', index=False)


RMSE: 3.770713420592861


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_bikes_available'] = model.predict(X_test_predict)


         year  month  day  hour  station_id  predicted_bikes_available
8761     2014      9    1     1           0                   8.631628
8762     2014      9    1     2           0                   8.631628
8763     2014      9    1     3           0                   8.631628
8764     2014      9    1     4           0                   8.631628
8765     2014      9    1     5           0                   8.631628
...       ...    ...  ...   ...         ...                        ...
1226347  2015      8   29    19          69                   8.550921
1226348  2015      8   29    20          69                   8.550921
1226349  2015      8   29    21          69                   8.550921
1226350  2015      8   29    22          69                   8.550921
1226351  2015      8   29    23          69                   8.550921

[193200 rows x 6 columns]


In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Step 1: データを読み込む
status_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/status.csv')
trip_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/trip.csv')
weather_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/weather.csv')

# Step 2: 必要なカラムの抽出と処理
# 天気データの日付と時間を抽出
weather_data['date'] = pd.to_datetime(weather_data['date'])
weather_data['year'] = weather_data['date'].dt.year
weather_data['month'] = weather_data['date'].dt.month
weather_data['day'] = weather_data['date'].dt.day

# trip_data の特徴量を作成
trip_data['start_date'] = pd.to_datetime(trip_data['start_date'])
trip_data['year'] = trip_data['start_date'].dt.year
trip_data['month'] = trip_data['start_date'].dt.month
trip_data['day'] = trip_data['start_date'].dt.day
trip_data['hour'] = trip_data['start_date'].dt.hour

# 出発数と到着数のカウント
start_count = trip_data.groupby(['year', 'month', 'day', 'hour', 'start_station_id']).size().reset_index(name='start_number')
end_count = trip_data.groupby(['year', 'month', 'day', 'hour', 'end_station_id']).size().reset_index(name='end_number')

# Step 3: データのマージ
# status_data と weather_data のマージ
merged_data = pd.merge(status_data, weather_data, on=['year', 'month', 'day'], how='left')

# start_count と end_count を merged_data にマージ
merged_data = pd.merge(merged_data, start_count, left_on=['year', 'month', 'day', 'hour', 'station_id'], right_on=['year', 'month', 'day', 'hour', 'start_station_id'], how='left')
merged_data = pd.merge(merged_data, end_count, left_on=['year', 'month', 'day', 'hour', 'station_id'], right_on=['year', 'month', 'day', 'hour', 'end_station_id'], how='left')

# 欠損値を0で埋める
merged_data.fillna(0, inplace=True)

# Step 4: モデル訓練のためのデータ作成
# predict が 0 のデータを使って学習データを作成
train_data = merged_data[merged_data['predict'] == 0]

# 使用する特徴量
features = ['start_number', 'end_number', 'mean_temperature', 'mean_humidity', 'mean_wind_speed']

# 入力データとターゲットを分ける
X = train_data[features]
y = train_data['bikes_available']

# Step 5: モデルの訓練
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# モデルの評価
y_pred = model.predict(X_test)
print('RMSE:', mean_squared_error(y_test, y_pred, squared=False))

# Step 6: 予測の実行
# predict が 1 のデータに対して bikes_available を予測
test_data = merged_data[merged_data['predict'] == 1]
X_test_predict = test_data[features]

test_data['predicted_bikes_available'] = model.predict(X_test_predict)

# Step 7: predicted_bikes_available を整数に変換
test_data['predicted_bikes_available'] = test_data['predicted_bikes_available'].round().astype(int)

# Step 8: id と predicted_bikes_available のみを保存、ヘッダーなし
output_data = test_data[['id', 'predicted_bikes_available']]
output_data.to_csv('/home-asustor/teramoto/SIGNATE/SOTA_Challenge/Share_Cycle/data/predicted_bikes_available.csv', index=False, header=False)

# 保存後の確認（最初の数行を表示）
print(output_data.head())


RMSE: 3.770713420592861


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_bikes_available'] = model.predict(X_test_predict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_bikes_available'] = test_data['predicted_bikes_available'].round().astype(int)


        id  predicted_bikes_available
8761  8761                          9
8762  8762                          9
8763  8763                          9
8764  8764                          9
8765  8765                          9
