In [6]:
# Mount Google Drive and confirm that the raw weekly climate CSV is reachable.
from google.colab import drive
drive.mount('/content/drive')

import os, glob

# Expected location of the file (note the space between "Bloom" and "Watch").
path = "/content/drive/MyDrive/Bloom Watch/usa_11_regional_weekly.csv"

print("Exists? ", os.path.exists(path))
!ls -lh "/content/drive/MyDrive/Bloom Watch/" | sed -n '1,20p'  # show the first 20 lines of the folder listing

# Troubleshooting aid for when the check above prints False: search MyDrive
# recursively for a matching file name and print up to 10 candidate paths.
matches = glob.glob('/content/drive/MyDrive/**/*usa_11*weekly*.csv', recursive=True)
print("Candidates found:", matches[:10])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Exists?  True
total 153M
-rw------- 1 root root 153M Oct  1 14:30 usa_11_regional_weekly.csv
Candidates found: ['/content/drive/MyDrive/Bloom Watch/usa_11_regional_weekly.csv']


In [7]:
import pandas as pd

CLIMATE_PATH = path

# Try the fast pyarrow parser first; if it is unavailable or fails, report the
# reason and fall back to pandas' default engine.
try:
    df = pd.read_csv(CLIMATE_PATH, engine="pyarrow")
except Exception as e:
    print("pyarrow not used ->", e)
    df = pd.read_csv(CLIMATE_PATH, low_memory=False)

# Normalise Year: coerce to a nullable integer, drop rows whose year could not
# be parsed, then store the column as plain int.
df = (
    df.assign(Year=pd.to_numeric(df['Year'], errors='coerce').astype('Int64'))
      .dropna(subset=['Year'])
      .astype({'Year': int})
)

# Subset used by the rest of the notebook: 2013 and later, index renumbered.
is_2013_or_later = df['Year'] >= 2013
df_2013on = df.loc[is_2013_or_later].reset_index(drop=True)

print("All years shape:", df.shape, " |  >=2013 shape:", df_2013on.shape)
print("Years kept:", sorted(df_2013on['Year'].unique()))
df_2013on.head(2)


All years shape: (6240, 1618)  |  >=2013 shape: (1600, 1618)
Years kept: [np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]


Unnamed: 0,Unnamed: 1,region,Year,lat,lng,alt,T2M_1,T2M_2,T2M_3,T2M_4,...,ET0_43,ET0_44,ET0_45,ET0_46,ET0_47,ET0_48,ET0_49,ET0_50,ET0_51,ET0_52
0,4640,usa_11,2013,34.25,-124.75,0.0,13.36375,12.487143,12.408571,13.101429,...,6.668158,8.902141,7.834474,8.136691,7.376452,7.561631,7.634166,6.875637,7.645561,5.986468
1,4641,usa_11,2013,34.25,-124.25,0.0,13.31375,12.511429,12.522857,13.207143,...,6.675981,8.766821,7.763065,7.898457,7.047861,7.252497,7.59804,6.834809,7.521547,6.002999


In [8]:
# Persist the >=2013 subset to a "processed" folder on Drive.
out_dir = "/content/drive/MyDrive/Bloom Watch/processed"
os.makedirs(out_dir, exist_ok=True)  # create the folder if needed; no error when it already exists
out_path = f"{out_dir}/usa_11_2013on.csv"

df_2013on.to_csv(out_path, index=False)  # index=False: the row index is not written as a column
# Show the size of the file that was just written ($out_path is expanded by IPython).
!ls -lh "$out_path"
print("Saved to:", out_path)


-rw------- 1 root root 40M Oct  2 04:10 '/content/drive/MyDrive/Bloom Watch/processed/usa_11_2013on.csv'
Saved to: /content/drive/MyDrive/Bloom Watch/processed/usa_11_2013on.csv


In [14]:
import pandas as pd

# Load the full raw climate table from Drive.
path = "/content/drive/MyDrive/Bloom Watch/usa_11_regional_weekly.csv"
df = pd.read_csv(path, low_memory=False)

# Keep only rows from 2013 onward and renumber the index.
recent_rows = df['Year'] >= 2013
df = df.loc[recent_rows].reset_index(drop=True)

# Identifier columns carried along with the weekly measurements.
id_cols = ['region','Year','lat','lng','alt']

# Weekly columns are selected by their "_<week>" suffix: weeks 5 (roughly
# February) through 26 (June). str.endswith accepts a tuple of suffixes.
wanted_suffixes = tuple(f"_{i}" for i in range(5,27))
week_cols = [col for col in df.columns if col.endswith(wanted_suffixes)]

# Identifier columns first, then the selected weekly columns in file order.
df_feb_jun = df[id_cols + week_cols]

print("شكل الداتا:", df_feb_jun.shape)
df_feb_jun.head()


شكل الداتا: (1600, 687)


Unnamed: 0,region,Year,lat,lng,alt,T2M_5,T2M_6,T2M_7,T2M_8,T2M_9,...,ET0_17,ET0_18,ET0_19,ET0_20,ET0_21,ET0_22,ET0_23,ET0_24,ET0_25,ET0_26
0,usa_11,2013.0,34.25,-124.75,0.0,12.137143,12.02,11.998571,11.287143,11.824286,...,7.108712,9.313808,7.822312,10.365467,9.982298,10.725263,8.203957,10.154304,11.056842,10.430807
1,usa_11,2013.0,34.25,-124.25,0.0,12.162857,11.972857,11.98,11.308571,11.827143,...,7.133128,9.246187,7.967451,10.419891,9.986363,10.721997,8.011925,10.039177,11.060167,10.450535
2,usa_11,2013.0,34.25,-123.75,0.0,12.182857,11.872857,11.82,11.25,11.797143,...,7.430874,9.108749,8.472792,10.483122,10.192069,10.737365,7.425304,9.812467,11.130564,10.497341
3,usa_11,2013.0,34.25,-123.25,0.0,12.162857,11.721429,11.701429,11.182857,11.787143,...,7.51874,9.074389,8.684303,10.535116,10.141364,10.633518,7.311439,9.586233,11.063182,10.330264
4,usa_11,2013.0,34.25,-122.75,0.0,12.192857,11.628571,11.745714,11.215714,11.894286,...,7.853596,8.965057,8.907087,10.592386,10.478814,10.420541,7.06082,9.644226,11.010072,10.25674


In [15]:
import os

# Output folder on Drive (created on first use).
out_dir = "/content/drive/MyDrive/Bloom Watch/processed"
os.makedirs(out_dir, exist_ok=True)

# Two destinations for the same wide table: a persistent copy on Drive and a
# copy on the Colab runtime disk that serves as the source for the download.
out_file = f"{out_dir}/usa11_2013_FebJun_WIDE.csv"
local_file = "/content/usa11_2013_FebJun_WIDE.csv"

df_feb_jun.to_csv(out_file, index=False)
print("Saved to Drive:", out_file)

df_feb_jun.to_csv(local_file, index=False)

# Send the local copy to the browser as a download.
from google.colab import files
files.download(local_file)


Saved to Drive: /content/drive/MyDrive/Bloom Watch/processed/usa11_2013_FebJun_WIDE.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
import pandas as pd

# Input: df_feb_jun, the wide >=2013 table restricted to weeks 5-26.

# Grouping keys; every remaining non-coordinate column is a weekly measurement.
id_cols = ['region','Year']
non_value_cols = set(id_cols) | {'lat', 'lng', 'alt'}
value_cols = [c for c in df_feb_jun.columns if c not in non_value_cols]

# 1) Collapse the individual grid points (different lat/lng) into a single
#    mean row per region and year.
per_region_year = df_feb_jun.groupby(id_cols)
df_avg = per_region_year[value_cols].mean().reset_index()

print("الشكل الجديد:", df_avg.shape)
df_avg.head()


الشكل الجديد: (10, 684)


Unnamed: 0,region,Year,T2M_5,T2M_6,T2M_7,T2M_8,T2M_9,T2M_10,T2M_11,T2M_12,...,ET0_17,ET0_18,ET0_19,ET0_20,ET0_21,ET0_22,ET0_23,ET0_24,ET0_25,ET0_26
0,usa_11,2013.0,8.552875,7.112848,8.693527,6.503661,9.677884,8.189571,12.519375,10.568804,...,8.457467,9.417206,8.132639,10.06768,9.979793,10.443776,9.902529,10.623353,10.893072,10.87028
1,usa_11,2014.0,9.212161,8.032893,11.627045,10.856196,10.834634,11.897813,12.338429,11.659009,...,8.186542,9.736221,9.068104,10.550656,9.241726,10.842727,10.415636,10.808921,11.033112,11.390742
2,usa_11,2015.0,11.529054,13.044455,13.238214,12.036295,9.901687,11.104545,14.250196,14.006482,...,7.923368,9.268025,8.363763,8.1236,8.034412,9.109137,9.450163,9.463978,11.686279,10.956639
3,usa_11,2016.0,7.649866,12.889402,11.535768,12.399536,12.708714,10.236286,12.550455,11.564821,...,8.46122,6.805892,8.970114,9.34688,9.068754,10.123995,10.048,10.447644,12.351965,11.025886
4,usa_11,2017.0,9.964063,10.605286,9.972884,6.978857,7.907616,10.969304,13.847304,10.947304,...,8.921336,9.071349,8.860767,9.134633,8.763019,10.061426,9.880747,11.11315,11.049538,11.167872


In [17]:
# Write the per-year weekly averages to the Colab runtime disk.
out_file = "/content/usa11_2013_FebJun_weekly_AVG.csv"
df_avg.to_csv(path_or_buf=out_file, index=False)

# Send that file to the browser as a download.
from google.colab import files
files.download(out_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
import re
import pandas as pd
from datetime import timedelta

# To start from the saved file instead of the in-memory frame:
# df_avg = pd.read_csv("/content/usa11_2013_FebJun_weekly_AVG.csv")

# 1) Melt the wide table and split every column name "<metric>_<week>" into
#    its metric and week parts. The lazy ".+?" together with the "$" anchor
#    keeps underscores inside the metric name (e.g. "T2M_MAX_12" -> metric
#    "T2M_MAX", week 12).
WEEK_RE = re.compile(r'^(?P<metric>.+?)_(?P<wk>\d{1,2})$')
id_cols = ['region','Year']
value_cols = [c for c in df_avg.columns if c not in id_cols]

long = df_avg.melt(id_vars=id_cols, value_vars=value_cols,
                   var_name='var', value_name='value')
parts = long['var'].str.extract(WEEK_RE)
long['metric'] = parts['metric']
long['week']   = parts['wk'].astype('Int64')
# Column names that do not match the pattern end up with week = <NA> and are dropped.
long = long.dropna(subset=['week']).drop(columns=['var'])
long['Year'] = long['Year'].astype(int)

# 2) Keep only the wanted weeks (5 -> 26, i.e. February through June).
#    .copy() makes the slice an independent frame so the column assignment
#    below never writes into a view of the unfiltered data.
long = long[(long['week'] >= 5) & (long['week'] <= 26)].copy()

# 3) Turn (Year, week) into the date of that week's Friday.
#    ISO directives: %G = ISO year, %V = ISO week, %u = weekday (1=Mon ... 5=Fri).
#    The "YYYY-Www-5" strings are built with vectorized string operations
#    instead of a row-wise apply, which also works when `long` has no rows.
iso_friday = (long['Year'].astype(str).str.zfill(4)
              + '-W' + long['week'].astype(str).str.zfill(2)
              + '-5')
long['date'] = pd.to_datetime(iso_friday, format='%G-W%V-%u', errors='coerce')

# 4) Pivot: one row per (region, Year, week, date), one column per metric.
weekly = (long
          .pivot_table(index=['region','Year','week','date'],
                       columns='metric', values='value')
          .reset_index())

# 5) Chronological order.
weekly = weekly.sort_values(['Year','week']).reset_index(drop=True)

weekly.head(10)


metric,region,Year,week,date,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_SW_DWN,ALLSKY_SRF_ALB,AOD_55,CDD18_3,...,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,TO3,VAP,VPD,WD2M,WS2M,Z0M
0,usa_11,2013,5,2013-02-01,289.118071,59.863,11.97125,0.129643,0.055357,4.5e-05,...,3.408116,5.980455,12.75733,5.508964,293.759196,1.152817,1.151943,237.133107,3.227634,0.189268
1,usa_11,2013,6,2013-02-08,280.3445,64.28275,12.946571,0.131393,0.053643,0.0,...,2.023009,4.567839,11.145554,4.091295,314.104205,1.053436,1.052634,270.798536,3.680723,0.190286
2,usa_11,2013,7,2013-02-15,275.704536,76.243571,15.454214,0.133821,0.06175,0.000411,...,2.062589,5.377804,13.542714,4.995509,304.910295,1.166708,1.16589,208.092812,2.758304,0.190929
3,usa_11,2013,8,2013-02-22,277.688071,75.277714,15.123107,0.127929,0.075286,0.0,...,1.08533,3.794661,10.539232,3.286786,330.923304,1.007404,1.006652,275.196955,4.608384,0.192062
4,usa_11,2013,9,2013-03-01,285.348714,83.46225,16.661571,0.128429,0.095786,0.00725,...,2.988455,6.333125,14.448884,5.966259,293.745277,1.23607,1.2352,241.642179,3.3185,0.193464
5,usa_11,2013,10,2013-03-08,291.744786,81.096214,16.073786,0.128536,0.115214,0.0,...,3.208857,5.699393,12.276937,4.976402,345.291063,1.116535,1.115681,246.458661,3.70492,0.194545
6,usa_11,2013,11,2013-03-15,301.282107,93.120821,18.476321,0.124714,0.107286,0.161464,...,4.888866,8.704205,17.786527,8.283759,279.219027,1.478649,1.477677,262.713295,3.592304,0.195411
7,usa_11,2013,12,2013-03-22,286.473464,99.1425,19.669464,0.12875,0.117643,0.001679,...,3.203705,6.885991,15.410357,6.506866,291.943509,1.299482,1.298607,269.305313,3.887393,0.19608
8,usa_11,2013,13,2013-03-29,318.151857,92.884643,17.987964,0.123393,0.143143,0.037821,...,6.098018,9.065643,16.528268,8.275241,319.380232,1.42779,1.426741,216.344848,2.509937,0.196768
9,usa_11,2013,14,2013-04-05,317.913607,99.041036,19.157,0.119607,0.149,0.01842,...,7.081625,9.721063,16.924679,8.63617,316.695732,1.457499,1.456387,262.026839,2.91775,0.197473


In [22]:
# Build the complete (region, Year, week) grid for weeks 5..26 so that weeks
# missing from `weekly` still appear as rows (their metrics are NaN after the
# left merge). Only the first region found in `weekly` is used.
if weekly.empty:
    raise ValueError("`weekly` is empty: no region/year available to build the week grid.")

years = sorted(weekly['Year'].unique().tolist())
full_index = pd.MultiIndex.from_product(
    [ [weekly['region'].iloc[0]], years, list(range(5,27)) ],
    names=['region','Year','week']
)
full_df = (pd.DataFrame(index=full_index)
           .reset_index())

# Friday date of each ISO week (%G = ISO year, %V = ISO week, %u = 5 -> Friday),
# built with vectorized string operations rather than a row-wise apply.
iso_friday = (full_df['Year'].astype(str).str.zfill(4)
              + '-W' + full_df['week'].astype(str).str.zfill(2)
              + '-5')
full_df['date'] = pd.to_datetime(iso_friday, format='%G-W%V-%u', errors='coerce')

weekly_full = (full_df
               .merge(weekly, on=['region','Year','week','date'], how='left')
               .sort_values(['Year','week'])
               .reset_index(drop=True))

weekly_to_save = weekly_full  # use `weekly` instead to skip filling in the missing weeks


In [23]:
from google.colab import files
import os

out_dir = "/content/drive/MyDrive/Bloom Watch/processed"
os.makedirs(out_dir, exist_ok=True)

# Export the gap-filled table when it has been defined in this session;
# otherwise fall back to the plain `weekly` table.
if 'weekly_to_save' in globals():
    frame_to_export = weekly_to_save
else:
    frame_to_export = weekly

# Persistent copy on Drive.
out_path = f"{out_dir}/usa11_weekly_AVG_FridayRows_2013_2013-202x.csv"
frame_to_export.to_csv(out_path, index=False)
print("Saved:", out_path)

# Runtime-disk copy, handed to the browser as a download.
local = "/content/usa11_weekly_AVG_FridayRows.csv"
frame_to_export.to_csv(local, index=False)
files.download(local)


Saved: /content/drive/MyDrive/Bloom Watch/processed/usa11_weekly_AVG_FridayRows_2013_2013-202x.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Merging the Two Files (Weekly Climate × Bloom Calendar)

In [27]:
import pandas as pd, numpy as np, os, glob

# ================== 1) File paths ==================
bloom_path = "/content/drive/MyDrive/Bloom Watch/Bloom_Calendar.csv"
clim_candidates = sorted(glob.glob("/content/drive/MyDrive/Bloom Watch/processed/*FridayRows*.csv"))
if not clim_candidates:
    # Fail with an explanation instead of a bare IndexError on [-1].
    raise FileNotFoundError(
        "No '*FridayRows*.csv' file found in '/content/drive/MyDrive/Bloom Watch/processed'; "
        "export the weekly climate table first."
    )
clim_path  = clim_candidates[-1]   # last match in alphabetical order
print("Bloom :", bloom_path)
print("Climate weekly (Fri rows):", clim_path)

# ================== 2) Load and clean ==================
# Bloom calendar
blooms = pd.read_csv(bloom_path)
# Locate the date / stage columns even if their names vary slightly
# (surrounding spaces or letter case).
date_matches  = [c for c in blooms.columns if c.strip().lower()=='date']
stage_matches = [c for c in blooms.columns if 'bloom' in c.lower() and 'stage' in c.lower()]
if not date_matches or not stage_matches:
    raise KeyError(
        "Bloom calendar must contain a 'Date' column and a column whose name includes "
        f"both 'bloom' and 'stage'; found columns: {list(blooms.columns)}"
    )
date_col, stage_col = date_matches[0], stage_matches[0]
blooms = blooms.rename(columns={date_col:'Date', stage_col:'Bloom Stage'})
blooms['Date'] = pd.to_datetime(blooms['Date'], errors='coerce')
blooms = blooms.dropna(subset=['Date']).copy()
blooms['Year'] = blooms['Date'].dt.year.astype(int)
# One record per calendar date (the first one wins).
blooms = (blooms.sort_values(['Year','Date'])
                 .drop_duplicates(subset=['Year','Date'], keep='first')
                 .reset_index(drop=True))

# Weekly climate (one row per week/Friday, metrics as columns)
clim = pd.read_csv(clim_path, low_memory=False)
clim['date'] = pd.to_datetime(clim['date'], errors='coerce')
clim['Year'] = pd.to_numeric(clim['Year'], errors='coerce').astype('Int64')
clim = clim.dropna(subset=['Year','date']).copy()
clim['Year'] = clim['Year'].astype(int)
# Make sure only weeks 5 -> 26 remain.
if 'week' in clim.columns:
    clim = clim[(clim['week']>=5) & (clim['week']<=26)].copy()

# ================== 3) Scaffold: 2013 -> 2025, weeks 5 -> 26, Friday dates ==================
# The year range always covers 2013..2025 and widens if either input goes beyond it.
yr_min = min(2013, int(clim['Year'].min()) if not clim.empty else 2013,
             int(blooms['Year'].min()) if not blooms.empty else 2013)
yr_max = max(int(clim['Year'].max()) if not clim.empty else 2025,
             int(blooms['Year'].max()) if not blooms.empty else 2025, 2025)

scaf = (pd.MultiIndex.from_product([['usa_11'], list(range(yr_min, yr_max+1)), list(range(5,27))],
                                   names=['region','Year','week'])
        .to_frame(index=False))

# Friday date for every year/week (ISO: %G = ISO year, %V = ISO week, %u = 5 -> Friday),
# built with vectorized string operations rather than a row-wise apply.
iso_friday = (scaf['Year'].astype(str).str.zfill(4)
              + '-W' + scaf['week'].astype(str).str.zfill(2)
              + '-5')
scaf['date'] = pd.to_datetime(iso_friday, format='%G-W%V-%u', errors='coerce')

# ================== 4) LEFT JOIN the climate onto the scaffold ==================
# Climate value columns = everything that is not a join key.
key_cols_scaf = ['region','Year','week','date']
clim_keys     = ['region','Year','week','date']
clim_value_cols = [c for c in clim.columns if c not in clim_keys]  # the metrics

scaf_clim = scaf.merge(clim[clim_keys + clim_value_cols], on=clim_keys, how='left')

# ================== 5) Attach the nearest bloom record (merge_asof) ==================
blooms_ren = blooms.rename(columns={'Date':'bloom_date','Bloom Stage':'bloom_stage'})
# merge_asof requires both frames to be sorted by their `on` key, so sort on
# the date columns themselves.
scaf_clim = scaf_clim.sort_values('date')
blooms_ren = blooms_ren.sort_values('bloom_date')

# NOTE(review): Fridays are 7 days apart and the tolerance is 6 days, so one
# bloom record can be the nearest match for two consecutive Fridays.
merged = pd.merge_asof(
    scaf_clim,
    blooms_ren[['Year','bloom_date','bloom_stage']],
    left_on='date',
    right_on='bloom_date',
    by='Year',
    direction='nearest',
    tolerance=pd.Timedelta('6D')  # adjust as needed
)

# Day difference (positive = the scaffold date falls after the bloom date).
merged['days_from_bloom'] = (merged['date'] - merged['bloom_date']).dt.days

# ================== 6) unified_date ==================
# Policies: "climate" (the Friday date), "bloom" (the bloom date) or "earlier".
prefer_policy = "earlier"
def pick_unified(d, b, mode="earlier"):
    """Choose one date from a scaffold date `d` and a bloom date `b`.

    Returns NaT when both are missing and the non-missing one when only one is
    present. Otherwise returns `d` for mode "climate", `b` for mode "bloom",
    and the earlier of the two for any other mode.
    """
    if pd.isna(d) and pd.isna(b): return pd.NaT
    if pd.isna(b): return d
    if pd.isna(d): return b
    if mode=="climate": return d
    if mode=="bloom":   return b
    return min(d, b)

merged['unified_date'] = [pick_unified(d, b, prefer_policy) for d,b in zip(merged['date'], merged['bloom_date'])]
merged['matched_within_tol'] = merged['bloom_date'].notna()

# ================== 7) Order columns and save ==================
out_cols_first = ['region','Year','week','date','bloom_date','bloom_stage',
                  'unified_date','days_from_bloom','matched_within_tol']
other_cols = [c for c in merged.columns if c not in out_cols_first]
out = merged[out_cols_first + other_cols].sort_values(['Year','week']).reset_index(drop=True)

out_dir = "/content/drive/MyDrive/Bloom Watch/processed"
os.makedirs(out_dir, exist_ok=True)
out_path = f"{out_dir}/SCAFFOLD_2013_2025_week5_26_Fri_CLIMATExBLOOM.csv"
out.to_csv(out_path, index=False)
print("Saved:", out_path)

# Local copy for a quick browser download.
local = "/content/SCAFFOLD_2013_2025_week5_26_Fri_CLIMATExBLOOM.csv"
out.to_csv(local, index=False)
from google.colab import files; files.download(local)

# Quick summary. A row counts as "with climate" when at least one of the
# non-key columns holds a value.
print({
    "years": [yr_min, yr_max],
    "rows_total": len(out),
    "rows_with_climate": int(out[~out[other_cols].isna().all(axis=1)].shape[0]),
    "rows_with_bloom": int(out['matched_within_tol'].sum()),
    "note": "سنين 2023–2025 ستظهر بقيم مناخ NaN (مقصودة) مع وجود Bloom حيث يتاح."
})


Bloom : /content/drive/MyDrive/Bloom Watch/Bloom_Calendar.csv
Climate weekly (Fri rows): /content/drive/MyDrive/Bloom Watch/processed/usa11_weekly_AVG_FridayRows_2013_2013-202x.csv
Saved: /content/drive/MyDrive/Bloom Watch/processed/SCAFFOLD_2013_2025_week5_26_Fri_CLIMATExBLOOM.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'years': [2013, 2025], 'rows_total': 286, 'rows_with_climate': 220, 'rows_with_bloom': 164, 'note': 'سنين 2023–2025 ستظهر بقيم مناخ NaN (مقصودة) مع وجود Bloom حيث يتاح.'}


## Completing the Data: Predicting the Missing Climate Values (2023–2025)

In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# --- Load the merged climate x bloom scaffold ---
path = "/content/drive/MyDrive/Bloom Watch/processed/SCAFFOLD_2013_2025_week5_26_Fri_CLIMATExBLOOM.csv"
df = pd.read_csv(path, parse_dates=['date','bloom_date','unified_date'])

# Example target (T2M), predicted from the week number and the bloom stage.
target = 'T2M'
features = ['week','bloom_stage']

# Rows up to 2022 are used for fitting; rows from 2023 on are the ones to predict.
is_history = df['Year'] <= 2022
is_future = df['Year'] >= 2023
train = df.loc[is_history].copy()
test  = df.loc[is_future].copy()

# --- Feature matrices ---
X_train, y_train = train[features].copy(), train[target].copy()
X_pred = test[features].copy()

# One-hot encode the bloom stage; the week number passes through unchanged.
preproc = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['bloom_stage']),
        ('num', 'passthrough', ['week'])
    ]
)

# --- Model: preprocessing followed by a random forest ---
forest = RandomForestRegressor(n_estimators=300, random_state=42)
rf = Pipeline(steps=[('preprocess', preproc), ('model', forest)])
rf.fit(X_train, y_train)

# In-sample error: measured on the same rows the model was fitted on.
train_pred = rf.predict(X_train)
print("MAE على التدريب:", mean_absolute_error(y_train, train_pred))

# Predictions for the future rows.
test[f'{target}_pred'] = rf.predict(X_pred)

# Peek at a few predictions.
test[['Year','week','bloom_stage',f'{target}_pred']].head(10)


MAE على التدريب: 1.2539709558029042


Unnamed: 0,Year,week,bloom_stage,T2M_pred
220,2023,5,,9.738599
221,2023,6,,9.850088
222,2023,7,,9.876755
223,2023,8,,8.492269
224,2023,9,SoB / Superbloom,9.566277
225,2023,10,SoB / Superbloom,8.901578
226,2023,11,SoB,11.376442
227,2023,12,PoB,10.958054
228,2023,13,PoB,11.544496
229,2023,14,PoB,12.006269


In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
# Pipeline is used below; import it here so the cell does not depend on an
# earlier cell having imported it.
from sklearn.pipeline import Pipeline
import pandas as pd

# Predict several climate variables at once from the week number and bloom stage.
# `df` is the merged scaffold table loaded earlier in the notebook.
features = ['week','bloom_stage']
targets  = ['T2M','T2M_MAX','T2M_MIN','PRECTOTCORR','RH2M','VPD']   # edit to match the columns actually present in the table

# Fit on rows up to 2022; predict rows from 2023 on.
train = df[df['Year']<=2022].copy()
test  = df[df['Year']>=2023].copy()

X_tr, y_tr = train[features], train[targets]
X_te        = test[features]

# One-hot encode the bloom stage; the week number passes through unchanged.
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['bloom_stage']),
    ('num', 'passthrough', ['week'])
])

# One independent random forest per target column.
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1))
pipe  = Pipeline([('pre', pre), ('rf', model)])
pipe.fit(X_tr, y_tr)

pred = pipe.predict(X_te)
pred_df = pd.DataFrame(pred, columns=[f'{t}_pred' for t in targets])

# Both sides get a fresh 0..n-1 index so the key columns and the predictions
# line up row by row.
out = pd.concat([test[['Year','date','week','bloom_stage']].reset_index(drop=True), pred_df], axis=1)
out = out.sort_values(['Year','week']).reset_index(drop=True)

# Save
out_path = "/content/drive/MyDrive/Bloom Watch/processed/PRED_weather_from_bloom_2023_2025_multi.csv"
out.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: /content/drive/MyDrive/Bloom Watch/processed/PRED_weather_from_bloom_2023_2025_multi.csv


# **Final Data**

In [30]:
import pandas as pd, numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from pathlib import Path

# ---------- 1) Load the unified scaffold table ----------
BASE = "/content/drive/MyDrive/Bloom Watch/processed"
SCF  = f"{BASE}/SCAFFOLD_2013_2025_week5_26_Fri_CLIMATExBLOOM.csv"
df   = pd.read_csv(SCF, parse_dates=['date','bloom_date','unified_date'])

# Basic cleaning: integer Year, no rows without Year/week, an explicit
# "NoBloom" label for missing stages, plus calendar features from the date.
df = (
    df.assign(Year=pd.to_numeric(df['Year'], errors='coerce').astype('Int64'))
      .dropna(subset=['Year','week'])
      .astype({'Year': int})
)
df = df.assign(
    bloom_stage=df['bloom_stage'].fillna('NoBloom'),
    month=df['date'].dt.month,
    doy=df['date'].dt.dayofyear,
)

# ---------- 2) Pick the climate variables that are available ----------
# Edit this list to match the columns that actually exist.
candidates = [
    'T2M','T2M_MAX','T2M_MIN','PRECTOTCORR','RH2M','VPD',
    'ET0','QV2M','ALLSKY_SFC_SW_DWN','PS'
]
targets = [c for c in candidates if c in df.columns]
assert len(targets)>0, "لا يوجد أعمدة مناخ مطابقة في الجدول."

features = ['week','month','doy','bloom_stage']
key_cols = ['region','Year','week','date','bloom_stage']

# ---------- 3) Split: fit on <=2022, predict >=2023 ----------
train = df[df['Year']<=2022].copy()
future= df[df['Year']>=2023].copy()

X_tr, y_tr = train[features], train[targets]
X_fu        = future[features]

# ---------- 4) Pipeline (one-hot bloom stage + multi-output random forest) ----------
pre  = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['bloom_stage']),
    ('num', 'passthrough', ['week','month','doy'])
])

model = MultiOutputRegressor(RandomForestRegressor(
    n_estimators=400, random_state=42, n_jobs=-1, min_samples_leaf=2
))
pipe  = Pipeline([('pre', pre), ('rf', model)])

pipe.fit(X_tr, y_tr)

# ---------- 5) Predictions for 2023-2025 ----------
pred_fu = pipe.predict(X_fu)
pred_cols = [f"{t}_pred" for t in targets]
# Key columns and predictions share a fresh 0..n-1 index, so they align row by row.
future_pred = pd.concat(
    [future[key_cols].reset_index(drop=True),
     pd.DataFrame(pred_fu, columns=pred_cols)],
    axis=1,
)

# ---------- 6) One table covering all years ----------
# Observed values up to 2022; their *_pred columns stay empty because no
# prediction is needed for the past.
train_out = train[key_cols + targets].assign(**{p: np.nan for p in pred_cols})

# Stack the observed past on top of the predicted future.
master = (
    pd.concat([train_out, future_pred], ignore_index=True)
      .sort_values(['Year','week'])
      .reset_index(drop=True)
)

# ---------- 7) Final column per variable (observed if present, else predicted) ----------
for t in targets:
    master[f"{t}_final"] = master[t].fillna(master[f"{t}_pred"])

# Column order: keys first, then observed / pred / final for each variable.
order = key_cols + [name for t in targets for name in (t, f"{t}_pred", f"{t}_final")]
master = master[order]

# ---------- 8) Export ----------
out_path = Path(BASE)/"MASTER_WeatherFromBloom_2013_2025_full.csv"
master.to_csv(out_path, index=False)
print("Saved:", out_path)

# Local copy for a quick browser download.
local = "/content/MASTER_WeatherFromBloom_2013_2025_full.csv"
master.to_csv(local, index=False)
from google.colab import files; files.download(local)

# Summary
print({
    "years": [master['Year'].min(), master['Year'].max()],
    "rows": len(master),
    "targets": list(targets),
    "sample_cols": master.columns[:15].tolist()
})


Saved: /content/drive/MyDrive/Bloom Watch/processed/MASTER_WeatherFromBloom_2013_2025_full.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'years': [2013, 2025], 'rows': 286, 'targets': ['T2M', 'T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'RH2M', 'VPD', 'ET0', 'QV2M', 'ALLSKY_SFC_SW_DWN', 'PS'], 'sample_cols': ['region', 'Year', 'week', 'date', 'bloom_stage', 'T2M', 'T2M_pred', 'T2M_final', 'T2M_MAX', 'T2M_MAX_pred', 'T2M_MAX_final', 'T2M_MIN', 'T2M_MIN_pred', 'T2M_MIN_final', 'PRECTOTCORR']}


In [31]:
import pandas as pd
from pathlib import Path

BASE = "/content/drive/MyDrive/Bloom Watch/processed"
master_path = f"{BASE}/MASTER_WeatherFromBloom_2013_2025_full.csv"

df = pd.read_csv(master_path, parse_dates=['date'])

# Variable code -> readable column name.
rename_map = {
    'T2M': 'AirTemp_avg',
    'T2M_MAX': 'AirTemp_max',
    'T2M_MIN': 'AirTemp_min',
    'PRECTOTCORR': 'Precipitation',
    'RH2M': 'Humidity_rel',
    'VPD': 'VaporPressureDeficit',
    'ET0': 'Evapotranspiration_ref',
    'QV2M': 'SpecificHumidity',
    'ALLSKY_SFC_SW_DWN': 'SolarRadiation_sw',
    'PS': 'SurfacePressure'
}

# Apply the mapping to every form of each variable: "<code>" (observed),
# "<code>_pred" and "<code>_final". Columns are matched by exact name rather
# than by prefix, so the result does not depend on the order of rename_map
# (e.g. 'T2M' vs 'T2M_MAX') and a column that merely starts with a code is
# left untouched.
SUFFIXES = ('', '_pred', '_final')
new_cols = {
    f"{code}{suffix}": f"{readable}{suffix}"
    for code, readable in rename_map.items()
    for suffix in SUFFIXES
    if f"{code}{suffix}" in df.columns
}
df = df.rename(columns=new_cols)

# Save the renamed table to Drive.
out_path = Path(BASE)/"MASTER_WeatherFromBloom_2013_2025_full_readable.csv"
df.to_csv(out_path, index=False)

print("Saved:", out_path)

# Local copy for an immediate browser download.
local = "/content/MASTER_WeatherFromBloom_2013_2025_full_readable.csv"
df.to_csv(local, index=False)

from google.colab import files
files.download(local)

df.head(5)


Saved: /content/drive/MyDrive/Bloom Watch/processed/MASTER_WeatherFromBloom_2013_2025_full_readable.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,region,Year,week,date,bloom_stage,AirTemp_avg,AirTemp_avg_pred,AirTemp_avg_final,AirTemp_max,AirTemp_max_pred,...,Evapotranspiration_ref_final,SpecificHumidity,SpecificHumidity_pred,SpecificHumidity_final,SolarRadiation_sw,SolarRadiation_sw_pred,SolarRadiation_sw_final,SurfacePressure,SurfacePressure_pred,SurfacePressure_final
0,usa_11,2013,5,2013-02-01,NoBloom,8.552875,,8.552875,12.75733,,...,5.080673,0.005382,,0.005382,11.97125,,11.97125,95.887625,,95.887625
1,usa_11,2013,6,2013-02-08,NoBloom,7.112848,,7.112848,11.145554,,...,5.270334,0.004942,,0.004942,12.946571,,12.946571,95.316259,,95.316259
2,usa_11,2013,7,2013-02-15,NoBloom,8.693527,,8.693527,13.542714,,...,5.472606,0.005038,,0.005038,15.454214,,15.454214,95.779893,,95.779893
3,usa_11,2013,8,2013-02-22,NoBloom,6.503661,,6.503661,10.539232,,...,5.766278,0.004633,,0.004633,15.123107,,15.123107,95.330241,,95.330241
4,usa_11,2013,9,2013-03-01,SoB,9.677884,,9.677884,14.448884,,...,6.13364,0.00536,,0.00536,16.661571,,16.661571,95.908205,,95.908205




---



In [32]:
import pandas as pd
from pathlib import Path

BASE = "/content/drive/MyDrive/Bloom Watch/processed"
READABLE = f"{BASE}/MASTER_WeatherFromBloom_2013_2025_full_readable.csv"

df = pd.read_csv(READABLE, parse_dates=['date'])

# Validate the key columns: integer Year, and no rows missing Year, week or date.
df = (
    df.assign(Year=pd.to_numeric(df['Year'], errors='coerce').astype('Int64'))
      .dropna(subset=['Year','week','date'])
      .astype({'Year': int})
)

# Helper columns for analysis, plus an explicit label for missing bloom stages.
df = df.assign(
    month=df['date'].dt.month,
    doy=df['date'].dt.dayofyear,
    bloom_stage=df['bloom_stage'].fillna('NoBloom'),
)

# The "..._final" columns hold the value to analyse for each variable.
final_cols = [c for c in df.columns if c.endswith('_final')]
core_cols  = ['region','Year','week','date','bloom_stage','month','doy']

wide_final = (
    df[core_cols + final_cols]
      .sort_values(['Year','week'])
      .reset_index(drop=True)
)

# Wide version (one column per variable), ready for analysis.
wide_path = Path(BASE)/"ANALYSIS_ready_WIDE_final.csv"
wide_final.to_csv(wide_path, index=False)
print("Saved wide:", wide_path)

# Long (tidy) version for plotting: each "..._final" column becomes
# (variable, value) rows, with the "_final" suffix stripped from the name.
long_final = (
    wide_final
      .melt(id_vars=core_cols, value_vars=final_cols,
            var_name='variable', value_name='value')
      .assign(variable=lambda d: d['variable'].str.replace('_final$', '', regex=True))
      .sort_values(['Year','week','variable'])
      .reset_index(drop=True)
)

long_path = Path(BASE)/"ANALYSIS_ready_LONG_tidy.csv"
long_final.to_csv(long_path, index=False)
print("Saved long:", long_path)


Saved wide: /content/drive/MyDrive/Bloom Watch/processed/ANALYSIS_ready_WIDE_final.csv
Saved long: /content/drive/MyDrive/Bloom Watch/processed/ANALYSIS_ready_LONG_tidy.csv
