In [0]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the cells below read and write under /data, not /content/drive —
# confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd



def none_or_one(pd_series):
    """Turn a numeric series into a 1-or-missing mask.

    Every element is divided by itself, so non-zero entries become 1.0 and
    zero entries (0/0) become NaN.
    """
    self_ratio = pd_series / pd_series
    return self_ratio



def extract_features_v2(df):
    """Build per-object light-curve shape features from detected observations.

    The input frame is NOT modified: all work happens on derived frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``object_id``, ``mjd``, ``flux`` and
        ``detected_bool``. Only rows with ``detected_bool == 1`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per ``object_id`` with the columns:

        * ``first`` / ``last`` -- flux of the earliest / latest detection bin
          divided by the object's maximum absolute flux.
        * ``peak`` / ``deep`` -- position (0-based) of the maximum / minimum
          flux in the object's ordered detection bins, divided by
          ``n_bins - 1``. NaN when the object has a single detection bin.
        * ``till_peak`` -- peak bin minus earliest bin.
        * ``after_peak`` -- latest bin minus peak bin.
        * ``deep_peak`` -- peak bin minus minimum-flux bin.
        * ``peak_32`` -- smallest ``flux / max_flux`` among bins strictly
          after the peak and fewer than 32 bins after it; NaN if there are none.

        A "bin" is ``mjd`` truncated to an integer (presumably one day --
        confirm against the data description); several detections in the same
        bin are collapsed to their maximum flux.
    """
    peak_window = 32  # width (in integer-mjd units) of the post-peak window

    # Filter first and keep only the needed columns, then derive the integer
    # time bin on the copy so the caller's DataFrame is left untouched.
    detections = df.loc[df["detected_bool"] == 1, ["object_id", "mjd", "flux"]]
    detections = detections.assign(mjd_int=detections["mjd"].astype(int))

    # One row per (object, bin), sorted by object then bin by the groupby.
    daily = detections.groupby(["object_id", "mjd_int"])["flux"].max().reset_index()
    daily["abs_flux"] = daily["flux"].abs()

    # Duplicate flux / bin columns so each copy can get its own aggregation.
    for col in ("first", "last", "deep", "peak"):
        daily[col] = daily["flux"].values
    daily["mjd_min"] = daily["mjd_int"].values
    daily["mjd_max"] = daily["mjd_int"].values

    flux_by_object = daily.groupby("object_id")["flux"]
    max_flux = flux_by_object.transform("max")
    min_flux = flux_by_object.transform("min")

    # Bin value where the extreme occurs, 0 elsewhere; the later "max"
    # aggregation therefore picks the latest bin among ties.
    daily["mjd_peak"] = daily["mjd_int"] * (max_flux == daily["flux"])
    daily["mjd_deep"] = daily["mjd_int"] * (min_flux == daily["flux"])

    peak_time = daily.groupby("object_id")["mjd_peak"].transform("max")
    in_window = (daily["mjd_int"] > peak_time) & (daily["mjd_int"] < peak_time + peak_window)
    # Flux relative to the peak inside the window, NaN outside it (same values
    # as multiplying by a 1-or-NaN mask).
    daily["peak_32"] = daily["flux"].where(in_window) / max_flux

    features = daily.groupby("object_id").agg({
        "abs_flux": "max", "first": "first", "last": "last",
        "mjd_int": "count",  # number of detection bins per object
        "peak": lambda ll: np.array(ll).argmax(),
        "deep": lambda ll: np.array(ll).argmin(),
        "mjd_min": "min", "mjd_max": "max", "mjd_peak": "max", "mjd_deep": "max",
        "peak_32": "min"}).reset_index()

    features["first"] = features["first"] / features["abs_flux"]
    features["last"] = features["last"] / features["abs_flux"]

    # Normalise positions to [0, 1]; a single-bin object gives 0/0 -> NaN.
    last_position = features["mjd_int"] - 1
    features["peak"] = features["peak"] / last_position
    features["deep"] = features["deep"] / last_position

    features["till_peak"] = features["mjd_peak"] - features["mjd_min"]
    features["after_peak"] = features["mjd_max"] - features["mjd_peak"]
    features["deep_peak"] = features["mjd_peak"] - features["mjd_deep"]

    extracted_features = ["first", "last", "peak", "deep", "till_peak", "after_peak", "deep_peak", "peak_32"]

    return features[["object_id"] + extracted_features]


if __name__ == "__main__":
    # Load the raw training light curves, derive the per-object features,
    # and write them out without the index column.
    df = pd.read_csv('/data/plasticc_train_lightcurves.csv')
    features = extract_features_v2(df)
    features.to_csv('/data/features1.csv', index=False)

      object_id      abs_flux     first  ...  till_peak  after_peak  deep_peak
0           615    660.626343 -0.588812  ...        499         375        470
1           713     12.353376  0.616434  ...         77         770       -714
2           730     47.310059  0.989131  ...          3          76        -76
3           745    220.795212  1.000000  ...          0         124       -124
4          1124    143.600189  1.000000  ...          0         134       -134
5          1227     71.678154  0.112898  ...        692           0        692
6          1598   1448.715698  0.038177  ...         15           0         15
7          1632     16.413616  1.000000  ...          0           3         -3
8          1920    231.828339  0.088231  ...         15          86         -7
9          1926     30.658575  0.887638  ...        355          80        -80
10         2072     19.925980  1.000000  ...          0          99        -96
11         2103    214.805969  0.867524  ...        