# Task 1: Data Understanding I
Goal: Understand the structure and basic properties of the dataset using Python (NumPy, pandas, Matplotlib/Seaborn).

1) Read the CSV file with pandas.read_csv and parse the datetime
column.

In [None]:
import pandas as pd

day = pd.read_csv("day.csv")
hour = pd.read_csv("hour.csv")

day["dteday"] = pd.to_datetime(day["dteday"])
hour["dteday"] = pd.to_datetime(hour["dteday"])

hour["datetime"] = hour["dteday"] + pd.to_timedelta(hour["hr"], unit="h")

df_pre = pd.merge(
    hour,
    day,
    on="dteday",
    how="left",
    suffixes=("_hour", "_day")
)

df = pd.DataFrame(
    {
        "datetime": df_pre["datetime"],
        "target": df_pre["cnt_hour"],
        "weather": df_pre["weathersit_hour"],
        "temp": df_pre["temp_hour"],
        "humidity": df_pre["hum_hour"],
        "windspeed": df_pre["windspeed_hour"],
        "seaseon": df_pre["season_day"],
        "is_holiday": df_pre["holiday_day"],
        "is_workingday": df_pre["workingday_day"]
    }
)

2. Report:
- Number of rows and columns.
- Time range covered by the data.
- Target variable and list of feature variables (names and data types).

In [82]:

def dataframe_report(df: pd.DataFrame, target_col: str, time_col: str):
    print("=== DATAFRAME REPORT ===\n")

    # Rows & Columns
    n_rows, n_cols = df.shape
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}\n")

    # Time range
    df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
    start_date = df[time_col].min()
    end_date = df[time_col].max()
    print(f"Time range covered: {start_date} → {end_date}\n")

    # Target variable
    print("Target variable:")
    print(f"  - {target_col} ({df[target_col].dtype})\n")

    # Feature variables
    feature_cols = [col for col in df.columns if col not in [target_col, time_col]]
    print("Feature variables:")
    for col in feature_cols:
        print(f"  - {col}: {df[col].dtype}")

# Beispiel-Aufruf
dataframe_report(
    df=df,
    target_col="target",
    time_col="datetime"
)

=== DATAFRAME REPORT ===

Number of rows: 17379
Number of columns: 9

Time range covered: 2011-01-01 00:00:00 → 2012-12-31 23:00:00

Target variable:
  - target (int64)

Feature variables:
  - weather: int64
  - temp,: float64
  - humidity: float64
  - windspeed: float64
  - seaseon: int64
  - is_holiday: int64
  - is_workingday: int64


3. Create a variable description table (see above for reference).

In [83]:
def create_variable_description_table(
    df: pd.DataFrame,
    target_col: str,
    time_col: str
) -> pd.DataFrame:

    rows = []

    for col in df.columns:
        if col == time_col:
            role = "time"
        elif col == target_col:
            role = "target"
        else:
            role = "feature"

        rows.append({
            "variable_name": col,
            "role": role,
            "data_type": df[col].dtype.name,
        })

    return pd.DataFrame(rows)

# Example usage
var_table = create_variable_description_table(
    df=df,
    target_col="target",
    time_col="datetime"
)

var_table


Unnamed: 0,variable_name,role,data_type
0,datetime,time,datetime64[ns]
1,target,target,int64
2,weather,feature,int64
3,"temp,",feature,float64
4,humidity,feature,float64
5,windspeed,feature,float64
6,seaseon,feature,int64
7,is_holiday,feature,int64
8,is_workingday,feature,int64


4. Check for:
- Missing values per column.
- Duplicated rows (if any).

In [84]:
print(f'Number of duplicate rows: {df.duplicated().sum()}')
print(f'Number of missing values: {df.isna().sum().sum()}')

Number of duplicate rows: 0
Number of missing values: 0


In [87]:
df.columns
df[["target","temp","humidity","windspeed"]].describe()[["mean", "std", "min", "max"]]

KeyError: "['temp'] not in index"