<div align="center">

## Projekt ML

</div>

In [8]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, asin, sqrt

# Load the delivery dataset; `target` names the column being predicted.
target = "Time_taken(min)"
df = pd.read_csv("deliverytime.csv")

# 1) Correlation of every numeric column with the target, strongest positive
#    first. The target's own row (correlation 1.0) is part of each ranking.
num_cols = df.select_dtypes(include=[np.number]).columns
pearson, spearman = (
    df[num_cols].corr(method=method)[target].sort_values(ascending=False)
    for method in ("pearson", "spearman")
)

print("Pearson z targetem:\n", pearson, "\n")
print("Spearman z targetem:\n", spearman, "\n")

# 2) Haversine distance as a feature
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Applies the haversine formula on a sphere of radius 6371 km. Each
    argument may be a scalar or a NumPy array-like; array inputs are
    broadcast against each other and the result has the broadcast shape.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometres: a NumPy float for scalar inputs, an
        ndarray for array inputs. NaN inputs propagate to NaN outputs.
    """
    R = 6371.0  # sphere radius in km
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (for
    # example near antipodal points), which would make sqrt/arcsin invalid.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Great-circle distance between restaurant and delivery location for each row.
# np.vectorize calls haversine_km once per row, so this works even when
# haversine_km only accepts scalar arguments.
dist_km = np.vectorize(haversine_km)(
    df["Restaurant_latitude"].values,
    df["Restaurant_longitude"].values,
    df["Delivery_location_latitude"].values,
    df["Delivery_location_longitude"].values,
)
df["distance_km"] = dist_km

print(df["distance_km"].describe())

# Pearson on the full data (no filtering).
corr_raw = df["distance_km"].corr(df[target], method="pearson")
print("Corr(distance_km, time) - pełne dane (Pearson):", corr_raw)

# Pearson restricted to rows below a distance threshold, for comparison.
# In the recorded run describe() reports a 75th percentile of ~14 km but a
# maximum of ~19,700 km, so a handful of extreme rows dominate the full-data
# coefficient. Note this filters rows out; it does not clip values.
MAX_DISTANCE_KM = 50  # threshold in km; rows at or above it are excluded
df_clip = df[df["distance_km"] < MAX_DISTANCE_KM].copy()
corr_clip = df_clip["distance_km"].corr(df_clip[target], method="pearson")
# A second print of the same filtered Pearson value was removed as redundant.
print(f"Corr(distance_km, time) - clip < {MAX_DISTANCE_KM} km (Pearson):", corr_clip)


Pearson z targetem:
 Time_taken(min)                1.000000
Delivery_person_Age            0.292708
Delivery_location_latitude     0.014243
Restaurant_latitude            0.013981
Restaurant_longitude           0.007821
Delivery_location_longitude    0.007625
Delivery_person_Ratings       -0.331103
Name: Time_taken(min), dtype: float64 

Spearman z targetem:
 Time_taken(min)                1.000000
Delivery_person_Age            0.309447
Delivery_location_latitude     0.030495
Delivery_location_longitude    0.028469
Restaurant_latitude            0.014593
Restaurant_longitude           0.008518
Delivery_person_Ratings       -0.289439
Name: Time_taken(min), dtype: float64 

count    45593.000000
mean        99.303911
std       1099.731281
min          1.465067
25%          4.663493
50%          9.264281
75%         13.763977
max      19692.674606
Name: distance_km, dtype: float64
Corr(distance_km, time) - pełne dane (Pearson): -0.002508067141567969
Corr(distance_km, time) - clip < 50 k

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge

# Reload the dataset from disk (this replaces any `df` modified by earlier cells).
target = "Time_taken(min)"
df = pd.read_csv("deliverytime.csv")

# Separate the target from the features; identifier columns are dropped from
# the features when they are present (optional step).
y = df[target]
X = df.drop(columns=[target]).drop(
    columns=["ID", "Delivery_person_ID"], errors="ignore"
)

# Object-typed columns are handled as categorical, all remaining ones as numeric.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
num_cols = list(X.columns.difference(cat_cols))

# Numeric branch: median imputation followed by standardisation.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories not seen during fitting are ignored at transform time.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

# Preprocessing and the Ridge regressor live in one pipeline, so the
# transformers are fitted on the training split only.
model = Pipeline([
    ("prep", preprocess),
    ("reg", Ridge(alpha=1.0)),
])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted pipeline, so predict() can be chained directly.
pred = model.fit(X_train, y_train).predict(X_test)


array([24.6160045 , 27.50764286, 35.8727395 , ..., 20.7405941 ,
       24.60516044, 24.22219365], shape=(9119,))

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit