In [None]:
import boto3

# Where the dataset lives in S3 and where the local copy is written.
bucket_name = "dataminds-homeworks"
s3_file_key = "data_usage_production.parquet"  # object key inside the bucket
local_file_path = "data_usage_production.parquet"  # destination on local disk

# S3 client pinned to us-east-1. No access key / secret is passed here, so
# boto3 has to locate credentials on its own.
s3 = boto3.client("s3", region_name="us-east-1")

# Fetch the object and report the outcome. Any exception is printed rather
# than propagated, so this cell always completes.
# NOTE(review): a failed download therefore does not stop the notebook; later
# cells that read the local file will be the first to fail.
try:
    s3.download_file(Bucket=bucket_name, Key=s3_file_key, Filename=local_file_path)
    print(
        f"✅ File downloaded successfully from s3://{bucket_name}/{s3_file_key} to {local_file_path}"
    )
except Exception as download_error:
    print("❌ Error downloading file:", download_error)

In [None]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

In [None]:
# Inspect the column names through the parquet schema instead of loading the
# whole table into a DataFrame.
parquet_file = pq.ParquetFile("data_usage_production.parquet")
parquet_schema = parquet_file.schema
column_names = parquet_schema.names
column_names

In [None]:
# Columns removed from the frame before any modelling is done.
dropped_columns = [
    "lasttariff_m2",
    "lasttariff_m3",
    "lasttariff_m4",
    "lasttariff_m5",
    "lasttariff_m6",
    "tariff_desc",
    "customer_status",
]
# Upper bound on the number of rows kept for the experiment.
SAMPLE_SIZE = 10_000

df = pd.read_parquet("data_usage_production.parquet").drop(columns=dropped_columns)

# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the frame's length.
# With at least SAMPLE_SIZE rows this draws exactly the same sample as before.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
df.info()

In [None]:
# Move the phone number out of the columns and into the row index, so it is
# not among the columns later used as model inputs.
# The guard makes the cell safe to re-run: once the column has become the
# index, calling set_index again would raise KeyError. On a first run a
# missing column still fails loudly.
if df.index.name != "telephone_number":
    df = df.set_index("telephone_number")
df.head()

In [None]:
# Split the frame into the regression target and the predictor columns.
target_column = "data_compl_usg_local_m1"

y = df[target_column]
X = df.drop(columns=[target_column])

# Every remaining column is used as a model input.
features = X.columns

In [None]:
# Preprocessing applied to every column listed in `features`:
# fill missing values with the column mean, then standardise.
# NOTE(review): this assumes all of `features` are numeric — confirm against
# the schema, since mean imputation cannot handle string columns.
numeric_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, features)])

# Full model: preprocessing followed by a random-forest regressor.
# The forest is seeded so the fitted model (and the reported error) is the
# same on every run, in line with the seeded sampling and train/validation
# split. The `clf` variable and the "classifier" step name are kept as-is for
# compatibility with the other cells, even though the estimator is a regressor.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
# Hold out 20% of the rows for validation; the split is seeded so it is
# identical on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Fit preprocessing and model together on the training portion only.
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out validation rows.
y_pred = clf.predict(X_val)

# scikit-learn metrics take the ground truth first and the predictions second.
# Plain MSE gives the same value either way, but the documented order keeps
# the call correct if it is later changed to a non-symmetric metric.
error = mean_squared_error(y_val, y_pred)

# Report the root of the MSE (RMSE), which is in the same units as the target.
print(error ** 0.5)