In [1]:
import boto3

bucket_name = "dataminds-homeworks"
s3_file_key = "data_usage_production.parquet"  # object key inside the bucket, e.g. 'folder/myfile.txt'
local_file_path = "data_usage_production.parquet"  # local destination path

# Create the S3 client. No access keys are passed here, so boto3 falls back to
# its standard credential lookup (environment, shared config, ...); keep secrets
# out of the notebook.
s3 = boto3.client(
    "s3",
    region_name="us-east-1",
)

# Download the file. On failure the message is printed AND the exception is
# re-raised: every later cell reads `local_file_path`, so silently continuing
# would only produce a more confusing error (or use a stale file) further down.
try:
    s3.download_file(bucket_name, s3_file_key, local_file_path)
except Exception as e:
    print("❌ Error downloading file:", e)
    raise
else:
    print(
        f"✅ File downloaded successfully from s3://{bucket_name}/{s3_file_key} to {local_file_path}"
    )

✅ File downloaded successfully from s3://dataminds-homeworks/data_usage_production.parquet to data_usage_production.parquet


In [52]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

In [53]:
# Open the Parquet file with pyarrow and list the column names recorded in its
# schema, before the full table is loaded with pandas in a later cell.
parquet_file = pq.ParquetFile("data_usage_production.parquet")
parquet_schema = parquet_file.schema
column_names = parquet_schema.names
column_names

['telephone_number',
 'tariff_desc',
 'customer_status',
 'tenure',
 'data_compl_usg_local_m2',
 'data_amount_lte_m2',
 'data_pack_usg_m2',
 'dpi_https_and_default_m2',
 'data_pack_rev_local_m2',
 'refill_total_m2',
 'dpi_tik_tok_m2',
 'dpi_youtube_m2',
 'dpi_instagram_m2',
 'lms_rev_m2',
 'dpi_kabinetim_m2',
 'dpi_tcp_signaling_m2',
 'dpi_telegram_m2',
 'lastrefillamount_m2',
 'data_payg_rev_local_m2',
 'data_pack_usg_m3',
 'data_compl_usg_local_m3',
 'data_pack_rev_local_m3',
 'refill_total_m3',
 'dpi_tik_tok_m3',
 'dpi_tcp_signaling_m3',
 'data_payg_rev_local_m3',
 'data_pack_rev_local_m4',
 'refill_total_m4',
 'data_compl_usg_local_m4',
 'dpi_tik_tok_m4',
 'dpi_tcp_signaling_m4',
 'data_payg_rev_local_m4',
 'data_compl_usg_local_m5',
 'data_pack_rev_local_m5',
 'data_payg_rev_local_m5',
 'data_compl_usg_local_m6',
 'data_amount_lte_m6',
 'data_pack_usg_m6',
 'dpi_tcp_signaling_m6',
 'data_pack_rev_local_m6',
 'data_payg_rev_local_m6',
 'frequency',
 'recency',
 'tot_inact_status_da

In [54]:
# Columns removed before modelling (per-month last-tariff fields plus the
# tariff description and customer status).
dropped_columns = [
    "lasttariff_m2",
    "lasttariff_m3",
    "lasttariff_m4",
    "lasttariff_m5",
    "lasttariff_m6",
    "tariff_desc",
    "customer_status",
]

# Load the full table, drop the excluded columns, then keep a fixed-seed
# random sample of 10,000 rows to work with.
df = (
    pd.read_parquet("data_usage_production.parquet")
    .drop(columns=dropped_columns)
    .sample(n=10000, random_state=42)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 2506043 to 271642
Data columns (total 62 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   telephone_number              10000 non-null  object 
 1   tenure                        10000 non-null  int64  
 2   data_compl_usg_local_m2       10000 non-null  float64
 3   data_amount_lte_m2            10000 non-null  float64
 4   data_pack_usg_m2              10000 non-null  float64
 5   dpi_https_and_default_m2      10000 non-null  float64
 6   data_pack_rev_local_m2        10000 non-null  float64
 7   refill_total_m2               10000 non-null  float64
 8   dpi_tik_tok_m2                10000 non-null  float64
 9   dpi_youtube_m2                10000 non-null  float64
 10  dpi_instagram_m2              10000 non-null  float64
 11  lms_rev_m2                    10000 non-null  float64
 12  dpi_kabinetim_m2              10000 non-null  float64
 13 

In [55]:
# Move the subscriber identifier into the row index so it is no longer one of
# the data columns. The guard plus the non-inplace assignment make this cell
# safe to re-run: with `inplace=True`, a second execution raised KeyError
# because the column had already been consumed by the first run.
if "telephone_number" in df.columns:
    df = df.set_index("telephone_number")
df.head()

Unnamed: 0_level_0,tenure,data_compl_usg_local_m2,data_amount_lte_m2,data_pack_usg_m2,dpi_https_and_default_m2,data_pack_rev_local_m2,refill_total_m2,dpi_tik_tok_m2,dpi_youtube_m2,dpi_instagram_m2,...,data_tariff_revenue_m2,data_from_tariff_m3,data_tariff_revenue_m3,data_from_tariff_m4,data_tariff_revenue_m4,data_from_tariff_m5,data_tariff_revenue_m5,data_from_tariff_m6,data_tariff_revenue_m6,data_compl_usg_local_m1
telephone_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B59kYFgOZg,4226,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZC_SHd_H7r,1212,10500.0,10497.0,0.0,36.0,14.0,20.0,10055.0,378.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12496.18
Iukgk3yWay,1384,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VzyD3j1Y3o,1475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02
psUqbRH5FC,2802,22.0,22.0,22.0,1.0,0.0,4.0,0.0,21.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02


In [63]:
# Name of the column the model is trained to predict.
target_column = "data_compl_usg_local_m1"

# y is the target series; X is every remaining column of df.
y = df[target_column]
X = df.drop(columns=[target_column])

# Column labels of X, used later to tell the preprocessor which columns to transform.
features = X.columns

In [73]:
# Preprocessing applied to every feature column: fill missing values with the
# column mean, then standardise.
numeric_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

# Route all columns listed in `features` through the numeric transformer.
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, features)])

# Full model: preprocessing followed by a random-forest regressor.
# `random_state` is fixed (same seed as the sampling and the train/validation
# split) so that the fitted forest, and therefore the reported error, is
# reproducible across runs.
# NOTE: the variable `clf` and the step name "classifier" are kept unchanged for
# compatibility with the other cells, although the estimator is a regressor.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestRegressor(random_state=42)),
    ]
)

In [74]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline (preprocessing + model) on the training portion only.
clf.fit(X_train, y_train)

In [75]:
# Predict on the held-out validation rows.
y_pred = clf.predict(X_val)

# Mean squared error on the validation set. The metric's signature is
# (y_true, y_pred), so the ground truth goes first.
error = mean_squared_error(y_val, y_pred)

# Report the root of the MSE (RMSE), which is in the same units as the target.
rmse = error ** (1 / 2)
print(rmse)

3579.0881523279845
