In [96]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, validation_curve
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import urllib.request




In [108]:
# Base URL of the remote repository and the repository-relative folder that
# holds the water-bill dataset.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/Shemeazza/ML4DS-D/main/"
TRAIN_DATA_DIR_PARTS = ("datasets", "water_bill")

# Local folder: joined with the OS-specific separator.
TRAIN_DATA_PATH = os.path.join(*TRAIN_DATA_DIR_PARTS)

# URLs always use "/", so the URL is built from the path parts directly rather
# than from TRAIN_DATA_PATH (which contains "\\" on Windows).
# NOTE(review): this URL names a folder, yet fetch_train_data() hands it to
# urlretrieve as if it were the CSV file itself -- confirm the file name in the
# repository and append it here.
TRAIN_DATA_URL = DOWNLOAD_ROOT + "/".join(TRAIN_DATA_DIR_PARTS)



def fetch_train_data(train_url=TRAIN_DATA_URL, train_path=TRAIN_DATA_PATH):
    """Download the training CSV and load it into a DataFrame.

    Parameters
    ----------
    train_url : str
        URL passed to ``urllib.request.urlretrieve``.
        NOTE(review): the default value looks like a folder URL rather than
        the URL of a CSV file -- confirm before relying on the default.
    train_path : str
        Local directory in which the file is stored as ``train.csv``. It is
        created if it does not exist yet.

    Returns
    -------
    pandas.DataFrame
        The contents of the downloaded ``train.csv``.
    """
    os.makedirs(train_path, exist_ok=True)
    csv_path = os.path.join(train_path, "train.csv")
    urllib.request.urlretrieve(train_url, csv_path)
    # Read the downloaded file (not the directory) and hand the frame back to
    # the caller; previously it was bound to a local and discarded.
    return pd.read_csv(csv_path)

In [109]:

# Load the training data. No earlier cell binds `train_data` at module level,
# so read the CSV explicitly here; the download is only triggered when the
# local copy is absent.
train_csv_path = os.path.join(TRAIN_DATA_PATH, "train.csv")
if not os.path.isfile(train_csv_path):
    fetch_train_data()
train_data = pd.read_csv(train_csv_path)

# Year followed by the month zero-padded to two characters. This column is
# not part of the feature set selected below.
train_data['YearMonth'] = train_data['Year'].astype(str) + train_data['Month'].astype(str).str.zfill(2)

# .copy() gives `x` its own data so that the later in-place encoding of
# 'Installation_zone' does not operate on a slice of `train_data`.
x = train_data[['Year', 'Month', 'Consumption', 'Installation_zone']].copy()
y = train_data['Consumer_type']



In [110]:
%matplotlib inline

mpl.rc('axes', labelsize = 14)
mpl.rc('xtick', labelsize = 12)
mpl.rc('ytick', labelsize = 12)
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    """
    target_file = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target_file, format=fig_extension, dpi=resolution)


In [100]:
encoded_labels = LabelEncoder()
# Fit on the original column in `train_data` (same rows as `x`) and attach the
# codes with .assign():
#  * no in-place write into a slice of `train_data` (SettingWithCopyWarning),
#  * the encoded column gets the integer dtype of the codes instead of
#    keeping the dtype of the column it overwrites,
#  * re-running this cell re-fits on the raw labels, not on already-encoded
#    integers, so `encoded_labels.classes_` stays usable for later cells.
x = x.assign(Installation_zone=encoded_labels.fit_transform(train_data['Installation_zone']))

# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(x_train.shape)
print(x_test.shape)



(263980, 4)
(65995, 4)


In [101]:
# Build the decision tree with a fixed seed, then fit it on the training split.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

# Predictions on the held-out split, scored in the next cell.
y_test_pred = model.predict(x_test)


In [102]:
# Fraction of held-out samples whose predicted label equals the true label.
acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("\nModel Acc:", acc_test)




Model Acc: 0.9058868096067884


In [103]:
competiton_path = "competition.csv"
competition_data = pd.read_csv(competiton_path)

# Labels that the fitted encoder has never seen are collapsed into a single
# placeholder class so that encoded_labels.transform() does not fail on them.
# NOTE(review): the model was not trained on the placeholder's code, so
# predictions for those rows rest on a value outside the training data.
for col in ['Installation_zone']:
    known_labels = set(encoded_labels.classes_)
    unknown_labels = set(competition_data[col]) - known_labels
    if unknown_labels:
        # Register the placeholder only once; re-running this cell must not
        # append a duplicate entry to classes_.
        if 'unknown_label' not in known_labels:
            encoded_labels.classes_ = np.concatenate([encoded_labels.classes_, ['unknown_label']])
        # Boolean mask + Series.mask instead of passing a set to replace().
        is_unknown = competition_data[col].isin(unknown_labels)
        competition_data[col] = competition_data[col].mask(is_unknown, 'unknown_label')



In [104]:
# Year followed by the month zero-padded to two characters (mirrors the
# column added to the training data).
competition_data['YearMonth'] = (
    competition_data['Year'].astype(str)
    + competition_data['Month'].astype(str).str.zfill(2)
)

# Replace the column outright instead of writing through `.loc[:, col]`:
# the in-place `.loc` write keeps the existing column's dtype where it can,
# whereas plain assignment stores the integer codes with their own dtype.
competition_data['Installation_zone'] = encoded_labels.transform(competition_data['Installation_zone'])

In [105]:
# Validate BEFORE selecting: indexing with a missing column would raise a
# KeyError first, so a check placed after the selection could never trigger.
feature_columns = list(x_train.columns)
missing = set(feature_columns) - set(competition_data.columns)
if missing:
    raise ValueError(f"Missing features: {missing}")

# Select with the training column list so names and order match what the
# model was fitted on.
x_comp = competition_data[feature_columns]


In [106]:
# Predict a consumer type for every competition row and pair it with the
# row's consumer number.
prediction = model.predict(x_comp)
result_columns = {
    'Consumer_number': competition_data['Consumer_number'],
    'Consumer_type': prediction,
}
results_data = pd.DataFrame(result_columns)

In [107]:
# Write the predictions to disk without the DataFrame index column.
results_data.to_csv(path_or_buf="predicted_results.csv", index=False)
