In [1]:
import pandas as pd
import numpy as np

# Read the raw sector-level gold demand table from disk
file_path = 'gold_demand_sector.csv'
df = pd.read_csv(file_path)

# Discard any column that holds no data at all
df = df.dropna(axis=1, how='all')

# Every column after the first (the label column) holds figures that may be
# stored as text containing commas: strip the commas, then cast to float
value_columns = df.columns[1:]
cleaned_values = {
    name: df[name].replace({',': ''}, regex=True).astype(float)
    for name in value_columns
}
df = df.assign(**cleaned_values)

# Replace the file's headers with short labels (assigned by position, so the
# frame must have exactly six columns at this point)
column_labels = ['Sector', '2020', '2021', '2022', '2023', 'Average']
df.columns = column_labels

# Show the first rows of the cleaned table
preview = df.head()
print(preview)

                        Sector    2020    2021    2022    2023  Average
0                    Jewellery  1324.0  2231.1  2195.9  2190.5   1985.4
1                   Technology   309.0   337.2   314.8   305.2    316.5
2                   Investment  1794.9   991.5  1112.5   945.3   1211.1
3  Central banks & other inst.   254.9   450.1  1081.9  1049.1    709.0
4                OTC and other  1057.7   694.0    55.7   460.0    566.9


In [2]:
import warnings

from scipy.stats import zscore

# Columns eligible for outlier screening: every float64 column of df
numeric_cols = df.select_dtypes(include=['float64']).columns

# Column-wise Z-scores. nan_policy='omit' keeps a single missing value from
# turning the whole column's scores into NaN (the default is 'propagate').
z_scores = df[numeric_cols].apply(zscore, nan_policy='omit')

# A value whose |z| is at or above this threshold is treated as an outlier
threshold = 3

# With the population standard deviation (zscore's default ddof=0), no value
# in a sample of n points can have |z| above sqrt(n - 1). When that bound is
# below the threshold the filter cannot remove any row, so make that visible
# instead of silently returning the input unchanged.
n_rows = len(df)
max_attainable_z = max(n_rows - 1, 0) ** 0.5
if max_attainable_z < threshold:
    warnings.warn(
        f"Z-score filter is a no-op: with {n_rows} rows |z| cannot exceed "
        f"{max_attainable_z:.2f}, which is below the threshold of {threshold}."
    )

# Keep a row unless some column is demonstrably extreme. NaN scores (missing
# values, zero-variance columns) compare False against the threshold, so they
# no longer cause a row to be discarded.
non_outliers = ~(z_scores.abs() >= threshold).any(axis=1)

# Take an explicit copy so later assignments on the filtered frame are not
# made against a slice of df
df_no_outliers = df[non_outliers].copy()

# Display the dataframe without outliers
print(df_no_outliers)

                        Sector    2020    2021    2022    2023  Average
0                    Jewellery  1324.0  2231.1  2195.9  2190.5   1985.4
1                   Technology   309.0   337.2   314.8   305.2    316.5
2                   Investment  1794.9   991.5  1112.5   945.3   1211.1
3  Central banks & other inst.   254.9   450.1  1081.9  1049.1    709.0
4                OTC and other  1057.7   694.0    55.7   460.0    566.9
5                 Total demand  4740.5  4704.0  4760.8  4950.0   4788.8


In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Predictors are every screened numeric column except the target; the target
# is the 'Average' column.
# NOTE(review): in the rows displayed above, 'Average' appears to be the mean
# of the four year columns, i.e. a deterministic function of the features -
# confirm this is the intended modelling target.
feature_cols = numeric_cols.drop('Average')
X = df_no_outliers[feature_cols]
y = df_no_outliers['Average']

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
# NOTE(review): the frame displayed above has only six rows, so both the
# training and the test partitions are extremely small.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit a decision tree on the training partition (seeded for repeatability)
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = regressor.predict(X_test)

# Report the mean squared error on the held-out rows
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 845948.56


In [4]:
from sklearn.metrics import r2_score, mean_absolute_error

# This cell only evaluates: it needs y_test (held-out targets) and y_pred
# (model predictions for them) to already exist in the kernel. Check for them
# up front so that a fresh or restarted kernel produces an actionable message
# rather than a bare "name 'y_test' is not defined".
missing_inputs = [name for name in ('y_test', 'y_pred') if name not in globals()]
if missing_inputs:
    raise NameError(
        f"{', '.join(missing_inputs)} not defined in this kernel session. "
        "Run the cell that performs train_test_split and regressor.predict "
        "before computing these metrics."
    )

# Coefficient of determination on the held-out rows.
# NOTE(review): with very few held-out rows this value is highly unstable and
# can be strongly negative - interpret with care.
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

# Mean absolute error on the held-out rows (same units as the target)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

NameError: name 'y_test' is not defined