In [1]:
import warnings

# Filter specific warning
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import random

df_org = pd.read_csv('../Dataset/kl.csv')
df_org = df_org.head(10000)

col = ['Age', 'Overall', 'Potential', 'Wage', 'Special', 'Preferred Foot', 'International Reputation',
       'Skill Moves', 'Height', 'Weight', 'Club', 'Value', 'FKAccuracy', 'Agility']
df = df_org[col]

df.columns = ['Age', 'Overall', 'Potential', 'Wage', 'Special', 'PreferredFoot', 'InternationalReputation',
              'SkillMoves', 'Height', 'Weight', 'Club', 'Value',  'FKAccuracy', 'Agility']



In [3]:
# Changing the format of several string variabls
val = []
for i in df['Value']:
    if i[-1] == 'M':
        val.append(float(i[1:-1]) * 1000000)
    elif i[-1] == 'K':
        val.append(float(i[1:-1]) * 1000)
    else:
        val.append(0)
df.loc[:, 'Value'] = val

val = []
for i in df['Wage']:
    if i[-1] == 'M':
        val.append(float(i[1:-1]) * 1000000)
    elif i[-1] == 'K':
        val.append(float(i[1:-1]) * 1000)
    else:
        val.append(0)
df.loc[:, 'Wage'] = val

height = df.Height.str.split('\'', 1)
height0 = []
height1 = []
for i in height:
    if type(i) is list:
        height0.append(i[0])
        height1.append(i[1])
    else:
        height0.append(0)
        height1.append(0)

height_cm = []
for i in range(len(height)):
    height_cm.append(round((float(height0[i]) * 30.48 + float(height1[i]) * 2.54) / 100, 3))
df.loc[:, 'Height'] = height_cm

df['Weight'] = pd.to_numeric(df['Weight'].str.rstrip('lbs'))

df = df.iloc[np.where((df['Value'] > 0) & (df['Height'] > 0))]

df.rename({'Club': 'cluster'}, axis=1, inplace=True)
df.rename({'Value': 'y'}, axis=1, inplace=True)

In [4]:
# енкодинг
df['PreferredFoot'] = pd.Categorical(df['PreferredFoot']).codes
df['cluster'] = pd.Categorical(df['cluster']).codes
codes = pd.Categorical(df['cluster']).unique()

In [5]:
codes

[206, 316, 421, 363, 362, ..., 154, 353, 559, 569, 121]
Length: 630
Categories (630, int64): [206, 316, 421, 363, ..., 353, 559, 569, 121]

In [6]:
# нормалізувати (наблизити до 0)
min(df['y'])

40000.0

In [7]:
min(df['Wage'])

1000.0

In [8]:
df['y'] = df['y'] / 10000
df['Wage'] = df['Wage'] / 1000

In [9]:
df.head(7)

Unnamed: 0,Age,Overall,Potential,Wage,Special,PreferredFoot,InternationalReputation,SkillMoves,Height,Weight,cluster,y,FKAccuracy,Agility
0,31.0,94.0,94,565.0,2202,0,5.0,4.0,1.702,159,206,11050.0,94.0,91.0
1,33.0,94.0,94,405.0,2228,1,5.0,5.0,1.88,183,316,7700.0,76.0,87.0
2,26.0,92.0,93,290.0,2143,1,5.0,5.0,1.753,150,421,11850.0,87.0,96.0
3,27.0,91.0,93,260.0,1471,1,4.0,1.0,1.93,168,363,7200.0,19.0,60.0
4,27.0,91.0,92,355.0,2281,1,4.0,4.0,1.803,154,362,10200.0,83.0,79.0
5,27.0,91.0,91,340.0,2142,1,4.0,4.0,1.727,163,132,9300.0,79.0,95.0
6,32.0,91.0,91,420.0,2280,1,4.0,4.0,1.727,146,455,6700.0,78.0,93.0


## Train Test split

In [10]:
## Дані будуть розділені на тренувальну і тестувальну вибірки, так щоб не було однакових значень 'cluster' між цими вибірками.
version = 'CV'
depth = 6
NoCov = 2

In [11]:
labels = np.array(df['y'])
features= df.drop('y', axis = 1)

feature_list = list(features.columns)
features = np.array(features)

In [12]:
missing_values = df.isna().sum()
print(missing_values)
df = df.dropna()

Age                        1
Overall                    1
Potential                  0
Wage                       0
Special                    0
PreferredFoot              0
InternationalReputation    0
SkillMoves                 0
Height                     0
Weight                     0
cluster                    0
y                          0
FKAccuracy                 0
Agility                    0
dtype: int64


In [14]:
sample_clusters = random.sample(set(df.cluster.unique()), 100)
df_tr = df.loc[df['cluster'].isin(sample_clusters)]
train_labels = np.array(df_tr['y'])

df_test = df.loc[~df['cluster'].isin(sample_clusters)]
test_labels = np.array(df_test['y'])

print('Training Features Shape:', df_tr.shape)
print('Training Labels Shape:', df_tr['y'].shape)
print('Testing Features Shape:', df_test.shape)
print('Testing Labels Shape:', df_test['y'].shape)

Training Features Shape: (1470, 14)
Training Labels Shape: (1470,)
Testing Features Shape: (8367, 14)
Testing Labels Shape: (8367,)


## The analysis

# RETCO

In [None]:
from RealData import ClusteredRETCO as RETCOfile

RETCO = RETCOfile.CreateTree(version, df_tr, depth=depth, min_leaf_sample=3, StoppingRule='Yes', random_subspace=None)

In [None]:
RETCO

In [None]:
covariates_pred_test = df_test.iloc[:, :-NoCov].values
col_names = df_test.iloc[:, :-NoCov].columns

In [None]:
prediction_test_RETCO = RETCOfile.predictionFun(col_names, covariates_pred_test, RETCO)

In [None]:
col_names

In [None]:
y_test = df_test['y'].values

In [None]:
prediction_test_RETCO = RETCOfile.predictionFun(col_names, covariates_pred_test, RETCO)

In [None]:
errors2 = abs(prediction_test_RETCO - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute RETCO Error:', round(np.mean(errors2), 2), 'dollars.')

In [None]:
mape = 100 * (errors2 / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Convert list to DataFrame
df1y_test = pd.DataFrame(prediction_test_RETCO, columns=['values'])

# Save DataFrame to a CSV file
df1y_test.to_csv('prediction_test_RETCO.csv', index=False)

### prediction_test_RETCO

In [None]:
covariates_pred_test = df_test.iloc[:, :-NoCov].values
col_names = df_test.iloc[:, :-NoCov].columns

In [None]:
prediction_test_RETCO = RETCOfile.predictionFun(col_names, covariates_pred_test, RETCO)

In [104]:
col_names

Index(['Age', 'Overall', 'Potential', 'Wage', 'Special', 'PreferredFoot',
       'InternationalReputation', 'WeakFoot', 'SkillMoves', 'Height',
       'Weight'],
      dtype='object')

In [108]:
y_test = df_test['y'].values

In [110]:
prediction_test_RETCO = RETCOfile.predictionFun(col_names, covariates_pred_test, RETCO)

In [92]:
errors2 = abs(y_test-prediction_test_RETCO)
# Print out the mean absolute error (mae)
print('Mean Absolute RETCO Error:', round(np.mean(errors2), 2), 'dollars.')

Mean Absolute RETCO Error: 201.72 dollars.


In [78]:
errors2

array([[287.24500488],
       [353.06587299],
       [328.08378103],
       [288.80413864],
       [420.37833349],
       [322.67236787],
       [366.29838792],
       [476.23061645],
       [378.37837629],
       [277.67512225],
       [128.19499155],
       [320.96583302],
       [316.8546917 ],
       [266.71470926],
       [239.49405412],
       [140.1552158 ],
       [310.52368016],
       [299.60081962],
       [276.64699342],
       [456.63464113],
       [161.46880449],
       [382.6586571 ],
       [490.86267894],
       [317.76373002],
       [437.37791988],
       [206.66491036],
       [171.00851093],
       [247.05134849],
       [333.64609787],
       [168.10295219],
       [295.3255621 ],
       [219.90579192],
       [178.38752186],
       [156.99838334],
       [272.0551569 ],
       [397.24084871],
       [333.51322807],
       [157.28356706],
       [499.90895146],
       [244.98332041],
       [120.57999883],
       [302.07374595],
       [388.49528084],
       [339

In [93]:
mape = 100 * (errors2 / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 82.99 %.


In [80]:
# Convert list to DataFrame
df1y_test = pd.DataFrame(prediction_test_RETCO, columns=['values'])

# Save DataFrame to a CSV file
df1y_test.to_csv('prediction_test_RETCO_lab2.csv', index=False)