In [1]:
!pip3 install pydotplus
!pip install graphviz



In [2]:
import statistics 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import randint

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score

#Library for decision tree
from sklearn import tree
from sklearn.metrics import classification_report
import pydotplus 
from IPython.display import Image
#confusion matrix
from sklearn.metrics import confusion_matrix,plot_confusion_matrix


## Load dataset

In [3]:
df = pd.read_csv("dataset/tennis_players.csv", skipinitialspace=True, sep=',', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2072 entries, 0 to 3885
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   2072 non-null   object 
 1   ratio                  2072 non-null   float64
 2   num_matches_2016-2019  2072 non-null   float64
 3   ratio_2016-2019        2072 non-null   float64
 4   num_matches_2020-2021  2072 non-null   float64
 5   ratio_2020-2021        2072 non-null   float64
 6   hand                   2072 non-null   object 
 7   gender                 2072 non-null   object 
 8   ioc                    2072 non-null   object 
 9   birth                  2072 non-null   float64
 10  ht                     2072 non-null   float64
 11  minutes                2072 non-null   float64
 12  perc_ace               2072 non-null   float64
 13  serv_won_tot_seve      2072 non-null   float64
 14  bpFaced                2072 non-null   float64
 15  perc

### Adding player's rank

In [4]:
df_rank = pd.read_csv('../Task1/dataset/matches_datacleaning.csv', index_col = 0)
pd.set_option('display.max_columns', None) # to visualize all the columns
df_rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181501 entries, 0 to 185763
Data columns (total 50 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tourney_id          181501 non-null  object 
 1   tourney_name        181501 non-null  object 
 2   surface             181354 non-null  object 
 3   draw_size           181501 non-null  float64
 4   tourney_level       181501 non-null  object 
 5   tourney_date        181501 non-null  object 
 6   match_num           181501 non-null  float64
 7   winner_id           181501 non-null  float64
 8   winner_entry        25298 non-null   object 
 9   winner_name         181501 non-null  object 
 10  winner_hand         181501 non-null  object 
 11  winner_ioc          181501 non-null  object 
 12  winner_age          178681 non-null  float64
 13  loser_id            181501 non-null  float64
 14  loser_entry         43307 non-null   object 
 15  loser_name          181501 non-nul

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
winner_rank = df_rank[['winner_name', 'winner_rank']]
winner_rank.set_axis(['name', 'rank'], axis=1, inplace=True)

loser_rank = df_rank[['loser_name', 'loser_rank']]
loser_rank.set_axis(['name', 'rank'], axis=1, inplace=True)

player_rank = loser_rank.append(winner_rank)
player = df[['name']]
player = pd.merge(player, player_rank, how='left', on='name')
print(player, '\n# of nan:', player['rank'].isna().sum())

0         1037.0
1         1254.0
2         1255.0
3         1047.0
4         1055.0
           ...  
257926     343.0
257927     335.0
257928     316.0
257929     318.0
257930     316.0
Name: rank, Length: 257931, dtype: float64 
# of nan: 7622


In [6]:
#osserviamo quanti giocatori hanno QUALCHE nan
nan_name = player[player['rank'].isna()]
nan_name['name'].nunique()

763

In [7]:
#prendiamo la media del rank e vediamo se sono rimasti nan (per quelli che avranno nan come rank vuol dire che nel db originale 
# non era presente nessun valore di rank per quel giocatore)
player = player.groupby('name').mean()
player.isna().sum()

rank    35
dtype: int64

In [22]:
player[player['rank'].isna()]

Unnamed: 0_level_0,rank
name,Unnamed: 1_level_1
ABIR EL FAHIMI,
ALEX LAWSON,
AMAL SULTANBEKOV,
AMARNI BANKS,
ANASTASIA IAMACHKINE,
ANNA BOWTELL,
BARBORA PALICOVA,
BREANA STAMPFLI,
CHARLOTTE KEMPENAERS POCZ,
DANIELLE ANDREA THOMPSON,


In [None]:
#POSSIAMO PROCEDERE IN diversi MODI:
# 1.cerchiamo di risolvere i nan 
# 2. lasciamo i giocatori che hanno nan e assegniamo loro un label speciale (non è insolito, bisogna vedere meglio come trattarli)
# 3. altro


## Transform categorical features into numerical ones

In [8]:
#function to discretize the variables
#input: the dataset and the list of variables' names to discretize
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())
        
        #generate a mapping from the variable's values to the number representation  
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset

In [9]:
#discretize the categorical variables
variables = ['hand', 'gender', 'ioc']
df = discretize_data(df, variables)

In [10]:
#drop the categorical variables since we don't need them anymore 
df.drop(columns=['name','hand', 'gender', 'ioc'], axis=1,inplace=True)


In [11]:
df.describe()

Unnamed: 0,ratio,num_matches_2016-2019,ratio_2016-2019,num_matches_2020-2021,ratio_2020-2021,birth,ht,minutes,perc_ace,serv_won_tot_seve,bpFaced,perc_df,perc_2ndwon,perc_v_ace,perc_v_df,perc_v_1stwon,hand_num,gender_num,ioc_num
count,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0,2072.0
mean,0.486609,100.942085,0.478162,23.541988,0.309913,1993.805502,179.104946,80.432832,6.716569,0.523839,871.927124,7.450516,43.113605,6.698571,6.355497,54.31431,1.176641,0.505792,44.944498
std,0.115073,78.005562,0.134808,26.280039,0.264866,5.063032,8.00887,16.122646,5.324856,0.146481,720.860044,3.444339,12.302156,4.583606,1.962237,19.24605,0.57105,0.500087,29.383334
min,0.133333,0.0,0.0,0.0,0.0,1977.0,157.0,55.472222,0.0,0.037581,0.0,0.0,0.0,0.07,0.0,0.91,0.0,0.0,0.0
25%,0.423077,28.0,0.415466,0.0,0.0,1990.0,173.0,62.60785,2.4,0.505333,230.0,5.22,40.77,2.55,5.39,39.4275,1.0,0.0,20.0
50%,0.5,82.0,0.5,12.0,0.375,1994.0,178.0,82.359127,5.755,0.580575,638.5,6.95,47.295,6.155,6.615,64.305,1.0,1.0,41.0
75%,0.565673,170.0,0.566866,45.0,0.533333,1998.0,185.705882,94.008066,9.84,0.614162,1446.5,9.1725,50.1125,10.3125,7.5525,69.8,2.0,1.0,73.0
max,0.845161,309.0,1.0,101.0,1.0,2006.0,198.0,145.073643,35.39,0.949992,3834.0,29.86,94.18,23.31,17.89,80.79,2.0,1.0,95.0


## Labels

Abbiamo bisogno di capire qual'è il target associato ad ogni insieme per fare il processo di classificazione, un idea potrebbe essere utilizzando un attributo specifico che tiene tratta delle "performance" di ogni giocatore, in modo da poter contraddistingure i giocatori più forti da quelli più deboli.

Nota: questa non è una metrica efficate perchè il vero label del giocatore viene calcolato in base al relativo ranking ma nel nostro caso avendo molti null risulta difficile stimarlo quindi se non ci sono altre alternative potrebbe essere la soluzione più efficente 

Quindi possiamo usare i percentili o i quartili

In [12]:
# label tipo1
quantile_a = df.ratio.quantile(0.33)
quantile_b = df.ratio.quantile(0.66)

# label tipo2
#quantile_a = df.ratio.quantile(0.25)
#quantile_b = df.ratio.quantile(0.75)

In [13]:
labels = []
for customer, ratio in df.ratio.items():
    if ratio < quantile_a:
        labels.append(0)
    elif ratio > quantile_b:
        labels.append(2)
    else:
        labels.append(1)

In [14]:
class_names = ['low-level', 'medium-level', 'high-level']

In [15]:
classes = pd.DataFrame()
classes['labels'] = labels
classes.labels.value_counts()

2    705
1    684
0    683
Name: labels, dtype: int64

### Prepare dataset (splitting)

In [16]:
train_set, test_set, train_label, test_label = train_test_split(df, labels, stratify=labels, test_size=0.30)

# Classification 

## Classfication with decision tree

### Grid Search

In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

dt_max_depth = [2,3,5,6,7,10,12, None]
dt_min_samples_split = randint(2, 51)
min_samples_leaf = randint(1, 51)
criterion = ["entropy", "gini"]
splitter = ["best", "random"]
max_features = [None, 2, 3, 4, 5]

dt_param_grid = {
    "max_depth": dt_max_depth,
    "min_samples_split": dt_min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "criterion": criterion,
    "splitter": splitter,
    "max_features": max_features
            }

#define the number of iters (random combination to try)
n_iter_search = 500

#define the number of jobs to run in parallel
n_jobs_search = -1

#define the grid search
dt_grid = RandomizedSearchCV(tree.DecisionTreeClassifier(), param_distributions=dt_param_grid, 
                            n_iter=n_iter_search,  
                            n_jobs=n_jobs_search,
                            scoring=make_scorer(accuracy_score))

In [18]:
dt_grid.fit(train_set, train_label)

RandomizedSearchCV(estimator=DecisionTreeClassifier(), n_iter=500, n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [2, 3, 5, 6, 7, 10, 12,
                                                      None],
                                        'max_features': [None, 2, 3, 4, 5],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E8D59400D0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E8D5944A90>,
                                        'splitter': ['best', 'random']},
                   scoring=make_scorer(accuracy_score))

In [19]:
print(dt_grid.best_params_)


{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 15, 'splitter': 'best'}


### Training

In [20]:
#define a decision tree and fit it
dt = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', 
                                  max_depth=10, 
                                  min_samples_split=11, min_samples_leaf=7)
dt = dt.fit(train_set, train_label)

In [21]:
#visualize the actual decision tree obtained  
dot_data = tree.export_graphviz(dt, out_file=None, 
                         feature_names=list(train_set.columns),  
                         class_names=class_names,  
                         filled=True, rounded=True)  

graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())


InvocationException: GraphViz's executables not found

In [None]:
#predict using the decision tree
#the predict function returns the actual predicted labels: we need them for the evaluation phase
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

### Evaluation of the decision tree: compute accuracy, precision, recall, confusion matrix


In [None]:
#evaulate the accuracy on the train set and the test set
#metrics also contains precision, recall, f1 and the support
print('Accuracy train set ', metrics.accuracy_score(train_label, train_pred_dt))
print('Accuracy test set ', metrics.accuracy_score(test_label, test_pred_dt))
print('Precision train set ', metrics.precision_score(train_label, train_pred_dt, average='weighted'))
print('Recall train set ', metrics.recall_score(train_label, train_pred_dt, average='weighted'))
print('F1 score train set ', metrics.f1_score(train_label, train_pred_dt, average='weighted'))
print('Support train set ', metrics.precision_recall_fscore_support(train_label, train_pred_dt))

In [None]:
#metrics computed on the test set
def report_scores(test_label, test_pred):
    print(classification_report(test_label, 
                            test_pred, 
                            target_names=class_names))

In [None]:
report_scores(test_label, test_pred_dt)

In [None]:
#cross validation
#the score array for test/train scores on each cv split
#the time for fitting/scoring the estimator on the train set for each cv split
scores = cross_validate(dt, train_set, train_label, cv=3, return_train_score= True)
print('Fit time ', statistics.mean(scores['fit_time']))
print('Score time ', statistics.mean(scores['score_time']))
print('Test score ', statistics.mean(scores['test_score']))
print('Train score ', statistics.mean(scores['train_score']))

In [None]:

cm = confusion_matrix(test_label, test_pred_dt)
cm

In [None]:
#it is possible to plot the confusion matrix 
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
plot_confusion_matrix(dt, test_set, test_label)
plt.show() 

## Other Classification technique