In [None]:
import math
import statistics 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import randint as sp_randint
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import scikitplot as skplt

#confusion matrix
from sklearn.metrics import confusion_matrix,plot_confusion_matrix


from sklearn.tree import export_graphviz
import graphviz
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

## Load dataset

In [None]:
# Load the per-player table; the first CSV column becomes the index and
# whitespace following each delimiter is skipped.
df = pd.read_csv("dataset/tennis_players.csv", skipinitialspace=True, sep=',', index_col=0)
df.info()

### Adding player's rank

In [None]:
# Match-level table read from the Task1 folder; the cells below use its
# winner/loser name and rank columns.
df_rank = pd.read_csv('../Task1/dataset/matches_datacleaning.csv', index_col = 0)
pd.set_option('display.max_columns', None) # to visualize all the columns
df_rank.info()

In [None]:
# Build one long (name, rank) table with a row for every appearance of a player
# in a match, either as winner or as loser.
# `rename` and `pd.concat` are used because `set_axis(..., inplace=True)` and
# `DataFrame.append` no longer exist in pandas 2.0; the in-place rename on a
# column slice also raised SettingWithCopyWarning on older versions.
winner_rank = df_rank[['winner_name', 'winner_rank']].rename(
    columns={'winner_name': 'name', 'winner_rank': 'rank'})

loser_rank = df_rank[['loser_name', 'loser_rank']].rename(
    columns={'loser_name': 'name', 'loser_rank': 'rank'})

player_rank = pd.concat([loser_rank, winner_rank])

# Left merge keeps every player of `df`; players never seen in the match table
# end up with a NaN rank.
player = df[['name']]
player = pd.merge(player, player_rank, how='left', on='name')
print(player, '\n# of nan:', player['rank'].isna().sum())

In [None]:
# count how many distinct players have at least one missing rank value
nan_name = player[player['rank'].isna()]
nan_name['name'].nunique()

In [None]:
# Collapse to one row per player: the mean of that player's ranks, rounded to
# the nearest integer. A player whose rank is still NaN after this step had no
# rank value at all in the original match data.
player = player.groupby('name').mean().round({'rank': 0})
player['rank'].describe()

In [None]:
# number of missing values per column after averaging
player.isna().sum()

In [None]:
# number of distinct (rounded) mean-rank values
player['rank'].nunique()

In [None]:
# players that still have no rank after averaging
player[player['rank'].isna()]

In [None]:
# Turn the 'name' index produced by the groupby back into an ordinary column,
# so that it can be used as a merge key.
player = player.reset_index()
player

In [None]:
# Attach each player's mean rank to the feature table; the left join keeps
# every row of df.
df = df.merge(player, on='name', how='left')

df.describe()

## Transform categorical features into numerical ones

In [None]:
def discretize_data(dataset, variables):
    """Add an integer-encoded copy of each categorical column.

    For every name in ``variables`` a new column ``<name>_num`` is added to
    ``dataset``. Its values are the positions of the original values in the
    sorted list of distinct non-missing values (0, 1, 2, ...). Missing values
    are encoded as -1 instead of making the function fail.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to extend. It is modified in place.
    variables : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the new ``*_num`` columns.
    """
    for variable in variables:
        # Missing values are left out of the category list: sorting a mix of
        # strings and NaN/None raises TypeError.
        categories = sorted(dataset[variable].dropna().unique())

        # value -> position in the sorted category list
        mapping = {value: code for code, value in enumerate(categories)}

        # Values absent from the mapping (i.e. missing ones) come back as NaN
        # from `map`; they are given the code -1 so the column can be integer.
        codes = dataset[variable].map(mapping)
        dataset[variable + '_num'] = codes.fillna(-1).astype(int)
    return dataset

In [None]:
# Integer-encode the categorical variables; each gets a companion '<name>_num' column.
variables = ['hand', 'gender', 'ioc']
df = discretize_data(df, variables)

In [None]:
# The player name and the original categorical columns are not needed any more.
df = df.drop(columns=['name', 'hand', 'gender', 'ioc'])


In [None]:
# check the columns and dtypes that remain after the encoding
df.info()

## Labels

Abbiamo bisogno di capire qual è il target associato ad ogni insieme per fare il processo di classificazione; un'idea potrebbe essere utilizzare un attributo specifico che tiene traccia delle "performance" di ogni giocatore, in modo da poter contraddistinguere i giocatori più forti da quelli più deboli.

Nota: questa non è una metrica efficace, perché il vero label del giocatore viene calcolato in base al relativo ranking; nel nostro caso, però, avendo molti valori null risulta difficile stimarlo, quindi, se non ci sono altre alternative, potrebbe essere la soluzione più efficiente.

Quindi possiamo usare i percentili o i quartili

In [None]:
# display the players ordered by rank (the result is not stored)
df.sort_values(by=['rank'])

In [None]:
# label type 1: threshold at the first quartile (25th percentile) of the rank
quantile_a = df['rank'].quantile(0.25)


In [None]:
# Players without a rank cannot be labelled: keep their features aside as an
# unlabelled set, without the (all-NaN) rank column.
blindtest = df.loc[df['rank'].isna()].drop(columns='rank')

In [None]:
# remove the players without a rank from the labelled dataset
df = df.drop(df[df['rank'].isna()].index)


In [None]:
# Binary target: 0 when the rank is within the first quartile (rank <= quantile_a),
# 1 otherwise.
labels = [0 if rank <= quantile_a else 1 for rank in df['rank']]

In [None]:
# Display names passed as target_names to classification_report:
# position 0 is used for label 0, position 1 for label 1.
class_names = ['high-level', 'low-level']

In [None]:
# Relative frequency of each label, to see how unbalanced the two classes are.
classes = pd.DataFrame({'labels': labels})
classes['labels'].value_counts(normalize=True)

### Prepare dataset (splitting)

In [None]:
# the labels were derived from 'rank', so the column is removed from the features
del df['rank']

In [None]:
# Stratify on the labels: the classes are unbalanced, and stratification keeps
# the same class proportions in the train and in the test set.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
train_set, test_set, train_label, test_label = train_test_split(
    df, labels, stratify=labels, test_size=0.25, random_state=42)

In [None]:
# number of training examples
np.size(train_label)

# Classification 

## Classification with decision tree

In [None]:
#Library for decision tree
from sklearn import tree
from sklearn.metrics import classification_report
import pydotplus 
from IPython.display import Image

### Grid Search

In [None]:
# Hyper-parameter space explored for the decision tree.
dt_max_depth = [2,3,5,6,7,10,12,None]
dt_min_samples_split = sp_randint(2, 51)
min_samples_leaf = sp_randint(1, 51)
criterion = ["entropy", "gini"]
splitter = ["best", "random"]
max_features = [None, 2, 3, 4, 5]

# GridSearchCV needs explicit value lists, so five values are drawn from each
# integer distribution. The draws are seeded so the grid is the same on every
# run, and np.unique removes repeated draws that would only add redundant fits.
dt_param_grid = {
    "max_depth": dt_max_depth,
    "min_samples_split": np.unique(dt_min_samples_split.rvs(size=5, random_state=0)),
    "min_samples_leaf": np.unique(min_samples_leaf.rvs(size=5, random_state=1)),
    "criterion": criterion,
    "splitter": splitter,
    "max_features": max_features,
    }


#define the grid search
# The tree is seeded because splitter="random" and max_features < n_features
# make the fit itself random.
# NOTE(review): f1_score is called with its defaults here, i.e. it scores the
# class labelled 1 - confirm that this is the class of interest.
dt_grid = GridSearchCV(tree.DecisionTreeClassifier(random_state=42), param_grid=dt_param_grid,
                            scoring=make_scorer(f1_score))

In [None]:
# Run the grid search on the training data, report the winning parameters and
# predict both splits with the fitted search object.
best_model = dt_grid.fit(train_set, train_label)
print(dt_grid.best_params_)
train_pred_dt = dt_grid.predict(train_set)
test_pred_dt = dt_grid.predict(test_set)

In [None]:
# predict a label for the players that had no rank (no ground truth available)
blind_pred_dt = dt_grid.predict(blindtest)

In [None]:
# Export the best tree found by the grid search as Graphviz DOT source
# (out_file=None returns the source as a string).
dot_data = export_graphviz(best_model.best_estimator_, out_file=None, 
            filled=True, rounded=True, class_names=['0','1'])

In [None]:
# Parse the DOT source with pydotplus and print it as text.
graph = pydotplus.graph_from_dot_data(dot_data)
print(graph.to_string())
#Image(graph.create_png())  # TODO: fix the PNG rendering


### Evaluation of the decision tree

In [None]:
# evaluate the decision tree on the train set
# the report contains precision, recall, f1 and the support
# NOTE: it may be useful to compute the probability of a record belonging to
# one class or the other (useful for the ROC curve)
print(classification_report(train_label, train_pred_dt, target_names=class_names))

In [None]:
# same report for the decision tree on the held-out test set
print(classification_report(test_label, test_pred_dt, target_names=class_names))

In [None]:
# Confusion matrix of the decision tree on the test set.
# The matrix is drawn from the already computed `cm` (previously unused) with
# ConfusionMatrixDisplay: the plot_confusion_matrix helper was removed in
# scikit-learn 1.2, while ConfusionMatrixDisplay exists in every release that
# shipped that helper.
cm = confusion_matrix(test_label, test_pred_dt)
metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names).plot()
plt.show()

In [None]:
# class-membership probabilities on the test set, used to draw the ROC curves
test_pred_proba_dt = dt_grid.predict_proba(test_set)
skplt.metrics.plot_roc(test_label, test_pred_proba_dt)

In [None]:
# test set: 'minutes' (x) against 'perc_ace' (y), points coloured by true label
plt.scatter(test_set['minutes'], test_set['perc_ace'], c=test_label, s=10);

## Bayesian classifier

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [None]:
# fit a Gaussian naive Bayes classifier on the training data
gnb = GaussianNB()
gnb.fit(train_set, train_label)
# predict on the train set
train_pred_gnb = gnb.predict(train_set)
#predict on the test set
test_pred_gnb = gnb.predict(test_set)

In [None]:
#compute the performance of the naive Bayes model on the train set
print(classification_report(train_label, train_pred_gnb, target_names=class_names))

In [None]:
# performance of the naive Bayes model on the test set
print(classification_report(test_label, test_pred_gnb, target_names=class_names))

In [None]:
# ROC curves of the naive Bayes model, from its class probabilities on the test set
test_pred_proba_gnb = gnb.predict_proba(test_set)
skplt.metrics.plot_roc(test_label, test_pred_proba_gnb)

### 

## Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
def create_model(n_neurons1, activation1, n_neurons2, activation2, activation_out, loss, input_dim=19):
    """Build and compile a feed-forward network with two hidden layers.

    Parameters
    ----------
    n_neurons1, n_neurons2 : int
        Number of units of the first and of the second hidden layer.
    activation1, activation2 : str
        Activation functions of the first and of the second hidden layer.
    activation_out : str
        Activation function of the single output unit.
    loss : str
        Loss passed to ``model.compile``.
    input_dim : int, default 19
        Number of input features. The default keeps the previously hard-coded
        value; pass the real number of columns when the feature set changes.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(n_neurons1, input_dim=input_dim, activation=activation1))
    model.add(Dense(n_neurons2, activation=activation2))
    # single output unit for the binary target
    model.add(Dense(1, activation=activation_out))
    # Compile model (no optimizer is given, so the Keras default is used)
    model.compile(loss=loss, metrics=["accuracy", "MeanSquaredError"])
    return model

In [None]:
# Hyper-parameter space for the network built by create_model.
grid_param = {
    "n_neurons1":[3,7,22],
    "activation1":["sigmoid","softmax"],
    "n_neurons2":[5,15],
    "activation2":["relu","softmax"],
    "activation_out":["relu","sigmoid"],
    "loss":["binary_crossentropy", "mean_squared_error"]
}

# The space above holds 3*2*2*2*2*2 = 96 combinations, fewer than the 100
# iterations that were requested; n_iter is capped at the size of the space so
# the search does not ask for more candidates than exist.
n_candidates = min(100, int(np.prod([len(values) for values in grid_param.values()])))

nn = KerasClassifier(build_fn=create_model, epochs=150, batch_size=32)
# random_state fixes which candidates are sampled if the space ever grows
# beyond n_iter.
nn_grid = RandomizedSearchCV(nn, param_distributions=grid_param, n_iter=n_candidates, n_jobs=-1, cv=5,
                             scoring=make_scorer(f1_score), random_state=42)

In [None]:
# Run the randomized search for the network, report the winning parameters and
# predict both splits.
# NOTE: this rebinds `best_model`, which previously held the decision-tree search.
best_model = nn_grid.fit(train_set, train_label)
print(nn_grid.best_params_)
train_pred_nn = nn_grid.predict(train_set)
test_pred_nn = nn_grid.predict(test_set)

In [None]:
# performance of the neural network on the train set
print(classification_report(train_label, train_pred_nn, target_names=class_names))

In [None]:
# performance of the neural network on the test set
print(classification_report(test_label, test_pred_nn, target_names=class_names))

In [None]:
## Knn

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Feature tables without the integer-encoded categorical columns ('*_num').
# NOTE(review): presumably they are excluded because their codes carry no
# meaningful distance for KNN - confirm.
train_set_no_cat = train_set.loc[:, ~train_set.columns.str.contains('_num', case=False)]
test_set_no_cat = test_set.loc[:, ~test_set.columns.str.contains('_num', case=False)]

# Candidate neighbourhood sizes: five consecutive integers centred on
# sqrt(number of training rows). Values below 1 are discarded because
# n_neighbors must be positive (this only matters for very small training sets).
k = int(round(math.sqrt(len(train_set)), 0))
k_range = [candidate for candidate in range(k - 2, k + 3) if candidate >= 1]
k_range

In [None]:
# Hyper-parameter grid for KNN: neighbourhood size, distance metric, search
# algorithm and vote weighting.
knn_param_grid = {
    'n_neighbors': k_range,
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'weights': ['uniform', 'distance']
            }

# exhaustive search with 5-fold cross-validation, scored with F1
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5, scoring=make_scorer(f1_score))

In [None]:
# fit the KNN grid search on the features without the encoded categorical columns
knn_grid.fit(train_set_no_cat, train_label)

In [None]:
# predict both splits with the fitted KNN search
train_pred_knn = knn_grid.predict(train_set_no_cat)
test_pred_knn = knn_grid.predict(test_set_no_cat)

In [None]:
# performance of KNN on the train set
print(classification_report(train_label, train_pred_knn, target_names=class_names))

In [None]:
# performance of KNN on the test set
print(classification_report(test_label, test_pred_knn, target_names=class_names))