<img src="https://github.com/TarikKaanKoc/SCOUT-UM-Talenter.Hunting.Classification/blob/main/image_read.png?raw=True" width="8000" align="center">

<div class="alert alert-primary" style="margin-top: 20px">

    
<strong><h2>Business Problem</h2></strong><p>
    
Predicting which class (average or highlighted) a player belongs to, based on the scores given to the player's attributes. (Scenario)</p>
    
    
<strong><h2>Dataset Story</h2></strong>
<blockquote><p><strong>The data set consists of information from Scoutium, which includes the features and scores of the football players evaluated by the scouts according to the characteristics of the footballers observed in the matches.</strong></p>
</blockquote>

</div>

---

<strong><h3>Scoutium Attributes CSV File</h3></strong>

---
<li><strong>Total Features : 8</strong></li>
<li><strong>Total Rows : 10,730</strong> </li>
<li><strong>CSV File Size : 527 KB</strong></li>
    
---

 Sr. | Feature  | Description |
--- | --- | --- 
1 | task_response_id | The set of a scout's assessments of all players on a team's roster in a match
2 | match_id | The id of the match
3 | evaluator_id | The id of the evaluator(scout)
4 | player_id | The id of the player
5 | position_id | The id of the position played by the relevant player in that match. 1-Goalkeeper, 2-Stopper, 3-Right-back, 4-Left-back, 5-Defensive midfielder, 6-Central midfield, 7-Right wing, 8-Left wing, 9-Attacking midfielder, 10-Striker
6 | analysis_id | A set containing a scout's attribute evaluations of a player in a match
7 | attribute_id | The id of each attribute the players were evaluated for
8 | attribute_value | Value (points) given by a scout to a player's attribute


---
    

<strong><h3>Scoutium Potential Labels CSV File</h3></strong>
    
---
<li><strong>Total Features : 5</strong></li>
<li><strong>Total Rows : 322</strong> </li>
<li><strong>CSV File Size : 12 KB</strong></li>
    
---

 Sr. | Feature  | Description |
--- | --- | --- 
1 | task_response_id | The set of a scout's assessments of all players on a team's roster in a match
2 | match_id | The id of the match
3 | evaluator_id | The id of the evaluator(scout)
4 | player_id | The id of the player
5 | potential_label | Label showing the final decision of an observer regarding a player in the match. (target variable)

---

In [None]:
import pandas as pd
import numpy as np
from termcolor import colored
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,roc_auc_score
pd.set_option('display.width', 600)

In [None]:
# Load the two Scoutium CSV files; both are read as semicolon-separated.
# sc_attributes: the attribute scores file; sc_potential_labels: the label file.
sc_attributes = pd.read_csv('../input/scoutium/scoutium_attributes.csv', sep=';')
sc_potential_labels = pd.read_csv('../input/scoutium/scoutium_potential_labels.csv', sep=';')

In [None]:
# Attach the attribute scores to the labels on the four shared id columns.
# A right join keeps every row of the labels table.
result = sc_attributes.merge(
    sc_potential_labels,
    on=["task_response_id", "match_id", "evaluator_id", "player_id"],
    how="right",
)

In [None]:
# Work on a copy so that `result` keeps the untouched merge output.
df = result.copy()
# Preview the first rows of the working frame.
df.head()

In [None]:
def missing_values_analysis(df):
    """Summarise the missing values of ``df`` per column.

    Only columns that contain at least one null are reported.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with two columns:
        'Total Missing Values' (null count) and 'Ratio' (percentage of
        rows that are null, rounded to 2 decimals), sorted by 'Ratio'
        in descending order.
    """
    cols_with_na = [name for name in df.columns if df[name].isnull().sum() > 0]
    null_totals = df[cols_with_na].isnull().sum()

    counts = null_totals.sort_values(ascending=False)
    percentages = (null_totals / df.shape[0] * 100).sort_values(ascending=False)

    summary = pd.concat(
        [counts, percentages.round(2)],
        axis=1,
        keys=['Total Missing Values', 'Ratio'],
    )
    return pd.DataFrame(summary).sort_values(by="Ratio", ascending=False)


def check_df(df, head=5, tail=5):
    """Print a quick exploratory summary of ``df``.

    Sections are printed in this order: shape, column dtypes, the first
    ``head`` rows, the last ``tail`` rows, the missing-value table from
    ``missing_values_analysis``, the number of fully duplicated rows, and
    selected quantiles of the numeric columns.

    Args:
        df: DataFrame to summarise.
        head: Number of leading rows to print.
        tail: Number of trailing rows to print.

    Returns:
        None. Everything is written to stdout.
    """
    print(" SHAPE ".center(60, '~'))
    print('Observations -------> {}'.format(df.shape[0]))
    print('Features     -------> {}'.format(df.shape[1]))
    print(f"Shape of dataset: {colored(df.shape, 'red')}")
    print(" Types of Features ".center(60, '~'))
    print(df.dtypes,"\n")
    print(" Dataframe - Head ".center(60, '~'))
    print("\n",df.head(head),"\n")
    print(' Dataframe - TAIL '.center(60, '~'))
    print("\n",df.tail(tail),"\n")
    print(" Missing Values Analysis ".center(60, '~'))
    print("\n",missing_values_analysis(df),"\n")
    print(' Duplicate Values Analysis '.center(60, '~'))
    print("\n",df.duplicated().sum(),"\n")
    print(" QUANTILES ".center(60, '~'))
    # Quantiles only make sense for numeric data. Since pandas 2.0
    # `numeric_only` defaults to False, so a frame that also holds text
    # columns would raise a TypeError without this flag.
    quantiles = df.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True)
    print("\n",quantiles.T,"\n")


# Print the exploratory summary (shape, dtypes, head/tail, missing values,
# duplicates, quantiles) for the merged frame.
check_df(df)

In [None]:
# Remove, in place, every row whose position_id is 1.
df.drop(index=df.loc[df['position_id'] == 1].index, inplace=True)

In [None]:
# Row count per position_id, to check which positions remain in df.
df[['position_id']].value_counts()

In [None]:
# Remove, in place, every row labelled 'below_average'.
df.drop(index=df.loc[df['potential_label'] == 'below_average'].index, inplace=True)

In [None]:
# Row count per potential_label, to check which classes remain in df.
df[['potential_label']].value_counts()

In [None]:
# Report the (rows, columns) shape of df at this point, highlighted in red.
print(f"The shape of DataFrame is {colored(df.shape,'red')}")

In [None]:
# Reshape from long to wide: one row per (player_id, position_id,
# potential_label) and one column per attribute_id, holding attribute_value.
# pivot_table aggregates repeated combinations with its default aggfunc.
output = df.pivot_table(
    values='attribute_value',
    index=['player_id', 'position_id', 'potential_label'],
    columns=['attribute_id'],
)
output

In [None]:
# Inspect index, column dtypes and non-null counts of the pivoted table.
output.info()

In [None]:
# Turn the pivot index levels (player_id, position_id, potential_label)
# back into ordinary columns.
output.reset_index(inplace=True)
# Only the column labels need to be strings (the pivot leaves the
# attribute_id values as labels next to the string-named id columns).
# Casting the whole frame with astype(str) would turn every numeric score
# into text, so the cast is applied to the labels only.
output.columns = output.columns.astype(str)
output

In [None]:
# Re-check column dtypes and non-null counts after flattening the index.
output.info()

In [None]:
def label_encoder(df, column):
    """Replace ``df[column]`` with integer codes from a fresh LabelEncoder.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding the column to encode.
        column: Name of the column to encode.

    Returns:
        The same DataFrame object, with ``column`` overwritten by its codes.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[column])
    df[column] = codes
    return df

# Encode the target column in place; potential_label becomes integer codes.
# NOTE(review): presumably average -> 0 and highlighted -> 1, since
# LabelEncoder orders classes lexicographically; confirm via classes_.
output = label_encoder(output, 'potential_label')
output.head()

In [None]:
# Make every column label a string so that labels coming from attribute_id
# and the named id columns share one type.
output.columns = output.columns.astype(str)
output.columns 

In [None]:
# Skip the first three columns (player_id, position_id, potential_label,
# i.e. the former pivot index levels); the remainder are attribute columns.
num_cols = output.columns[3:]
num_cols

In [None]:
# Standardise the attribute columns in place.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed further down, so test rows influence the scaling statistics;
# consider fitting on the training split only.
ss = StandardScaler()
output[num_cols] = ss.fit_transform(output[num_cols])

In [None]:
# Target vector: the encoded potential_label column.
y = output["potential_label"]
# Feature matrix: everything except the target and the player identifier
# (position_id stays in as a feature).
X = output.drop(["potential_label", "player_id"], axis=1)

In [None]:
# Hold out 20% of the rows for testing. The split is shuffled, stratified on
# y so both parts keep the class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=123,
)

# Report the size of each part.
print(
    f"The shape of X_train is --> {colored(X_train.shape,'red')}",
    f"The shape of X_test is  --> {colored(X_test.shape,'red')}",
    f"The shape of y_train is --> {colored(y_train.shape,'red')}",
    f"The shape of y_test is  --> {colored(y_test.shape,'red')}",
    sep="\n",
)

In [None]:
# Lets create model:
def classification_models(model):
    """Fit ``model``, score it on the test split and plot the metrics.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test``. It computes accuracy, ROC-AUC, F1, precision and recall
    and shows them as a horizontal bar chart titled with the model's class
    name.

    Args:
        model: Unfitted classifier exposing ``fit``, ``predict`` and
            ``predict_proba``.

    Returns:
        None. The chart is rendered with ``iplot``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score of the second class, which is
    # what ROC-AUC ranks.
    y_score = model.predict_proba(X_test)[:, 1]

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps precision with recall, and scoring ROC-AUC against the
    # model's own predictions instead of the true labels is meaningless.
    accuracy = accuracy_score(y_test, y_pred)
    roc_score = roc_auc_score(y_test, y_score)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    results=pd.DataFrame({"Values":[accuracy,roc_score,f1,precision,recall],
                         "Metrics":["Accuracy","ROC-AUC","F1","Precision","Recall"]})

    # Visualize Results:
    rounded_values = [round(i,5) for i in results["Values"]]
    fig=make_subplots(rows=1,cols=1)
    fig.add_trace(go.Bar(x=rounded_values,
                        y=results["Metrics"],
                        text=rounded_values,orientation="h",textposition="inside",name="Values",
                        marker=dict(color=["indianred","firebrick","palegreen","skyblue","plum"],line_color="beige",line_width=1.5)),row=1,col=1)
    fig.update_layout(title={'text': model.__class__.__name__ ,
                             'y':0.9,
                             'x':0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},
                      template='plotly_white')
    # All five metrics live in [0, 1]; a fixed axis keeps charts comparable.
    fig.update_xaxes(range=[0,1], row = 1, col = 1)

    iplot(fig)

# Candidate classifiers, all with default hyperparameters.
my_models= [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    GaussianNB()
    ]

# Fit, score and chart each candidate in turn.
for model in my_models:
    classification_models(model)

In [None]:
def plot_importance(model, features, num=len(X), save=False):
    """Draw a bar chart of a fitted model's feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose columns name the features, in the same
            order as ``model.feature_importances_``.
        num: Maximum number of top features to draw. The default is
            evaluated once, when the function is defined, from the
            module-level ``X`` (its row count).
        save: When True, also write the chart to "importances.png".

    Returns:
        None.
    """
    feature_imp = pd.DataFrame({"Value": model.feature_importances_, "Feature": features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title("Features")
    plt.tight_layout()
    # Save before showing: once plt.show() returns the figure may already be
    # released, in which case a later savefig writes an empty image.
    if save:
        plt.savefig("importances.png")
    plt.show()

# Refit a gradient boosting classifier on the full dataset (X, y), not just
# the training split, and chart its feature importances.
model = GradientBoostingClassifier()
model.fit(X, y)

plot_importance(model, X)