## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test.csv')
train = raw_train.copy()
test = raw_test.copy()
pd.set_option('display.max_columns', 60)

# Notes

In [3]:
# Ability -> Scale, change 0s with mean to see diff, 
# Potential -> Fill NaNs with 0s as 'no potential' scale, try changing NaNs with mean, drop the column try again, bin (low med high) , if potential 1 else 0
# Positions -> try 'category' astype first, then ohe
# Caps / Goals -> Seperated to two columns 
# Foot -> astype(int)
# Height -> Scale
# Weight -Scale
# Aerial Reach -> fill nans with 0 first Scale , then try bin (low med high), if aerial 1 else 0, drop
# Command of Area -> fill nans with 0 first Scale , then try bin (low med high), if coa 1 else 0, drop
# Communication -> fill nans with 0 first Scale, if comm 1 else 0, drop
# Eccentricity ->  fill nans with 0 first Scale, if ecc 1 else 0, drop
# Handling -> fill 0, mean, if hand 1 else 0 , drop
# Kicking -> fill 0, mean, if kick 1 else 0 , drop
# Punching Tendency -> fill 0, mean, if kick 1 else 0 , drop
# Reflexes -> fill 0, mean, if kick 1 else 0 , drop
# Rushing Out (Tendency) -> fill 0, mean, if kick 1 else 0 , drop

## Functions

In [4]:
def analyze_dataframe(df):
    """
    Function to analyze a DataFrame by printing:
    - Shape of the DataFrame
    - Information about each column (non-null count and data type)
    - Descriptive statistics for numerical columns
    - Number of missing values in each column
    - List of all column names

    Parameters:
    df (pd.DataFrame): The DataFrame to analyze
    """
    # Print shape of the DataFrame
    print("SHAPE")
    print(df.shape)
    print("\n----------------------------------------------------\n")
    
    # Print column information (non-null count, data types)
    print(df.info())
    print("\n----------------------------------------------------\n")
    
    # Print descriptive statistics for numerical columns
    print(df.describe())
    print("\n----------------------------------------------------\n")
    
    # Print count of missing values in each column
    print(df.isna().sum())
    print("\n----------------------------------------------------\n")
    
    # Print all column names
    print(df.columns)

In [5]:
def analyze_column(df, column_name):
    """
    Function to analyze a column in a DataFrame by printing:
    - Unique values
    - Value counts (including NaNs)
    - Number of missing values (NaNs)

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column
    column_name (str): The name of the column to analyze
    """
    # Print unique values
    print(f"Unique values in '{column_name}':")
    print(df[column_name].unique())
    print(' ')
    
    # Print value counts (including NaNs)
    print(f"Value counts in '{column_name}' (including NaNs):")
    print(df[column_name].value_counts(dropna=False))
    print(' ')
    
    # Print the number of missing values (NaNs)
    print(f"Number of missing values (NaNs) in '{column_name}':")
    print(df[column_name].isna().sum())
    
    print(' ')
    print(df[column_name].describe())

In [6]:
def detect_outlier(df, column_name):
    """
    Lists outlier values in a specified column of a DataFrame using the IQR method.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column_name (str): The name of the column to check for outliers.

    Returns:
    list: A list of outlier values in the specified column.
    """
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    
    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1
    
    # Determine the outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Detect outlier values
    outlier_values = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)][column_name].tolist()
    
    return outlier_values

In [7]:
def plot_distributions(df, column):
    plt.figure(figsize=(4, 2))
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()

In [8]:
def group_positions(positions):
    # Attacking roles
    if any(pos in positions for pos in ['ST', 'AML', 'AMR', 'AMC', 'AMR AML', 'AML ST', 'AMC ST']):
        return 'Attacking'
    # Midfield roles
    elif any(pos in positions for pos in ['MC', 'AMC', 'DM', 'MC AMC', 'DM MC', 'MC MR', 'MC AML AMC']):
        return 'Midfield'
    # Defensive roles
    elif any(pos in positions for pos in ['DC', 'DL', 'DR', 'WBL', 'WBR', 'DL DC', 'DL WBL', 'DL DM MC']):
        return 'Defense'
    # Goalkeeper
    elif any(pos in positions for pos in ['GK']):
        return 'Goalkeeper'
    else:
        return 'Other'

In [9]:
def analyze_and_plot_columns_from(train_df, test_df, start_col_idx):
    """
    This function analyzes and plots the distributions of all columns starting from a specific index in both
    the training and test datasets.

    Parameters:
    - train_df: DataFrame for the training data
    - test_df: DataFrame for the test data
    - start_col_idx: The index from which to start analyzing columns
    """
    # Get all columns starting from the specified index
    columns = train_df.columns[start_col_idx:]
    
    # Loop through each column and perform analysis
    for column in columns:
        analyze_column(train, column)
        print('\n------------------------------------------------------------------\n')
        analyze_column(test, column)

        plot_distributions(train, column) 
        plot_distributions(test, column)

In [10]:
def columns_with_nulls(df):
    """
    This function returns a list of columns in the DataFrame that contain null values.

    Parameters:
    - df: DataFrame to check for null values

    Returns:
    - null_columns: A list of column names with null values
    """
    null_columns = df.columns[df.isnull().any()].tolist()
    return null_columns

In [11]:
def fill_nulls_with_mean(df, columns):
    """
    This function fills null values in the specified columns with their respective mean values.

    Parameters:
    - df: DataFrame in which to fill null values
    - columns: List of column names to fill with mean values
    """
    for column in columns:
        if column in df.columns:  # Check if the column exists in the DataFrame
            mean_value = df[column].mean()  # Calculate the mean value
            df[column] = df[column].fillna(mean_value)  # Fill nulls with mean value

In [12]:
def scale_columns(df, col_list):
    """
    Scales the specified columns in the DataFrame using StandardScaler.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    col_list (list): List of column names to scale.

    Returns:
    pd.DataFrame: The DataFrame with scaled columns.
    """
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit the scaler on the selected columns and transform them
    df[col_list] = scaler.fit_transform(df[col_list])
    
    return df

In [13]:
def assign_based_on_mean(df, col_name):
    # Calculate the mean of the specified column
    mean_value = df[col_name].mean()

    # Assign 0 if below the mean, 1 if above or equal to the mean directly to the column
    df[col_name] = df[col_name].apply(lambda x: 0 if x < mean_value else 1)
    
    return df

In [14]:
front_half_positions = [
    'ST', 'AMC', 'AMR', 'AML', 'DL WBL ML', 
    'AMR AML', 'AMR AMC', 'MC AMC', 'MR AMR', 
    'DL AML', 'DC AMC', 'DR WBR MR', 'AMR', 
    'MC MR', 'MR AMR AML', 'AML AMC ST', 
    'WBL ML', 'DL WBL', 'DM MC AMC', 'DR WBR', 
    'DL DC WBL', 'WBR MR AMR', 'ML AML', 
    'AML ST', 'AMC ST', 'AML AMC', 'WBR MR', 
    'ML AML ST', 'MC AML AMC', 'MR', 'AMR AML ST', 
    'AMR AML AMC', 'MR ML AMR AML', 'DR DC WBR', 
    'WBL AMR AML', 'ML AMR AML ST', 'MC ML', 
    'MR ML'
]

behind_half_positions = [
    'GK', 'DL', 'DC', 'DR', 'MC', 'DL DC', 
    'DR DL', 'DM MC', 'DM', 'DC DM', 'DR', 
    'DL DM MC', 'DR MC', 'DL DC DM', 'DR DC'
]

# Function to assign 1 for front half and 0 for behind half
def assign_half_positions(df, column):
    df[column] = df[column].apply(
        lambda x: 1 if x in front_half_positions else 0
    )


### Null Check

In [15]:
threshold = 30

missing_values = train.isna().sum()
total_cells = len(train) * len(train.columns)
percent_missing = train.isnull().mean() * 100

columns_above_threshold = []

html_content = "<h3>Percentage of missing values in each column:</h3>"
html_content += "<table><tr><th>Column</th><th>Missing Percentage</th></tr>"

for col, percent in zip(missing_values.index, percent_missing):
    if percent > threshold:
        color = "red"
        columns_above_threshold.append(col)
    else:
        color = "green"
    html_content += f"<tr><td>{col}</td><td style='color:{color}'>{percent:.2f}%</td></tr>"

html_content += "</table>"

display(HTML(html_content))

print("\nColumns with more than", threshold, "% missing values:")
print(columns_above_threshold)

Column,Missing Percentage
id,0.00%
value_increased,0.00%
Ability,0.00%
Potential,90.75%
Positions,0.00%
Caps / Goals,96.56%
Foot,0.00%
Height,0.00%
Weight,0.00%
Aerial Reach,93.12%



Columns with more than 30 % missing values:
['Potential', 'Caps / Goals', 'Aerial Reach', 'Command of Area', 'Communication', 'Eccentricity', 'Handling', 'Kicking', 'One on Ones', 'Punching (Tendency)', 'Reflexes', 'Rushing Out (Tendency)', 'Throwing']


In [16]:
threshold = 30

missing_values = test.isna().sum()
total_cells = len(test) * len(test.columns)
percent_missing = test.isnull().mean() * 100

columns_above_threshold = []

html_content = "<h3>Percentage of missing values in each column:</h3>"
html_content += "<table><tr><th>Column</th><th>Missing Percentage</th></tr>"

for col, percent in zip(missing_values.index, percent_missing):
    if percent > threshold:
        color = "red"
        columns_above_threshold.append(col)
    else:
        color = "green"
    html_content += f"<tr><td>{col}</td><td style='color:{color}'>{percent:.2f}%</td></tr>"

html_content += "</table>"

display(HTML(html_content))

print("\nColumns with more than", threshold, "% missing values:")
print(columns_above_threshold)

Column,Missing Percentage
id,0.00%
Ability,0.00%
Potential,88.71%
Positions,0.00%
Caps / Goals,94.84%
Foot,0.00%
Height,0.00%
Weight,0.00%
Aerial Reach,96.13%
Command of Area,96.13%



Columns with more than 30 % missing values:
['Potential', 'Caps / Goals', 'Aerial Reach', 'Command of Area', 'Communication', 'Eccentricity', 'Handling', 'Kicking', 'One on Ones', 'Punching (Tendency)', 'Reflexes', 'Rushing Out (Tendency)', 'Throwing']


In [17]:
#train['value_increased'] = train['value_increased'].apply(lambda x: 1 if x else 0)

### 1-) Ability (Scale)

In [18]:
# train
train['Ability'] = train['Ability'].replace(0, train['Ability'].mean())
analyze_column(train, 'Ability')

Unique values in 'Ability':
[46.         48.         58.         38.         47.         56.
 43.         52.         50.         23.         44.72473118 63.
 40.         53.         65.         59.         39.         29.
 54.         68.         45.         19.         75.         42.
 33.         60.         80.         35.         44.         41.
 62.         57.         20.         49.         28.         61.
 37.         26.         31.         69.         55.         51.
 67.         64.         34.         27.         30.         36.
 70.         24.         32.         71.         78.         76.        ]
 
Value counts in 'Ability' (including NaNs):
Ability
44.724731    35
53.000000    28
48.000000    24
50.000000    24
38.000000    21
43.000000    19
58.000000    18
40.000000    17
52.000000    17
55.000000    16
42.000000    14
44.000000    14
49.000000    13
45.000000    13
60.000000    12
54.000000    11
41.000000    11
57.000000    10
33.000000    10
39.000000     9
46.0

In [19]:
# test
test['Ability'] = test['Ability'].replace(0, test['Ability'].mean())
analyze_column(test, 'Ability')

Unique values in 'Ability':
[40.        42.        39.        38.        45.        57.
 58.        66.        52.        65.        54.        50.
 35.        37.        49.        78.        53.        63.
 43.        44.8516129 69.        68.        48.        55.
 62.        79.        47.        70.        46.        51.
 59.        41.        24.        32.        28.        44.
 33.        56.        31.        60.        30.        29.
 74.        64.        27.        61.        67.        26.
 23.        36.       ]
 
Value counts in 'Ability' (including NaNs):
Ability
44.851613    29
55.000000    15
38.000000    14
58.000000    13
45.000000    12
50.000000    12
43.000000    12
48.000000    12
54.000000    11
40.000000    11
57.000000    11
53.000000    10
52.000000     9
44.000000     9
59.000000     8
60.000000     8
39.000000     8
63.000000     7
51.000000     7
56.000000     7
49.000000     7
47.000000     7
42.000000     6
65.000000     5
46.000000     5
37.000000     

### 2-) Potential

In [20]:
train['Potential'] = train['Potential'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Potential'] = test['Potential'].apply(lambda x: 1 if pd.notna(x) else 0)

### 2-) Potential

In [21]:
# train
train['Potential'] = train['Potential'].fillna(train['Potential'].mean())
# test
test['Potential'] = test['Potential'].fillna(test['Potential'].mean()) 

### 3-) Positions

In [22]:
assign_half_positions(train, 'Positions')
assign_half_positions(test, 'Positions')


# train
train['Positions'] = train['Positions'].apply(group_positions)
analyze_column(train, 'Positions')

# test
test['Positions'] = test['Positions'].apply(group_positions)
analyze_column(test, 'Positions')

train = pd.get_dummies(train, columns = ['Positions'])
test = pd.get_dummies(test, columns = ['Positions'])

### 4-) Caps / Goals

In [23]:
# train
train['Caps / Goals'] = train['Caps / Goals'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Caps / Goals'] = test['Caps / Goals'].apply(lambda x: 1 if pd.notna(x) else 0)

#train
train[['Caps', 'Goals']] = train['Caps / Goals'].str.split(' / ', expand=True).fillna(0)
train['Caps'] = train['Caps'].astype(int)
train['Goals'] = train['Goals'].astype(int)
train = train.drop('Caps / Goals', axis = 1)

# test
test[['Caps', 'Goals']] = test['Caps / Goals'].str.split(' / ', expand=True).fillna(0)
test['Caps'] = test['Caps'].astype(int)
test['Goals'] = test['Goals'].astype(int)
test = test.drop('Caps / Goals', axis = 1)

### 5-) Foot

In [24]:
# train
train['Foot'] = train['Foot'].apply(lambda x: 0 if x == 'Right' else (1 if x == 'Left' else x))
# test
test['Foot'] = test['Foot'].apply(lambda x: 0 if x == 'Right' else (1 if x == 'Left' else (2 if x == 'Both' else x)))

### 6-) Height

In [25]:
# train
train['Height'] = train['Height'].str.extract(r'(\d+)').astype(int)

In [26]:
# test
test['Height'] = test['Height'].str.extract(r'(\d+)').astype(int)

### 8-) Weight

In [27]:
# train
train['Weight'] = train['Weight'].str.extract(r'(\d+)').astype(int)

In [28]:
# test
test['Weight'] = test['Weight'].str.extract(r'(\d+)').astype(int)

### 9-) Aerial Reach (Scale ????)

In [29]:
train['Aerial Reach'] = train['Aerial Reach'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Aerial Reach'] = test['Aerial Reach'].apply(lambda x: 1 if pd.notna(x) else 0)

# train
train['Aerial Reach'] = train['Aerial Reach'].fillna(train['Aerial Reach'].mean())
# test
test['Aerial Reach'] = test['Aerial Reach'].fillna(test['Aerial Reach'].mean())


### 10-) Command of Area

In [30]:
train['Command of Area'] = train['Command of Area'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Command of Area'] = test['Command of Area'].apply(lambda x: 1 if pd.notna(x) else 0)

# train
train['Command of Area'] = train['Command of Area'].fillna(train['Command of Area'].mean())
# test
test['Command of Area'] = test['Command of Area'].fillna(test['Command of Area'].mean())

### 11-) Communication

In [31]:
train['Communication'] = train['Communication'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Communication'] = test['Communication'].apply(lambda x: 1 if pd.notna(x) else 0)

# train
train['Communication'] = train['Communication'].fillna(train['Communication'].mean())
# test
test['Communication'] = test['Communication'].fillna(test['Communication'].mean())

### 12-) Eccentricity

In [32]:
train['Eccentricity'] = train['Eccentricity'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Eccentricity'] = test['Eccentricity'].apply(lambda x: 1 if pd.notna(x) else 0)

# train
train['Eccentricity'] = train['Eccentricity'].fillna(train['Eccentricity'].mean())
# test
test['Eccentricity'] = test['Eccentricity'].fillna(test['Eccentricity'].mean())

### 13-) First Touch (Sadece Scale)

### 14-) Handling

In [33]:
train['Handling'] = train['Handling'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Handling'] = test['Handling'].apply(lambda x: 1 if pd.notna(x) else 0)

train['Handling'] = train['Handling'].fillna(train['Handling'].mean())
test['Handling'] = test['Handling'].fillna(test['Handling'].mean())

### 15-) Kicking

In [34]:
train['Kicking'] = train['Kicking'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Kicking'] = test['Kicking'].apply(lambda x: 1 if pd.notna(x) else 0)

train['Kicking'] = train['Kicking'].fillna(train['Kicking'].mean())
test['Kicking'] = test['Kicking'].fillna(test['Kicking'].mean())

### 16-) One on Ones

In [35]:
train['One on Ones'] = train['One on Ones'].apply(lambda x: 1 if pd.notna(x) else 0)
test['One on Ones'] = test['One on Ones'].apply(lambda x: 1 if pd.notna(x) else 0)

train['One on Ones'] = train['One on Ones'].fillna(train['One on Ones'].mean())
test['One on Ones'] = test['One on Ones'].fillna(test['One on Ones'].mean())

### 17-) Passing (Scale)

### 18-) Punching (Tendency)

In [36]:
train['Punching (Tendency)'] = train['Punching (Tendency)'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Punching (Tendency)'] = test['Punching (Tendency)'].apply(lambda x: 1 if pd.notna(x) else 0)

train['Punching (Tendency)'] = train['Punching (Tendency)'].fillna(train['Punching (Tendency)'].mean())
test['Punching (Tendency)'] = test['Punching (Tendency)'].fillna(test['Punching (Tendency)'].mean())

### 19-) Reflexes

In [37]:
train['Reflexes'] = train['Reflexes'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Reflexes'] = test['Reflexes'].apply(lambda x: 1 if pd.notna(x) else 0)

train['Reflexes'] = train['Reflexes'].fillna(train['Reflexes'].mean())
test['Reflexes'] = test['Reflexes'].fillna(test['Reflexes'].mean())

### 20-) Rushing Out (Tendency)

In [38]:
train['Rushing Out (Tendency)'] = train['Rushing Out (Tendency)'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Rushing Out (Tendency)'] = test['Rushing Out (Tendency)'].apply(lambda x: 1 if pd.notna(x) else 0)

train['Rushing Out (Tendency)'] = train['Rushing Out (Tendency)'].fillna(train['Rushing Out (Tendency)'].mean())
test['Rushing Out (Tendency)'] = test['Rushing Out (Tendency)'].fillna(test['Rushing Out (Tendency)'].mean())

### 21-) Throwing

train['Throwing'] = train['Throwing'].fillna(train['Throwing'].mean())
test['Throwing'] = test['Throwing'].fillna(test['Throwing'].mean())

In [39]:
train['Throwing'] = train['Throwing'].apply(lambda x: 1 if pd.notna(x) else 0)
test['Throwing'] = test['Throwing'].apply(lambda x: 1 if pd.notna(x) else 0)

### 22-) Rest of the Columns

In [40]:
#analyze_and_plot_columns_from(train, test, 21)

In [41]:
null_columns_train = columns_with_nulls(train)
null_columns_test = columns_with_nulls(test)
print(null_columns_train)
print(null_columns_test)

['Corners', 'Crossing', 'Dribbling', 'Finishing', 'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Tackling']
['Corners', 'Crossing', 'Dribbling', 'Finishing', 'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Tackling']


In [42]:
columns_to_fill = ['Corners', 'Crossing', 'Dribbling', 'Finishing', 'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Tackling']
fill_nulls_with_mean(train, columns_to_fill)
fill_nulls_with_mean(test, columns_to_fill)

In [43]:
train = train.apply(lambda x: x.astype(int) if x.dtype == 'float' else x)
train = train.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

test = test.apply(lambda x: x.astype(int) if x.dtype == 'float' else x)
test = test.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

columns_to_drop = ['Potential', 'Caps / Goals', 'Aerial Reach', 'Command of Area', 'Communication', 'Eccentricity', 'Handling', 'Kicking', 'One on Ones', 'Punching (Tendency)', 'Reflexes', 'Rushing Out (Tendency)', 'Throwing']
train = train.drop(columns = columns_to_drop)
test = test.drop(columns = columns_to_drop)

In [44]:
train.shape

(465, 56)

In [45]:
from sklearn.preprocessing import StandardScaler

exclude_columns = [
    'Positions_Attacking', 'Positions_Defense', 'Positions_Goalkeeper', 
    'Positions_Midfield', 'Positions_Other', 'Foot', 'id', 'value_increased'
]

columns_to_scale = []
columns_to_scale = train.columns.difference(exclude_columns)

train = scale_columns(train, columns_to_scale)

In [46]:
train.shape

(465, 56)

In [47]:
test.shape

(310, 55)

In [48]:
exclude_columns_test = [
    'Positions_Attacking', 'Positions_Defense', 'Positions_Goalkeeper', 
    'Positions_Midfield', 'Positions_Other', 'Foot', 'id'
]

columns_to_st = []
columns_to_st = test.columns.difference(exclude_columns_test)

test = scale_columns(test, columns_to_st)

In [49]:
test.shape

(310, 55)

# Models

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load your data
# Assuming 'train' is your DataFrame with features and target
X = train.drop(columns=['value_increased', 'id'])
y = train['value_increased']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    depth = trial.suggest_int('depth', 4, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1)
    l2_leaf_reg = trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10)
    iterations = trial.suggest_int('iterations', 100, 1000)
    
    # Create and fit the model
    model = CatBoostRegressor(
        depth=depth,
        learning_rate=learning_rate,
        l2_leaf_reg=l2_leaf_reg,
        iterations=iterations,
        silent=True
    )
    
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Apply threshold to create binary predictions
    threshold = 0.5
    y_pred_binary = (y_pred > threshold).astype(int)
    
    # Calculate the F1 score
    f1 = f1_score(y_test, y_pred_binary, average='micro')
    
    return f1

# Create the Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  # Adjust n_trials as needed

# Print the best parameters and score
print('Best trial:')
trial = study.best_trial
print(f'  F1 Score: {trial.value}')
print('  Best hyperparameters: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Use the best parameters to train the final model
best_params = study.best_params
final_model = CatBoostRegressor(**best_params)
final_model.fit(X_train, y_train)

# Save the model
final_model.save_model('catboost_model.cbm')

# Load the model for predictions
final_model = CatBoostRegressor()
final_model.load_model('catboost_model.cbm')

# Prepare test data
X_test_final = test.drop(columns=['id'])

# Make predictions
predicted_values = final_model.predict(X_test_final)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'id': test['id'],  # Include the ID for reference
    'value_increased': predicted_values
})

# Display the results DataFrame
print(results_df)


mean_value = results_df['value_increased'].mean()

# Calculate Standard Deviation
std_dev = results_df['value_increased'].std()

# Calculate 75th Percentile
percentile_75 = results_df['value_increased'].quantile(0.75)

# Calculate Threshold based on Mean + 2 * Standard Deviation
# threshold = mean_value + 2 * std_dev

# Display the results
print(f"Mean: {mean_value}")
print(f"Standard Deviation: {std_dev}")
print(f"75th Percentile: {percentile_75}")
# print(f"Threshold (Mean + 2 * Std Dev): {threshold}")

threshold = 0.15623102074613418
results_df['value_increased'] = (results_df['value_increased'] > threshold).astype(int)

results_df

analyze_column(results_df, 'value_increased')

results_df.to_csv('OC-ttS-optuna-wodrop.csv', index=False)

import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

X = train.drop(columns=['value_increased', 'id'])
y = train['value_increased']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostRegressor(silent=True)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_binary).ravel()
macro_avg_tnr = tn / (tn + fp)  # True Negative Rate
f1 = f1_score(y_test, y_pred_binary, average='micro')

results = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'Macro Avg TNR': macro_avg_tnr,
    'F1 Score': f1
}

# Create a DataFrame for results
results_df = pd.DataFrame([results])
print(results_df)

# Save the model
model.save_model('catboost_model.cbm')

# Load the model for predictions
model = CatBoostRegressor()
model.load_model('catboost_model.cbm')

# Prepare test data
X_test_final = test.drop(columns=['id'])

# Make predictions
predicted_values = model.predict(X_test_final)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'id': test['id'],  # Include the ID for reference
    'value_increased': predicted_values
})


analyze_column(results_df, 'value_increased')

mean_value = results_df['value_increased'].mean()

# Calculate Standard Deviation
std_dev = results_df['value_increased'].std()

# Calculate 75th Percentile
percentile_75 = results_df['value_increased'].quantile(0.75)

# Calculate Threshold based on Mean + 2 * Standard Deviation
threshold = mean_value + 2 * std_dev

# Display the results
print(f"Mean: {mean_value}")
print(f"Standard Deviation: {std_dev}")
print(f"75th Percentile: {percentile_75}")
print(f"Threshold (Mean + 2 * Std Dev): {threshold}")

threshold = 0.015909 # en yüksek score burdan geldi ( test scale edilmedi)
# threshold = 0.16573261999014152
# threshold = 0.13683123347655846
# threshold = 0.25082897240713037
# threshold = 0.43939508694325846
results_df['value_increased'] = (results_df['value_increased'] > threshold).astype(int)

analyze_column(results_df, 'value_increased')

results_df

In [50]:
# results_df.to_csv('OC-ttS-th75perc.csv', index=False)

In [51]:
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Load your data
# Assuming 'train' is your DataFrame with features and target
X = train.drop(columns=['value_increased', 'id'])
y = train['value_increased']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    depth = trial.suggest_int('depth', 4, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1)
    l2_leaf_reg = trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10)
    iterations = trial.suggest_int('iterations', 100, 1000)
    
    # Create and fit the model
    model = CatBoostRegressor(
        depth=depth,
        learning_rate=learning_rate,
        l2_leaf_reg=l2_leaf_reg,
        iterations=iterations,
        silent=True
    )
    
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Apply threshold to create binary predictions
    threshold = 0.5
    y_pred_binary = (y_pred > threshold).astype(int)
    
    # Calculate the F1 score
    f1 = f1_score(y_test, y_pred_binary, average='micro')
    
    return f1

# Create the Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  # Adjust n_trials as needed

# Print the best parameters and score
print('Best trial:')
trial = study.best_trial
print(f'  F1 Score: {trial.value}')
print('  Best hyperparameters: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Use the best parameters to initialize the CatBoost model
best_params = study.best_params
model = CatBoostRegressor(**best_params, silent=True)

# Perform Recursive Feature Elimination (RFE)
rfe = RFE(estimator=model, n_features_to_select=1, step=1)  # Start by ranking all features
rfe.fit(X_train, y_train)

# Get ranking of features
ranking = rfe.ranking_

# Rank features according to importance
feature_ranking = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': ranking
}).sort_values(by='Ranking')

print("Feature Ranking by RFE:")
print(feature_ranking)

# Select the top N features based on RFE ranking
optimal_n_features = 50  # Set this to the desired number of features
top_features = feature_ranking['Feature'].head(optimal_n_features).tolist()

# Re-train the model using only the top features
X_train_optimal = X_train[top_features]
X_test_optimal = X_test[top_features]

# Fit the model with the selected features
model.fit(X_train_optimal, y_train)

# Make predictions and evaluate
y_pred_optimal = model.predict(X_test_optimal)
threshold = 0.5
y_pred_binary_optimal = (y_pred_optimal > threshold).astype(int)

accuracy_optimal = accuracy_score(y_test, y_pred_binary_optimal)
precision_optimal = precision_score(y_test, y_pred_binary_optimal)
recall_optimal = recall_score(y_test, y_pred_binary_optimal)
f1_optimal = f1_score(y_test, y_pred_binary_optimal, average='micro')
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_binary_optimal).ravel()
macro_avg_tnr = tn / (tn + fp)  # True Negative Rate

# Results with selected features
results_optimal = {
    'Accuracy': accuracy_optimal,
    'Precision': precision_optimal,
    'Recall': recall_optimal,
    'Macro Avg TNR': macro_avg_tnr,
    'F1 Score': f1_optimal
}

# Display the results
results_df_optimal = pd.DataFrame([results_optimal])
print("Evaluation Results with Optimal Features:")
print(results_df_optimal)

# Save the final model with selected features
model.save_model('catboost_model_optimal.cbm')

# Load the model for predictions (later use)
model = CatBoostRegressor()
model.load_model('catboost_model_optimal.cbm')

# Prepare test data
X_test_final = test.drop(columns=['id'])
X_test_final_optimal = X_test_final[top_features]  # Use only selected features

# Make predictions on the test set
predicted_values = model.predict(X_test_final_optimal)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'id': test['id'],  # Include the ID for reference
    'value_increased': predicted_values
})

# Display the results DataFrame
print(results_df)

[I 2024-09-26 19:48:12,834] A new study created in memory with name: no-name-37b74de6-47ca-4896-b565-3f02559c9534
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1)
  l2_leaf_reg = trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10)
[I 2024-09-26 19:48:14,238] Trial 0 finished with value: 0.946236559139785 and parameters: {'depth': 7, 'learning_rate': 0.003912498907894198, 'l2_leaf_reg': 0.28757600837191283, 'iterations': 908}. Best is trial 0 with value: 0.946236559139785.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1)
  l2_leaf_reg = trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10)
[I 2024-09-26 19:48:15,730] Trial 1 finished with value: 0.946236559139785 and parameters: {'depth': 8, 'learning_rate': 0.0006479796074036898, 'l2_leaf_reg': 1.4123599046156493e-05, 'iterations': 631}. Best is trial 0 with value: 0.946236559139785.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1)
  l2_leaf_reg = trial.suggest_loguniform('l2_leaf_r

Best trial:
  F1 Score: 0.956989247311828
  Best hyperparameters: 
    depth: 7
    learning_rate: 0.06164358348865057
    l2_leaf_reg: 0.8657964700720062
    iterations: 136
Feature Ranking by RFE:
                   Feature  Ranking
6                   Weight        1
21            Anticipation        2
38         Natural Fitness        3
40                 Stamina        4
22                 Bravery        5
44               Dribbling        6
20              Aggression        7
51          Penalty Taking        8
26           Determination        9
33               Work Rate       10
39                    Pace       11
49             Long Throws       12
45               Finishing       13
0                  Ability       14
5                   Height       15
4                     Foot       16
50                 Marking       17
32                  Vision       18
23               Composure       19
30             Positioning       20
28              Leadership       21
46       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
analyze_column(results_df, 'value_increased')

Unique values in 'value_increased':
[ 0.21220217  0.1592351   0.26730139  0.07360401  0.22599052  0.14362717
  0.17116703  0.15863473  0.09728347  0.09584243  0.04375677  0.00566043
 -0.00631653  0.07422916  0.17949713  0.38974885  0.05396765  0.11300589
  0.03497136  0.4053752   0.16713215  0.17532941 -0.04354869  0.20455516
  0.15413227  0.10327999  0.31309319  0.10243516  0.14782121  0.25466882
  0.31562119  0.43188046 -0.05321634  0.10353705  0.14072152  0.1143427
  0.09683337  0.19756037  0.11427904  0.20041497  0.09438634  0.05305858
  0.12141893  0.13776131  0.39305126  0.04560407  0.06260216  0.36394359
 -0.01436282  0.15805391  0.14878813  0.09327196  0.13175205  0.27786786
  0.2553976   0.13808298  0.03138278 -0.00080821  0.17812315  0.17814455
  0.29679033  0.06529552  0.27840224  0.03101059  0.29355796  0.07861341
  0.15158203  0.14313508  0.11742926  0.11768713  0.06377564  0.18718988
  0.12698947  0.36357651  0.32402364 -0.02895575  0.25559673  0.22897841
  0.24329025  0.

In [53]:
mean_value = results_df['value_increased'].mean()

# Calculate Standard Deviation
std_dev = results_df['value_increased'].std()

# Calculate 75th Percentile
percentile_75 = results_df['value_increased'].quantile(0.75)

# Calculate Threshold based on Mean + 2 * Standard Deviation
threshold = mean_value + 2 * std_dev

# Display the results
print(f"Mean: {mean_value}")
print(f"Standard Deviation: {std_dev}")
print(f"75th Percentile: {percentile_75}")
print(f"Threshold (Mean + 2 * Std Dev): {threshold}")

Mean: 0.15882509730030858
Standard Deviation: 0.10656117695942516
75th Percentile: 0.22642658174366231
Threshold (Mean + 2 * Std Dev): 0.3719474512191589


In [54]:
#threshold = 0.15778609567572263
results_df['value_increased'] = (results_df['value_increased'] > threshold).astype(int)

In [55]:
analyze_column(results_df, 'value_increased')

Unique values in 'value_increased':
[1 0]
 
Value counts in 'value_increased' (including NaNs):
value_increased
0    169
1    141
Name: count, dtype: int64
 
Number of missing values (NaNs) in 'value_increased':
0
 
count    310.000000
mean       0.454839
std        0.498761
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: value_increased, dtype: float64


In [56]:
#results_df.to_csv('OC-ttS-th75perc.csv', index=False)