In [9]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pickle

import warnings
warnings.filterwarnings('ignore')

In [10]:
def minmax(df: pd.DataFrame):
    minmax = MinMaxScaler().fit(df)
    X_norm_m = minmax.transform(df)
    X_norm_m = pd.DataFrame(X_norm_m, columns=df.columns)
    return X_norm_m


def onehot(df: pd.DataFrame):
    encoder = OneHotEncoder(drop='first').fit(pd.DataFrame(df))
    encoded = encoder.transform(pd.DataFrame(df)).toarray()
    cols = encoder.get_feature_names_out(input_features=df.columns)
    cols = list(map(normit, cols))  
    onehot_encoded_cats = pd.DataFrame(encoded, columns=cols)
    return onehot_encoded_cats


def normit(x: str):
    return x.replace(' ', '_').replace('/', '_').lower()


def get_height(x: str):
    feet, inch = x.split("'")
    inch = inch.rstrip('"')
    return int(feet)*30.48 + int(inch)*2.54


def get_value(x):
    x = str(x)
    if x[-1] == 'M':
        x = x[1:-1]
        return 1000000 * float(x)
    elif x[-1] == 'K':
        x = x[1:-1]
        return 1000 * float(x)
    else:
        x = x[1:]
        return float(x)


def get_stars(x):
    return int(x[0])

# commented lines are from initial model
def clean_fifa_df(df: pd.DataFrame):
    # dropped = df[['Age', 'BP', 'Height', 'Weight', 'foot', 'Growth', 'Value', 'Wage', 'Release Clause', 'Attacking', 
    # 'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys', 'Skill', 'Dribbling', 'Curve', 'FK Accuracy', 
    # 'Long Passing', 'Ball Control', 'Movement', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Power', 
    # 'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Mentality', 'Aggression', 'Interceptions', 'Positioning', 
    # 'Vision', 'Penalties', 'Composure', 'Defending', 'Marking', 'Standing Tackle', 'Sliding Tackle', 'Goalkeeping', 'GK Diving', 
    # 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes', 'Total Stats', 'Base Stats', 'W/F', 'SM', 'A/W', 'D/W', 'IR', 
    # 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'OVA']]
    # dropped.columns = list(map(normit, dropped.columns))
    
    df.columns = list(map(normit, df.columns))
    # dropped = df[[
    #     'age', 'growth', 'value', 'wage', 'reactions', 'power', 'shot_power', 'vision', 
    #     'composure', 'total_stats', 'base_stats', 'pas', 'dri', 'phy', 
    #     'ova']]
    dropped = df[['age', 'bp', 'height', 'weight', 'foot', 'growth', 'value', 'wage',
       'release_clause', 'attacking', 'crossing', 'finishing',
       'heading_accuracy', 'short_passing', 'volleys', 'skill', 'dribbling',
       'curve', 'fk_accuracy', 'long_passing', 'ball_control', 'movement',
       'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
       'power', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
       'mentality', 'aggression', 'interceptions', 'positioning', 'vision',
       'penalties', 'composure', 'defending', 'marking', 'standing_tackle',
       'sliding_tackle', 'goalkeeping', 'gk_diving', 'gk_handling',
       'gk_kicking', 'gk_positioning', 'gk_reflexes', 'total_stats',
       'base_stats', 'w_f', 'sm', 'a_w', 'd_w', 'ir', 'pac', 'sho', 'pas',
       'dri', 'def', 'phy', 'ova']]

    dropped['composure'] = dropped['composure'].fillna(np.mean(dropped['composure'])).copy()
    dropped['a_w'] = dropped['a_w'].fillna('Medium').copy()
    dropped['d_w'] = dropped['d_w'].fillna('Medium').copy()
    dropped = dropped.dropna(subset=['vision'])
    dropped['height'] = dropped['height'].apply(get_height)
    dropped['weight_lbs'] = dropped['weight'].str.rstrip("lbs").astype(int)
    dropped = dropped.drop(['weight'], axis=1)
    dropped['value'] = dropped['value'].apply(get_value)
    dropped['wage'] = dropped['wage'].apply(get_value)
    dropped['release_clause'] = dropped['release_clause'].apply(get_value) 
    dropped['w_f'] = dropped['w_f'].apply(get_stars)
    dropped['sm'] = dropped['sm'].apply(get_stars)
    dropped['ir'] = dropped['ir'].apply(get_stars)

    return dropped


In [11]:
def preprocess(df: pd.DataFrame):
    cleaned = clean_fifa_df(df)
    y = cleaned['ova']
    cleaned = cleaned.drop(['ova'], axis=1)
    nums = cleaned.select_dtypes(include=np.number)
    cats = cleaned.select_dtypes(include=object)
    nums = minmax(nums)
    cats = onehot(cats)
    X = pd.concat((nums, cats), axis=1)
    return X, y

In [20]:
validate = pd.read_csv('fifa21_validate.csv')
train = pd.read_csv('fifa21_train.csv')
validate.columns == train.columns

pd.DataFrame(validate.isna().sum()).to_csv('v.csv')

In [13]:
X, y = preprocess(train)
X2, y2 = preprocess(validate)

X.columns == X2.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)

preds_test = lm.predict(X_test)
print(r2_score(y_test, preds_test))

preds = lm.predict(X2)
print(r2_score(y2, preds))

0.9178470459795224
-1.1629065325924342e+22


In [33]:
pd.set_option('display.max_columns', None)
display(X.describe())
X2.describe()

Unnamed: 0,age,height,growth,value,wage,release_clause,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total_stats,base_stats,w_f,sm,ir,pac,sho,pas,dri,def,phy,weight_lbs,bp_cb,bp_cdm,bp_cf,bp_cm,bp_gk,bp_lb,bp_lm,bp_lw,bp_lwb,bp_rb,bp_rm,bp_rw,bp_rwb,bp_st,foot_right,a_w_low,a_w_medium,d_w_low,d_w_medium
count,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0
mean,0.342764,0.517406,0.212719,0.028364,0.018699,0.027284,0.548227,0.517899,0.489386,0.550679,0.595469,0.477527,0.523588,0.580616,0.506865,0.441723,0.538128,0.611014,0.598409,0.62409,0.640431,0.616855,0.547744,0.596367,0.544247,0.574843,0.571952,0.602506,0.58655,0.501497,0.576066,0.550951,0.491352,0.542424,0.534579,0.490992,0.570806,0.48922,0.494001,0.500519,0.474139,0.153913,0.154783,0.15689,0.15519,0.14854,0.156179,0.572021,0.507836,0.495626,0.364473,0.035913,0.601328,0.500351,0.498296,0.540494,0.493757,0.572983,0.385509,0.185506,0.071612,0.004117,0.056175,0.092539,0.054202,0.047942,0.012607,0.014751,0.054031,0.083105,0.019554,0.014151,0.158233,0.754288,0.051458,0.644254,0.108491,0.717667
std,0.183235,0.136114,0.223634,0.058607,0.037936,0.060889,0.183025,0.203022,0.210708,0.192438,0.168188,0.206197,0.180088,0.198344,0.202005,0.196321,0.177198,0.174448,0.156505,0.17555,0.170915,0.176638,0.126859,0.176641,0.159486,0.162523,0.171673,0.180683,0.16573,0.211648,0.175007,0.196121,0.244604,0.204887,0.161121,0.18229,0.13868,0.24935,0.226732,0.255578,0.252584,0.190433,0.191596,0.187933,0.183853,0.180414,0.195029,0.165459,0.15357,0.168945,0.194739,0.10998,0.160745,0.182067,0.149435,0.146387,0.218673,0.147746,0.12557,0.388724,0.257856,0.064032,0.230269,0.289797,0.226426,0.213652,0.111577,0.120561,0.226088,0.276052,0.138468,0.118118,0.364975,0.430527,0.220939,0.478759,0.311012,0.450154
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.4,0.0,0.004167,0.001786,0.003026,0.481013,0.397727,0.326087,0.465909,0.542169,0.325581,0.442623,0.527473,0.366667,0.303371,0.428571,0.571429,0.518519,0.535714,0.564706,0.52439,0.464789,0.5,0.44373,0.457831,0.471429,0.517647,0.493506,0.344444,0.501393,0.413793,0.247059,0.44086,0.435294,0.372093,0.488095,0.255061,0.292135,0.261905,0.22619,0.084309,0.068182,0.069767,0.069767,0.065934,0.068182,0.483789,0.400763,0.5,0.25,0.0,0.514286,0.381579,0.397059,0.462687,0.298701,0.484848,0.293651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.333333,0.5,0.153846,0.008889,0.005357,0.007808,0.579747,0.568182,0.532609,0.579545,0.626506,0.5,0.552693,0.637363,0.533333,0.426966,0.571429,0.648352,0.621083,0.654762,0.670588,0.646341,0.549296,0.625,0.562701,0.590361,0.585714,0.635294,0.597403,0.555556,0.598886,0.586207,0.552941,0.591398,0.552941,0.5,0.583333,0.554656,0.550562,0.583333,0.547619,0.096019,0.102273,0.104651,0.104651,0.098901,0.102273,0.590591,0.51145,0.5,0.25,0.0,0.614286,0.539474,0.5,0.552239,0.532468,0.590909,0.380952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,0.481481,0.6,0.384615,0.027778,0.019643,0.023423,0.668354,0.670455,0.663043,0.681818,0.698795,0.639535,0.64637,0.714286,0.666667,0.595506,0.666667,0.714286,0.706553,0.75,0.752941,0.743902,0.633803,0.725,0.662379,0.698795,0.685714,0.717647,0.701299,0.666667,0.693593,0.701149,0.705882,0.688172,0.647059,0.627907,0.666667,0.704453,0.685393,0.714286,0.690476,0.11007,0.136364,0.139535,0.139535,0.131868,0.136364,0.687222,0.614504,0.5,0.5,0.0,0.714286,0.631579,0.602941,0.641791,0.675325,0.681818,0.468254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,age,height,growth,value,wage,release_clause,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total_stats,base_stats,w_f,sm,ir,pac,sho,pas,dri,def,phy,weight_lbs,bp_cb,bp_cdm,bp_cf,bp_cm,bp_gk,bp_lb,bp_lm,bp_lw,bp_lwb,bp_rb,bp_rm,bp_rw,bp_rwb,bp_st,foot_right,a_w_low,a_w_medium,d_w_low,d_w_medium
count,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0
mean,0.302056,0.489746,0.214208,0.033174,0.042005,0.029403,0.586497,0.548664,0.502373,0.532259,0.609559,0.500216,0.562537,0.60221,0.503886,0.436711,0.551885,0.621354,0.593739,0.642073,0.645996,0.612567,0.524352,0.593674,0.593854,0.57926,0.609098,0.629663,0.595028,0.527649,0.561652,0.558546,0.516324,0.550008,0.563027,0.507468,0.596973,0.474106,0.477421,0.492467,0.466237,0.153744,0.156808,0.153457,0.160909,0.158049,0.161192,0.579681,0.453835,0.497871,0.358843,0.052104,0.596842,0.487816,0.471255,0.562576,0.48901,0.563144,0.434819,0.190882,0.076653,0.00501,0.062625,0.1002,0.051603,0.049098,0.011523,0.013527,0.045591,0.082164,0.020541,0.01503,0.14479,0.747495,0.041082,0.660822,0.097194,0.735972
std,0.156679,0.177948,0.22846,0.069928,0.080397,0.067601,0.205342,0.217742,0.224012,0.203847,0.172982,0.221247,0.20641,0.211254,0.214057,0.207915,0.18672,0.201293,0.160424,0.175942,0.176443,0.18804,0.147244,0.186191,0.17539,0.17724,0.165058,0.186418,0.18372,0.226417,0.177385,0.203955,0.247966,0.223064,0.169869,0.18688,0.144252,0.249169,0.226793,0.262582,0.25684,0.20165,0.209489,0.187501,0.179209,0.195339,0.204565,0.1729,0.160958,0.16911,0.194268,0.155692,0.168869,0.191342,0.157392,0.160829,0.216227,0.168538,0.145357,0.393095,0.266107,0.070622,0.242348,0.300342,0.22128,0.216127,0.106752,0.115545,0.208649,0.274684,0.141878,0.121703,0.351977,0.434558,0.19853,0.473549,0.296297,0.440925
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.16129,0.383333,0.0,0.004808,0.004,0.003413,0.509804,0.418675,0.321839,0.435294,0.555556,0.345679,0.470395,0.54023,0.360465,0.290698,0.4375,0.567901,0.511834,0.554217,0.567901,0.513158,0.442623,0.48,0.483871,0.445946,0.507042,0.53012,0.485294,0.364706,0.484848,0.416667,0.261905,0.436782,0.444444,0.388235,0.5125,0.246964,0.280899,0.243902,0.207317,0.078199,0.059524,0.066667,0.076087,0.068182,0.068182,0.491101,0.342742,0.5,0.25,0.0,0.5,0.361111,0.359375,0.47541,0.298701,0.45614,0.324074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.290323,0.466667,0.16,0.010256,0.014,0.008156,0.62465,0.60241,0.551724,0.564706,0.641975,0.518519,0.594737,0.666667,0.523256,0.418605,0.5875,0.666667,0.621302,0.674699,0.679012,0.644737,0.52459,0.626667,0.616487,0.608108,0.619718,0.662651,0.617647,0.588235,0.584022,0.595238,0.595238,0.609195,0.580247,0.529412,0.6,0.544534,0.533708,0.585366,0.54878,0.090047,0.095238,0.1,0.108696,0.102273,0.102273,0.599209,0.453629,0.5,0.25,0.0,0.609375,0.527778,0.484375,0.57377,0.532468,0.578947,0.425926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,0.419355,0.6,0.36,0.030769,0.04,0.025094,0.719888,0.710843,0.689655,0.670588,0.728395,0.666667,0.702632,0.738506,0.662791,0.604651,0.6875,0.740741,0.704882,0.759036,0.765432,0.75,0.622951,0.72,0.724014,0.716216,0.71831,0.759036,0.720588,0.705882,0.680441,0.714286,0.72619,0.712644,0.691358,0.647059,0.7,0.684211,0.662921,0.707317,0.682927,0.104265,0.130952,0.133333,0.141304,0.136364,0.136364,0.698088,0.564516,0.5,0.5,0.0,0.703125,0.625,0.578125,0.672131,0.662338,0.684211,0.527778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
### pickle test

# X, y = preprocess(validate)

# model = pickle.load(open('model.sav', 'rb'))

# preds = model.predict(X)
# print(r2_score(y, preds))