In [29]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pickle

import warnings
warnings.filterwarnings('ignore')

In [30]:
def onehot(df: pd.DataFrame):
    """One-hot encode every column of ``df``, dropping each feature's first category.

    A new ``OneHotEncoder(drop='first')`` is fitted on the frame that is passed
    in, so the produced columns depend on the categories present in that frame.

    Returns a dense DataFrame (default integer index) whose column names are
    the encoder's feature names run through ``normit``.
    """
    frame = pd.DataFrame(df)
    fitted = OneHotEncoder(drop='first').fit(frame)
    dense = fitted.transform(frame).toarray()
    raw_names = fitted.get_feature_names_out(input_features=df.columns)
    names = [normit(name) for name in raw_names]
    return pd.DataFrame(dense, columns=names)


def normit(x: str):
    """Normalise a label: spaces and slashes become underscores, text is lower-cased."""
    separators_to_underscore = str.maketrans(' /', '__')
    return x.translate(separators_to_underscore).lower()


def get_height(x: str):
    """Convert a feet'inches" height string (e.g. ``6'2"``) to centimetres.

    The string must contain exactly one apostrophe separating feet from
    inches; a trailing double quote after the inches is optional.
    """
    feet_text, inches_text = x.split("'")
    whole_feet = int(feet_text)
    whole_inches = int(inches_text.rstrip('"'))
    # 30.48 cm per foot, 2.54 cm per inch
    return whole_feet * 30.48 + whole_inches * 2.54


def get_value(x):
    """Parse a money string such as ``'€1.5M'``, ``'€500K'`` or ``'€0'`` into a float.

    Parameters
    ----------
    x : any
        Value to parse; it is converted with ``str`` first. A trailing ``'M'``
        multiplies by 1,000,000 and a trailing ``'K'`` by 1,000.

    Returns
    -------
    float
        The amount in base currency units.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.

    Notes
    -----
    The leading currency symbol is removed only when the first character is
    not part of a number. Previously the first character was always dropped,
    which silently corrupted inputs that had no symbol (``5000`` -> ``0.0``).
    """
    text = str(x).strip()

    multiplier = None
    if text.endswith('M'):
        multiplier = 1000000
        text = text[:-1]
    elif text.endswith('K'):
        multiplier = 1000
        text = text[:-1]

    # Drop a single leading currency symbol, but never a digit, sign or dot.
    if text and not (text[0].isdigit() or text[0] in '+-.'):
        text = text[1:]

    amount = float(text)
    return amount if multiplier is None else multiplier * amount


def get_stars(x):
    """Return the first element of ``x`` converted to ``int`` (e.g. ``'4 ★'`` -> 4)."""
    leading = x[0]
    return int(leading)

# commented lines are from initial model
def clean_fifa_df(df: pd.DataFrame):
    """Select the modelling columns of a raw FIFA frame and convert them to usable types.

    Processing steps, in order:

    1. normalise the column names with ``normit``;
    2. keep only the feature columns plus the target ``ova``;
    3. fill missing ``composure`` with the column mean and missing
       ``a_w`` / ``d_w`` with ``'Medium'``;
    4. drop rows whose ``vision`` is missing;
    5. convert ``height`` to centimetres, ``weight`` to an integer
       ``weight_lbs`` column, the money columns to floats and the star
       ratings (``w_f``, ``sm``, ``ir``) to ints.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame. NOTE: its column labels are renamed in place (step 1);
        this side effect is kept so existing callers see the same behaviour.

    Returns
    -------
    pd.DataFrame
        A new, independent frame. Rows removed in step 4 leave gaps in the
        index (it is not reset).
    """
    df.columns = list(map(normit, df.columns))

    # Feature set of the initial model, kept for reference:
    #     'age', 'growth', 'value', 'wage', 'reactions', 'power', 'shot_power', 'vision',
    #     'composure', 'total_stats', 'base_stats', 'pas', 'dri', 'phy',
    #     'ova'
    selected_columns = [
        'age', 'bp', 'height', 'weight', 'foot', 'growth', 'value', 'wage',
        'release_clause', 'attacking', 'crossing', 'finishing',
        'heading_accuracy', 'short_passing', 'volleys', 'skill', 'dribbling',
        'curve', 'fk_accuracy', 'long_passing', 'ball_control', 'movement',
        'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
        'power', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
        'mentality', 'aggression', 'interceptions', 'positioning', 'vision',
        'penalties', 'composure', 'defending', 'marking', 'standing_tackle',
        'sliding_tackle', 'goalkeeping', 'gk_diving', 'gk_handling',
        'gk_kicking', 'gk_positioning', 'gk_reflexes', 'total_stats',
        'base_stats', 'w_f', 'sm', 'a_w', 'd_w', 'ir', 'pac', 'sho', 'pas',
        'dri', 'def', 'phy', 'ova',
    ]
    # Explicit copy: the assignments below must write to an independent frame,
    # not to a slice of ``df`` (the SettingWithCopy pattern).
    dropped = df[selected_columns].copy()

    # Impute missing values.
    dropped['composure'] = dropped['composure'].fillna(dropped['composure'].mean())
    dropped['a_w'] = dropped['a_w'].fillna('Medium')
    dropped['d_w'] = dropped['d_w'].fillna('Medium')
    dropped = dropped.dropna(subset=['vision'])

    # Physical attributes.
    dropped['height'] = dropped['height'].apply(get_height)
    # rstrip removes any trailing 'l', 'b' or 's' characters (a character set,
    # not a suffix), leaving the digits.
    dropped['weight_lbs'] = dropped['weight'].str.rstrip("lbs").astype(int)
    dropped = dropped.drop(['weight'], axis=1)

    # Money strings -> floats.
    for money_column in ('value', 'wage', 'release_clause'):
        dropped[money_column] = dropped[money_column].apply(get_value)

    # Star ratings -> ints (leading character of the rating).
    for stars_column in ('w_f', 'sm', 'ir'):
        dropped[stars_column] = dropped[stars_column].apply(get_stars)

    return dropped


In [31]:
def preprocess(df: pd.DataFrame):
    """Clean a raw frame and split it into features ``X`` and target ``y``.

    The frame is cleaned with ``clean_fifa_df``; ``ova`` becomes the target.
    The features are returned with all numeric columns first, followed by the
    object (categorical) columns. No scaling or encoding is applied here.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``X`` and ``y``.
    """
    frame = clean_fifa_df(df)
    target = frame['ova']
    features = frame.drop(columns=['ova'])

    numeric_part = features.select_dtypes(include=np.number)
    categorical_part = features.select_dtypes(include=object)

    return pd.concat([numeric_part, categorical_part], axis=1), target

In [32]:
# Fit the MinMaxScaler on X_train only and apply that same fitted transformer to the test data; do NOT fit it on train+test combined.

In [33]:
# Load both data sets, then compare their column labels position by position.
validate, train = (
    pd.read_csv(csv_path)
    for csv_path in ('fifa21_validate.csv', 'fifa21_train.csv')
)
validate.columns == train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [34]:
def scale_and_onehot(df, scaler: MinMaxScaler, encoder: OneHotEncoder = None):
    """Scale the numeric columns and one-hot encode the object columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Feature frame with numeric and object columns.
    scaler : MinMaxScaler
        Already fitted scaler; only ``transform`` is called.
    encoder : OneHotEncoder, optional
        Already fitted encoder. When given, it is only used to ``transform``
        the object columns, so every frame encoded with the same encoder gets
        the same columns in the same order. When omitted, the previous
        behaviour is kept: ``onehot`` fits a new encoder on ``df`` itself.

    Returns
    -------
    pd.DataFrame
        Scaled numeric columns followed by the encoded columns, with a default
        integer index (the index of ``df`` is not preserved).
    """
    numsi = df.select_dtypes(include=np.number)
    cats = df.select_dtypes(include=object)

    if encoder is None:
        encoded = onehot(cats)
    else:
        matrix = encoder.transform(cats)
        if hasattr(matrix, 'toarray'):  # sparse output -> dense array
            matrix = matrix.toarray()
        names = [normit(name) for name in encoder.get_feature_names_out()]
        encoded = pd.DataFrame(matrix, columns=names)

    nums = pd.DataFrame(scaler.transform(numsi), columns=numsi.columns)
    return pd.concat((nums, encoded), axis=1)


X, y = preprocess(train)
X2, y2 = preprocess(validate)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

nums_train = X_train.select_dtypes(include=np.number)
cats_train = X_train.select_dtypes(include=object)

# Fit both transformers on the training split only, then reuse them for the
# test and validation data. Fitting a separate encoder per frame (as before)
# only yields matching columns when every frame happens to contain exactly
# the same categories.
minmax = MinMaxScaler().fit(nums_train)
onehot_encoder = OneHotEncoder(drop='first').fit(cats_train)

X_train = scale_and_onehot(X_train, minmax, onehot_encoder)
X_test = scale_and_onehot(X_test, minmax, onehot_encoder)
X2 = scale_and_onehot(X2, minmax, onehot_encoder)

In [36]:
# Fit an ordinary least-squares model on the training split.
lm = linear_model.LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split and on the separate validation file.
preds_test = lm.predict(X_test)
preds = lm.predict(X2)

# R^2 on the test split first, then on the validation set.
print(r2_score(y_true=y_test, y_pred=preds_test))
print(r2_score(y_true=y2, y_pred=preds))

0.918610521630802
0.9163031339732207


In [37]:
# Trying to figure out where the validation goes wrong.
# Note: X is the output of preprocess() (unscaled, not encoded), whereas
# X_train, X_test and X2 have already gone through scale_and_onehot().

pd.set_option('display.max_columns', None)

for target in (y, y2):
    print(target.shape)

display(X.describe())
for frame in (X, X_train, X_test):
    print(frame.shape)

display(X2.describe())
print(X2.shape)

(11660,)
(1996,)


Unnamed: 0,age,height,growth,value,wage,release_clause,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total_stats,base_stats,w_f,sm,ir,pac,sho,pas,dri,def,phy,weight_lbs
count,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0,11660.0
mean,25.254631,181.224208,5.530703,2552716.0,10471.526587,4542736.0,258.549657,51.575129,48.023499,53.459777,60.423928,45.067324,266.572213,57.836021,49.617839,44.313379,54.202744,60.60223,323.041424,65.423585,65.436621,64.582075,62.889794,64.709348,302.26072,59.712007,65.036621,63.213036,65.164322,49.134734,261.807633,56.932762,46.764923,52.445455,55.439194,50.2253,59.947732,140.837393,46.966123,48.043568,45.827702,77.720669,15.620926,15.492539,15.346312,15.517153,15.743739,1630.789708,361.053087,2.982504,2.45789,1.143654,68.092967,55.026672,58.884134,64.213122,50.019297,64.816895,165.574185
std,4.947334,6.914567,5.814483,5274622.0,21244.249045,10138050.0,72.295006,17.865896,19.385162,16.934545,13.959573,17.732902,76.897596,18.049328,18.180408,17.472557,14.884652,15.874732,54.933247,14.746172,14.527761,14.484351,9.006966,14.131318,49.600005,13.489431,12.017118,15.358095,12.761199,19.048341,62.827366,17.062503,20.791353,19.054488,13.695285,15.676946,11.649121,61.589358,20.17911,21.468511,21.217062,81.315093,16.860437,16.162271,15.811398,16.417675,17.162589,260.266231,40.235246,0.675781,0.778954,0.439922,11.252154,13.837064,10.161584,9.807939,16.837851,9.751233,15.821873
min,16.0,154.94,0.0,0.0,0.0,0.0,42.0,6.0,3.0,5.0,11.0,4.0,43.0,5.0,4.0,5.0,9.0,5.0,113.0,13.0,11.0,14.0,24.0,17.0,133.0,12.0,25.0,12.0,20.0,4.0,55.0,9.0,5.0,2.0,10.0,8.0,12.0,20.0,3.0,6.0,6.0,12.0,2.0,2.0,2.0,2.0,2.0,731.0,228.0,1.0,1.0,1.0,26.0,17.0,25.0,28.0,12.0,27.0,117.0
25%,21.0,175.26,0.0,375000.0,1000.0,503750.0,232.0,41.0,33.0,46.0,56.0,32.0,232.0,53.0,37.0,32.0,45.0,57.0,295.0,58.0,59.0,57.0,57.0,57.0,271.0,50.0,58.0,56.0,58.0,35.0,235.0,45.0,26.0,43.0,47.0,40.0,53.0,83.0,29.0,28.0,25.0,48.0,8.0,8.0,8.0,8.0,8.0,1492.0,333.0,3.0,2.0,1.0,62.0,46.0,52.0,59.0,35.0,59.0,154.0
50%,25.0,180.34,4.0,800000.0,3000.0,1300000.0,271.0,56.0,52.0,56.0,63.0,47.0,279.0,63.0,52.0,43.0,57.0,64.0,331.0,68.0,68.0,67.0,63.0,67.0,308.0,61.0,66.0,66.0,66.0,54.0,270.0,60.0,52.0,57.0,57.0,51.0,61.0,157.0,52.0,55.0,52.0,53.0,11.0,11.0,11.0,11.0,11.0,1660.0,362.0,3.0,2.0,1.0,69.0,58.0,59.0,65.0,53.0,66.0,165.0
75%,29.0,185.42,10.0,2500000.0,11000.0,3900000.0,306.0,65.0,64.0,65.0,69.0,59.0,319.0,70.0,64.0,58.0,65.0,70.0,361.0,76.0,75.0,75.0,69.0,75.0,339.0,70.0,73.0,73.0,74.0,64.0,304.0,70.0,65.0,66.0,65.0,62.0,68.0,194.0,64.0,66.0,64.0,59.0,14.0,14.0,14.0,14.0,14.0,1812.0,389.0,3.0,3.0,1.0,76.0,65.0,66.0,71.0,64.0,72.0,176.0
max,43.0,205.74,26.0,90000000.0,560000.0,166500000.0,437.0,94.0,95.0,93.0,94.0,90.0,470.0,96.0,94.0,94.0,93.0,96.0,464.0,97.0,96.0,96.0,95.0,97.0,444.0,95.0,95.0,97.0,97.0,94.0,414.0,96.0,90.0,95.0,95.0,94.0,96.0,267.0,92.0,90.0,90.0,439.0,90.0,88.0,88.0,93.0,90.0,2304.0,490.0,5.0,5.0,5.0,96.0,93.0,93.0,95.0,89.0,93.0,243.0


(11660, 63)
(9328, 78)
(2332, 78)


Unnamed: 0,age,height,growth,value,wage,release_clause,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total_stats,base_stats,w_f,sm,ir,pac,sho,pas,dri,def,phy,weight_lbs,bp_cb,bp_cdm,bp_cf,bp_cm,bp_gk,bp_lb,bp_lm,bp_lw,bp_lwb,bp_rb,bp_rm,bp_rw,bp_rwb,bp_st,foot_right,a_w_low,a_w_medium,d_w_low,d_w_medium
count,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0,1996.0
mean,0.346805,0.51731,0.214208,0.028751,0.018752,0.028149,0.545264,0.51749,0.485939,0.548204,0.59487,0.471134,0.521696,0.575739,0.503713,0.444462,0.53751,0.608018,0.59739,0.630025,0.639125,0.616526,0.549091,0.594069,0.545612,0.57669,0.574943,0.603083,0.590415,0.498336,0.576267,0.55078,0.498485,0.536029,0.536532,0.489939,0.568546,0.496359,0.511385,0.504551,0.478946,0.158969,0.161044,0.160594,0.160507,0.152839,0.161192,0.57303,0.50409,0.497871,0.358843,0.039078,0.611564,0.501615,0.502358,0.542047,0.508603,0.577261,0.378713,0.190882,0.076653,0.00501,0.062625,0.1002,0.051603,0.049098,0.011523,0.013527,0.045591,0.082164,0.020541,0.01503,0.14479,0.747495,0.041082,0.660822,0.097194,0.735972
std,0.179891,0.133461,0.22846,0.060604,0.035891,0.064718,0.185588,0.20537,0.211838,0.196898,0.168813,0.208384,0.18369,0.201968,0.204543,0.200907,0.177829,0.179173,0.154483,0.175942,0.168139,0.174281,0.126505,0.174555,0.157343,0.158021,0.167416,0.182032,0.162246,0.213838,0.179362,0.196922,0.245049,0.208672,0.161875,0.184707,0.137383,0.250182,0.232006,0.25633,0.250724,0.199289,0.199966,0.196222,0.191712,0.188899,0.204565,0.166744,0.155322,0.16911,0.194268,0.116769,0.156632,0.181272,0.148134,0.146426,0.219072,0.145555,0.126601,0.393095,0.266107,0.070622,0.242348,0.300342,0.22128,0.216127,0.106752,0.115545,0.208649,0.274684,0.141878,0.121703,0.351977,0.434558,0.19853,0.473549,0.296297,0.440925
min,0.0,0.15,0.0,0.0,0.0,0.0,0.01519,0.0,0.01087,0.034091,0.0,0.0,0.021077,0.0,0.022222,0.022472,0.011905,0.054945,0.025641,-0.012048,0.023529,0.04878,0.098592,0.0375,0.012862,0.060241,-0.042857,-0.011765,0.064935,0.0,0.008357,0.011494,-0.011765,0.021505,0.0,-0.011628,0.0,0.020325,0.022989,0.02381,0.02381,0.007026,0.011364,0.0,-0.011628,0.0,0.0,0.013986,0.066148,0.0,0.0,0.0,0.057971,0.039474,0.058824,0.029851,0.013158,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.4375,0.0,0.004167,0.001786,0.003267,0.475949,0.394886,0.315217,0.454545,0.542169,0.325581,0.439696,0.516484,0.366667,0.303371,0.428571,0.56044,0.518519,0.542169,0.564706,0.52439,0.478873,0.4875,0.446945,0.457831,0.471429,0.505882,0.493506,0.344444,0.498607,0.413793,0.247059,0.430108,0.423529,0.372093,0.488095,0.268293,0.310345,0.261905,0.22619,0.084309,0.068182,0.069767,0.069767,0.065934,0.068182,0.487603,0.396887,0.5,0.25,0.0,0.521739,0.381579,0.397059,0.462687,0.315789,0.484848,0.282258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.333333,0.5,0.16,0.008889,0.00625,0.007808,0.579747,0.568182,0.532609,0.579545,0.626506,0.488372,0.550351,0.637363,0.522222,0.426966,0.571429,0.648352,0.623932,0.662651,0.670588,0.646341,0.549296,0.625,0.565916,0.60241,0.585714,0.635294,0.61039,0.555556,0.598886,0.586207,0.576471,0.591398,0.552941,0.511628,0.571429,0.567073,0.568966,0.595238,0.559524,0.096019,0.102273,0.104651,0.104651,0.098901,0.102273,0.591863,0.503891,0.5,0.25,0.0,0.623188,0.539474,0.514706,0.552239,0.552632,0.590909,0.370968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
75%,0.481481,0.6,0.36,0.026667,0.017857,0.024024,0.665823,0.670455,0.663043,0.681818,0.710843,0.627907,0.64637,0.706044,0.655556,0.606742,0.666667,0.714286,0.704416,0.746988,0.752941,0.743902,0.633803,0.7125,0.662379,0.698795,0.685714,0.729412,0.701299,0.666667,0.696379,0.701149,0.705882,0.688172,0.658824,0.627907,0.666667,0.707317,0.701149,0.714286,0.690476,0.11007,0.136364,0.139535,0.139535,0.131868,0.136364,0.687222,0.610895,0.5,0.5,0.0,0.710145,0.631579,0.602941,0.641791,0.684211,0.681818,0.459677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
max,1.148148,0.9,1.0,0.866667,0.446429,0.957357,0.918987,0.943182,0.956522,1.0,0.975904,0.94186,0.911007,0.956044,0.977778,0.988764,0.964286,0.945055,0.988604,0.987952,0.976471,0.97561,0.957746,0.975,0.909968,0.951807,0.971429,0.964706,0.948052,0.944444,1.019499,0.977011,0.976471,0.956989,0.952941,0.976744,0.952381,1.02439,1.045977,1.0,1.0,0.995316,0.965909,1.046512,1.05814,0.967033,1.0,0.978385,1.031128,1.0,1.0,0.75,0.985507,0.986842,1.0,0.940299,1.026316,0.954545,0.870968,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


(1996, 78)


In [19]:
### pickle test

# X, y = preprocess(validate)

# model = pickle.load(open('model.sav', 'rb'))

# preds = model.predict(X)
# print(r2_score(y, preds))