In [51]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pickle

import warnings
warnings.filterwarnings('ignore')

In [78]:
def minmax(df: pd.DataFrame):
    minmax = MinMaxScaler().fit(df)
    X_norm_m = minmax.transform(df)
    X_norm_m = pd.DataFrame(X_norm_m, columns=df.columns)
    return X_norm_m


def onehot(df: pd.DataFrame):
    encoder = OneHotEncoder(drop='first').fit(pd.DataFrame(df))
    encoded = encoder.transform(pd.DataFrame(df)).toarray()
    cols = encoder.get_feature_names_out(input_features=df.columns)
    cols = list(map(normit, cols))  
    onehot_encoded_cats = pd.DataFrame(encoded, columns=cols)
    return onehot_encoded_cats


def normit(x: str):
    return x.replace(' ', '_').replace('/', '_').lower()


def get_height(x: str):
    feet, inch = x.split("'")
    inch = inch.rstrip('"')
    return int(feet)*30.48 + int(inch)*2.54


def get_value(x):
    x = str(x)
    if x[-1] == 'M':
        x = x[1:-1]
        return 1000000 * float(x)
    elif x[-1] == 'K':
        x = x[1:-1]
        return 1000 * float(x)
    else:
        x = x[1:]
        return float(x)


def get_stars(x):
    return int(x[0])

# commented lines are from initial model
def clean_fifa_df(df: pd.DataFrame):
    # dropped = df[['Age', 'BP', 'Height', 'Weight', 'foot', 'Growth', 'Value', 'Wage', 'Release Clause', 'Attacking', 'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys', 'Skill', 'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control', 'Movement', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Power', 'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Mentality', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 'Defending', 'Marking', 'Standing Tackle', 'Sliding Tackle', 'Goalkeeping', 'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes', 'Total Stats', 'Base Stats', 'W/F', 'SM', 'A/W', 'D/W', 'IR', 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'OVA']]
    # dropped.columns = list(map(normit, dropped.columns))
    
    df.columns = list(map(normit, df.columns))
    dropped = df[[
        'age', 'growth', 'value', 'wage', 'reactions', 'power', 'shot_power', 'vision', 
        'composure', 'total_stats', 'base_stats', 'pas', 'dri', 'phy', 
        'ova'
        'd_w'
        ]]

    dropped['composure'] = dropped['composure'].fillna(np.mean(dropped['composure'])).copy()
    # dropped['a_w'] = dropped['a_w'].fillna('Medium').copy()
    dropped['d_w'] = dropped['d_w'].fillna('Medium').copy()
    dropped = dropped.dropna(subset=['vision'])
    # dropped['height'] = dropped['height'].apply(get_height)
    # dropped['weight_lbs'] = dropped['weight'].str.rstrip("lbs").astype(int)
    # dropped = dropped.drop(['weight'], axis=1)
    dropped['value'] = dropped['value'].apply(get_value)
    dropped['wage'] = dropped['wage'].apply(get_value)
    # dropped['release_clause'] = dropped['release_clause'].apply(get_value) 
    # dropped['w_f'] = dropped['w_f'].apply(get_stars)
    # dropped['sm'] = dropped['sm'].apply(get_stars)
    # dropped['ir'] = dropped['ir'].apply(get_stars)

    return dropped


In [79]:
def preprocess(df: pd.DataFrame):
    cleaned = clean_fifa_df(df)
    y = cleaned['ova']
    cleaned = cleaned.drop(['ova'], axis=1)
    nums = cleaned.select_dtypes(include=np.number)
    cats = cleaned.select_dtypes(include=object)
    nums = minmax(nums)
    cats = onehot(cats)
    X = pd.concat((nums, cats), axis=1)
    return X, y

In [80]:
validate = pd.read_csv('fifa21_validate.csv')
train = pd.read_csv('fifa21_train.csv')
validate.columns == train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [81]:
X, y = preprocess(train)
X2, y2 = preprocess(validate)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)

preds_test = lm.predict(X_test)
print(r2_score(y_test, preds_test))

preds = lm.predict(X2)
print(r2_score(y2, preds))

KeyError: 'd_w'

In [None]:
### pickle test

# X, y = preprocess(validate)

# model = pickle.load(open('model.sav', 'rb'))

# preds = model.predict(X)
# print(r2_score(y, preds))