In [106]:
import pandas as pd 
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score

warnings.simplefilter("ignore")
sns.set_style('darkgrid')

In [107]:
''' reading dataset '''
df = pd.read_csv('Forbes Richest Atheletes (Forbes Richest Athletes 1990-2020).csv')

In [108]:
''' first 5 rows '''
df.head()

Unnamed: 0,S.NO,Name,Nationality,Current Rank,Previous Year Rank,Sport,Year,earnings ($ million)
0,1,Mike Tyson,USA,1,,boxing,1990,28.6
1,2,Buster Douglas,USA,2,,boxing,1990,26.0
2,3,Sugar Ray Leonard,USA,3,,boxing,1990,13.0
3,4,Ayrton Senna,Brazil,4,,auto racing,1990,10.0
4,5,Alain Prost,France,5,,auto racing,1990,9.0


In [109]:
''' shape of dataset '''
df.shape

(301, 8)

In [110]:
''' checking null values '''
df.isnull().sum()

S.NO                     0
Name                     0
Nationality              0
Current Rank             0
Previous Year Rank      24
Sport                    0
Year                     0
earnings ($ million)     0
dtype: int64

In [111]:
'''  creating a dictonary inf_df '''

''' formula ---> Final value = Initial value * CPI final/CPI initial  '''

inf_df = {'cpi_per_year': [53.2, 56.45, 58.18, 59.87, 61.51, 63.16, 64.76, 66.92, 68.05, 
                          69.15, 71.01, 73.41, 74.55, 76.32, 77.76, 80.29, 83.03, 85.14, 88.62, 
                          88.7, 91.11, 92.47, 95.21, 96.87, 98.33, 99.07, 99.79, 101.86, 104.01, 106],
          
         'year': [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
                  2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
                 2018, 2019]}

In [112]:
''' creating inf_df dataframe '''
inf_df = pd.DataFrame(inf_df, columns=['year', 'cpi_per_year'])

In [113]:
''' displaying data '''
inf_df

Unnamed: 0,year,cpi_per_year
0,1990,53.2
1,1991,56.45
2,1992,58.18
3,1993,59.87
4,1994,61.51
5,1995,63.16
6,1996,64.76
7,1997,66.92
8,1998,68.05
9,1999,69.15


In [114]:
''' creaing new column "npv" in df '''
npv = []
for y, d in zip(df['Year'], df['earnings ($ million)']): 
    for y_, c_ in zip(inf_df['year'], inf_df['cpi_per_year']):
        if y == y_: 
            d = d*((inf_df.at[29, 'cpi_per_year'])/c_)
            npv.append(round(d, 2))

In [115]:
''' to balance the df we are taking 290 rows only '''
df = df.loc[:290, :]

''' displaying last five rows'''
df.tail()

Unnamed: 0,S.NO,Name,Nationality,Current Rank,Previous Year Rank,Sport,Year,earnings ($ million)
286,287,Russell Wilson,USA,6,??,American Football,2019,89.5
287,288,Aaron Rogers,USA,7,??,American Football,2019,89.3
288,289,LeBron James,USA,8,6,Basketball,2019,89.0
289,290,Stephen Curry,USA,9,8,Basketball,2019,79.8
290,291,Kevin Durant,USA,10,11,Basketball,2019,65.4


In [116]:
''' adding npv into df '''
df['npv'] = npv

''' displaying datas '''
df.head()

Unnamed: 0,S.NO,Name,Nationality,Current Rank,Previous Year Rank,Sport,Year,earnings ($ million),npv
0,1,Mike Tyson,USA,1,,boxing,1990,28.6,56.98
1,2,Buster Douglas,USA,2,,boxing,1990,26.0,51.8
2,3,Sugar Ray Leonard,USA,3,,boxing,1990,13.0,25.9
3,4,Ayrton Senna,Brazil,4,,auto racing,1990,10.0,19.92
4,5,Alain Prost,France,5,,auto racing,1990,9.0,17.93


In [117]:
''' removing columns which contain null values i.e Sport column '''
df.drop(['Previous Year Rank'], axis=1, inplace=True)

In [118]:
''' now again checking null values '''
df.isnull().sum()

S.NO                    0
Name                    0
Nationality             0
Current Rank            0
Sport                   0
Year                    0
earnings ($ million)    0
npv                     0
dtype: int64

In [119]:
''' converting each word in Sport column into lower case '''
def lower_case(word):
    return word.lower()

df['Sport'] = df['Sport'].map(lower_case)

''' now checking unique categories in Sport column '''
df.Sport.unique()

array(['boxing', 'auto racing', 'golf', 'basketball', 'tennis', 'nfl',
       'nba', 'baseball', 'ice hockey', 'american football / baseball',
       'f1 motorsports', 'nascar', 'hockey', 'auto racing (nascar)',
       'f1 racing', 'american football', 'soccer', 'cycling',
       'motorcycle gp', 'mma'], dtype=object)

In [120]:
''' converting Sport category into label encoder '''
lbl_enc = LabelEncoder()

df['Sport'] = lbl_enc.fit_transform(df['Sport'])

In [121]:
''' independent and dependent features '''
X = df[['Current Rank', 'Sport', 'Year']].values
y = df[['npv']].values

In [122]:
X

array([[   1,    6, 1990],
       [   2,    6, 1990],
       [   3,    6, 1990],
       [   4,    2, 1990],
       [   5,    2, 1990],
       [   6,   10, 1990],
       [   7,   10, 1990],
       [   8,    5, 1990],
       [   8,   10, 1990],
       [   8,    6, 1990],
       [   1,    6, 1991],
       [   2,    6, 1991],
       [   3,    5, 1991],
       [   4,    6, 1991],
       [   5,    2, 1991],
       [   6,    2, 1991],
       [   7,    6, 1991],
       [   8,   10, 1991],
       [   9,    2, 1991],
       [  10,   10, 1991],
       [   1,    5, 1992],
       [   2,    6, 1992],
       [   3,    2, 1992],
       [   4,    2, 1992],
       [   5,   10, 1992],
       [   6,   19, 1992],
       [   7,   17, 1992],
       [   8,   10, 1992],
       [   9,   19, 1992],
       [  10,   19, 1992],
       [   1,    5, 1993],
       [   2,    6, 1993],
       [   3,    2, 1993],
       [   4,    2, 1993],
       [   5,    6, 1993],
       [   6,   16, 1993],
       [   7,    6, 1993],
 

In [123]:
y

array([[ 56.98],
       [ 51.8 ],
       [ 25.9 ],
       [ 19.92],
       [ 17.93],
       [ 17.14],
       [ 16.94],
       [ 16.14],
       [ 16.14],
       [ 16.14],
       [113.6 ],
       [ 59.15],
       [ 30.04],
       [ 27.23],
       [ 24.41],
       [ 20.66],
       [ 19.15],
       [ 17.46],
       [ 16.9 ],
       [ 15.96],
       [ 65.41],
       [ 51.01],
       [ 40.08],
       [ 26.42],
       [ 20.22],
       [ 20.04],
       [ 17.31],
       [ 16.76],
       [ 16.4 ],
       [ 15.49],
       [ 63.74],
       [ 44.26],
       [ 32.75],
       [ 28.33],
       [ 27.97],
       [ 26.91],
       [ 26.56],
       [ 22.49],
       [ 22.31],
       [ 20.36],
       [ 51.7 ],
       [ 28.78],
       [ 25.5 ],
       [ 23.44],
       [ 23.26],
       [ 23.26],
       [ 20.85],
       [ 20.68],
       [ 19.65],
       [ 19.47],
       [ 73.68],
       [ 67.13],
       [ 37.76],
       [ 37.26],
       [ 36.75],
       [ 30.21],
       [ 26.85],
       [ 25.34],
       [ 25.17

In [124]:
''' train test split '''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [125]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

X_train shape:  (232, 3)
X_test shape:  (59, 3)


In [126]:
''' Scaling data '''
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [127]:
X_train

array([[-0.48976556, -0.59330076, -0.93772678],
       [-1.51960492, -0.59330076, -1.39597347],
       [ 0.5400738 , -0.41615047, -1.16685013],
       [-0.48976556,  1.88680331,  1.46806831],
       [ 0.88335359,  1.70965302,  0.89525996],
       [-0.14648577,  1.88680331,  0.89525996],
       [ 0.88335359,  0.2924507 , -1.39597347],
       [ 0.5400738 ,  1.88680331, -0.02123341],
       [ 1.22663338,  0.2924507 ,  1.23894497],
       [ 0.88335359, -0.77045105, -1.2814118 ],
       [ 0.19679402,  0.2924507 ,  0.55157494],
       [ 1.22663338, -1.47905221,  0.20788993],
       [-0.48976556, -0.59330076,  0.89525996],
       [ 0.19679402, -0.59330076,  0.3224516 ],
       [-0.83304534, -0.41615047,  0.20788993],
       [-1.17632513,  1.70965302,  1.69719166],
       [-1.17632513,  0.2924507 , -0.4794801 ],
       [ 0.88335359,  1.70965302,  1.00982163],
       [ 0.5400738 , -0.59330076, -0.82316511],
       [ 0.19679402, -0.59330076, -0.02123341],
       [ 0.5400738 , -0.41615047, -0.250

In [128]:
''' Random Forest Regressor '''
rf_reg = RandomForestRegressor()

''' fit on data'''
rf_reg.fit(X_train, y_train)

RandomForestRegressor()

In [129]:
''' prediction '''
pred = rf_reg.predict(X_test)

In [130]:
''' checking performance of the model '''
print("Mean Absolute Error: ", mean_absolute_error(y_test, pred))
print("R2 Score: ", r2_score(y_test, pred))

Mean Absolute Error:  4.237189830508475
R2 Score:  0.9496003994230373


In [138]:
''' reshaping y_test variable from (59, 1) to (59,)'''
y_test = y_test.reshape(59,)

In [139]:
pd.DataFrame({'Actual': y_test, 'Predicted': pred})

Unnamed: 0,Actual,Predicted
0,53.62,57.6069
1,105.0,95.1232
2,38.97,41.9388
3,137.55,124.6968
4,31.28,33.5018
5,110.07,92.0496
6,55.44,44.7102
7,38.3,40.8521
8,33.33,35.5936
9,23.26,24.1829
