In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [3]:
ipl_auction_df = pd.read_csv('IPL IMB381IPL2013.csv')
ipl_auction_df.head(5)

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,...,SR-B,SIXERS,RUNS-C,WKTS,AVE-BL,ECON,SR-BL,AUCTION YEAR,BASE PRICE,SOLD PRICE
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0,...,0.0,0,307,15,20.47,8.9,13.93,2009,50000,50000
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41,...,0.0,0,29,0,0.0,14.5,0.0,2008,50000,50000
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62,...,121.01,5,1059,29,36.52,8.81,24.9,2008,200000,350000
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56,...,76.32,0,1125,49,22.96,6.23,22.14,2011,100000,850000
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93,...,120.71,28,0,0,0.0,0.0,0.0,2011,100000,800000


In [4]:
ipl_auction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   PLAYER NAME    130 non-null    object 
 2   AGE            130 non-null    int64  
 3   COUNTRY        130 non-null    object 
 4   TEAM           130 non-null    object 
 5   PLAYING ROLE   130 non-null    object 
 6   T-RUNS         130 non-null    int64  
 7   T-WKTS         130 non-null    int64  
 8   ODI-RUNS-S     130 non-null    int64  
 9   ODI-SR-B       130 non-null    float64
 10  ODI-WKTS       130 non-null    int64  
 11  ODI-SR-BL      130 non-null    float64
 12  CAPTAINCY EXP  130 non-null    int64  
 13  RUNS-S         130 non-null    int64  
 14  HS             130 non-null    int64  
 15  AVE            130 non-null    float64
 16  SR-B           130 non-null    float64
 17  SIXERS         130 non-null    int64  
 18  RUNS-C    

### df.iloc() - displaying a subset of the dataset

In [6]:
ipl_auction_df.iloc[0:5, 0:10]

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.0
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93


### Building the Model- 

In [8]:
X_features = ipl_auction_df.columns
X_features

Index(['Sl.NO.', 'PLAYER NAME', 'AGE', 'COUNTRY', 'TEAM', 'PLAYING ROLE',
       'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C',
       'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR', 'BASE PRICE',
       'SOLD PRICE'],
      dtype='object')

#### Encoding categorical variables

In [10]:
ipl_auction_df['PLAYING ROLE'].unique()

array(['Allrounder', 'Bowler', 'Batsman', 'W. Keeper'], dtype=object)

In [11]:
pd.get_dummies(ipl_auction_df['PLAYING ROLE'], dtype = float).head()

Unnamed: 0,Allrounder,Batsman,Bowler,W. Keeper
0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0


In [12]:
categorical_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'CAPTAINCY EXP']

In [13]:
ipl_auction_encoded_df = pd.get_dummies(ipl_auction_df[X_features], columns = categorical_features,
                                        drop_first = True)

In [14]:
ipl_auction_encoded_df.columns

Index(['Sl.NO.', 'PLAYER NAME', 'TEAM', 'T-RUNS', 'T-WKTS', 'ODI-RUNS-S',
       'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL', 'RUNS-S', 'HS', 'AVE', 'SR-B',
       'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR',
       'BASE PRICE', 'SOLD PRICE', 'AGE_2', 'AGE_3', 'COUNTRY_BAN',
       'COUNTRY_ENG', 'COUNTRY_IND', 'COUNTRY_NZ', 'COUNTRY_PAK', 'COUNTRY_SA',
       'COUNTRY_SL', 'COUNTRY_WI', 'COUNTRY_ZIM', 'PLAYING ROLE_Batsman',
       'PLAYING ROLE_Bowler', 'PLAYING ROLE_W. Keeper', 'CAPTAINCY EXP_1'],
      dtype='object')

In [15]:
X_features = ipl_auction_encoded_df.columns

### Splitting the dataset into train and validation set

In [20]:
X = sm.add_constant(ipl_auction_encoded_df)
Y = ipl_auction_df['SOLD PRICE']

train_X, test_X, train_y, test_y = train_test_split(X,
                                                    Y,
                                                    train_size = 0.8,
                                                    random_state = 42)

In [26]:
train_X.dtypes

const                     float64
Sl.NO.                      int64
PLAYER NAME                object
TEAM                       object
T-RUNS                      int64
T-WKTS                      int64
ODI-RUNS-S                  int64
ODI-SR-B                  float64
ODI-WKTS                    int64
ODI-SR-BL                 float64
RUNS-S                      int64
HS                          int64
AVE                       float64
SR-B                      float64
SIXERS                      int64
RUNS-C                      int64
WKTS                        int64
AVE-BL                    float64
ECON                      float64
SR-BL                     float64
AUCTION YEAR                int64
BASE PRICE                  int64
SOLD PRICE                  int64
AGE_2                        bool
AGE_3                        bool
COUNTRY_BAN                  bool
COUNTRY_ENG                  bool
COUNTRY_IND                  bool
COUNTRY_NZ                   bool
COUNTRY_PAK   

In [32]:
train_X = train_X.astype(float)
train_y = train_y.astype(float)


### Variance Inflation factor

In [45]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif_factors(X):
    X_matrix = X.values
    vif = [ variance_inflation_factor(X_matrix, i) for i in range (X_matrix.shape[1])]
    vif_factors = pd.dataFrame()
    vif_factors['columns'] = X.columns
    vif_factors['vif'] = vif

    return vif_factors

In [47]:
vif_factors = get_vif_factors(X[X_features])
vif_factors

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''