In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [3]:
# Training split. NOTE: absolute Windows path -- adjust for the local machine.
train = pd.read_csv("C:/HackerEarthML/Dataset/train.csv")

In [4]:
# Hold-out split to predict on. NOTE: absolute Windows path -- adjust for the local machine.
test = pd.read_csv("C:/HackerEarthML/Dataset/test.csv")

In [5]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,Won_Championship,Previous_SB_Wins,Number_Of_Wins_This_Season,Number_Of_First_Round_Draft_Picks,Team_Value,Playing_Style,Average_Player_Age,Number_Of_Injured_Players,Coach_Experience_Level,ID
0,0,3,13,2,Less_Than_Four_Billion,Balanced,27,five,Intermediate,6056
1,0,2,14,2,Less_Than_Four_Billion,Aggressive_Offense,26,five,Intermediate,9702
2,1,2,13,1,Less_Than_Four_Billion,Aggressive_Defense,27,four,Intermediate,1745
3,0,2,12,2,Above_Four_Billion,Balanced,27,six,Intermediate,4757
4,0,1,15,2,Less_Than_Four_Billion,Balanced,26,five,Intermediate,7242


In [6]:
# Column dtypes and non-null counts before any encoding is applied.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 10 columns):
Won_Championship                     6500 non-null int64
Previous_SB_Wins                     6500 non-null int64
Number_Of_Wins_This_Season           6500 non-null int64
Number_Of_First_Round_Draft_Picks    6500 non-null int64
Team_Value                           6500 non-null object
Playing_Style                        6500 non-null object
Average_Player_Age                   6500 non-null int64
Number_Of_Injured_Players            6500 non-null object
Coach_Experience_Level               6500 non-null object
ID                                   6500 non-null int64
dtypes: int64(6), object(4)
memory usage: 507.9+ KB


### Team_Value

In [7]:
# Summary of the Team_Value column (count / unique / most frequent value).
train["Team_Value"].describe()

count                       6500
unique                         3
top       Less_Than_Four_Billion
freq                        5130
Name: Team_Value, dtype: object

In [8]:
# Distinct Team_Value labels that the encoding has to cover.
train["Team_Value"].unique()

array(['Less_Than_Four_Billion', 'Above_Four_Billion',
       'Less_Than_Three_Billion'], dtype=object)

In [9]:
# Ordinal encoding for Team_Value: a larger code means a higher valuation bracket.
mapper = {
    'Less_Than_Three_Billion': 1,
    'Less_Than_Four_Billion': 2,
    'Above_Four_Billion': 3,
}

In [10]:
# Swap the Team_Value labels for their ordinal codes from `mapper`.
train["Team_Value"] = train["Team_Value"].replace(mapper)

In [11]:
# Sanity check: only the integer codes should remain after the replacement.
train["Team_Value"].unique()

array([2, 3, 1], dtype=int64)

### Playing_Style

In [12]:
# Summary of the Playing_Style column (count / unique / most frequent value).
train["Playing_Style"].describe()

count                   6500
unique                     4
top       Aggressive_Defense
freq                    3204
Name: Playing_Style, dtype: object

In [13]:
# Distinct Playing_Style labels present in the training data.
train["Playing_Style"].unique()

array(['Balanced', 'Aggressive_Offense', 'Aggressive_Defense', 'Relaxed'],
      dtype=object)

### Number_Of_Injured_Players

In [14]:
# The injured-player count is stored as number words; list the distinct ones.
train["Number_Of_Injured_Players"].unique()

array(['five', 'four', 'six', 'three', 'seven', 'eight', 'two', 'nine',
       'one', 'ten'], dtype=object)

In [15]:
# Number word -> integer, used to turn Number_Of_Injured_Players into a numeric column.
mapper2 = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
}

In [16]:
# Convert the number words to integers using `mapper2`.
train["Number_Of_Injured_Players"] = train["Number_Of_Injured_Players"].replace(mapper2)

In [17]:
# Numeric summary of the injured-player counts after conversion.
train["Number_Of_Injured_Players"].describe()

count    6500.000000
mean        5.425692
std         1.310832
min         1.000000
25%         5.000000
50%         5.000000
75%         6.000000
max        10.000000
Name: Number_Of_Injured_Players, dtype: float64

### Coach_Experience_Level

In [18]:
# Distinct coach experience labels that the encoding has to cover.
train["Coach_Experience_Level"].unique()

array(['Intermediate', 'Beginner', 'Advanced'], dtype=object)

In [19]:
# Ordinal encoding for Coach_Experience_Level: Beginner < Intermediate < Advanced.
mapper3 = {
    'Beginner': 1,
    'Intermediate': 2,
    'Advanced': 3,
}

In [20]:
# Swap the experience labels for their ordinal codes from `mapper3`.
train["Coach_Experience_Level"] = train["Coach_Experience_Level"].replace(mapper3)

In [21]:
# Re-inspect dtypes now that Team_Value, Number_Of_Injured_Players and
# Coach_Experience_Level have been passed through replace().
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 10 columns):
Won_Championship                     6500 non-null int64
Previous_SB_Wins                     6500 non-null int64
Number_Of_Wins_This_Season           6500 non-null int64
Number_Of_First_Round_Draft_Picks    6500 non-null int64
Team_Value                           6500 non-null int64
Playing_Style                        6500 non-null object
Average_Player_Age                   6500 non-null int64
Number_Of_Injured_Players            6500 non-null int64
Coach_Experience_Level               6500 non-null int64
ID                                   6500 non-null int64
dtypes: int64(9), object(1)
memory usage: 507.9+ KB


In [22]:
# Count missing values per column.
train.isna().sum()

Won_Championship                     0
Previous_SB_Wins                     0
Number_Of_Wins_This_Season           0
Number_Of_First_Round_Draft_Picks    0
Team_Value                           0
Playing_Style                        0
Average_Player_Age                   0
Number_Of_Injured_Players            0
Coach_Experience_Level               0
ID                                   0
dtype: int64

In [23]:
# List the column names to pick the model inputs from.
train.columns

Index(['Won_Championship', 'Previous_SB_Wins', 'Number_Of_Wins_This_Season',
       'Number_Of_First_Round_Draft_Picks', 'Team_Value', 'Playing_Style',
       'Average_Player_Age', 'Number_Of_Injured_Players',
       'Coach_Experience_Level', 'ID'],
      dtype='object')

### Feature Selection

In [24]:
# Candidate input columns: everything except the target (Won_Championship) and ID.
features = [
    'Previous_SB_Wins',
    'Number_Of_Wins_This_Season',
    'Number_Of_First_Round_Draft_Picks',
    'Team_Value',
    'Playing_Style',
    'Average_Player_Age',
    'Number_Of_Injured_Players',
    'Coach_Experience_Level',
]

In [25]:
# Restrict the training frame to the candidate input columns.
train_df = train[features]

In [26]:
# Preview the selected feature columns.
train_df.head()

Unnamed: 0,Previous_SB_Wins,Number_Of_Wins_This_Season,Number_Of_First_Round_Draft_Picks,Team_Value,Playing_Style,Average_Player_Age,Number_Of_Injured_Players,Coach_Experience_Level
0,3,13,2,2,Balanced,27,5,2
1,2,14,2,2,Aggressive_Offense,26,5,2
2,2,13,1,2,Aggressive_Defense,27,4,2
3,2,12,2,3,Balanced,27,6,2
4,1,15,2,2,Balanced,26,5,2


In [27]:
#train_df.var().sort_values()

In [28]:
#train_df.drop(["Coach_Experience_Level","Team_Value"],axis=1,inplace=True)

In [29]:
# Shape before one-hot encoding.
train_df.shape

(6500, 8)

In [30]:
# One-hot encode the remaining categorical column; drop_first=True omits one
# level per categorical so the dummy columns are not redundant.
train_df = pd.get_dummies(data=train_df, drop_first=True)

In [31]:
# Shape after one-hot encoding.
train_df.shape

(6500, 10)

In [32]:
# Reduced feature set actually fed to the models.
new_features = [
    'Number_Of_Wins_This_Season',
    'Number_Of_First_Round_Draft_Picks',
    'Average_Player_Age',
    'Number_Of_Injured_Players',
]

In [33]:
# Keep only the reduced feature set from the encoded training frame.
new_df = train_df[new_features]

### Defining X_train and Y_train

In [34]:
# Feature matrix as a plain NumPy array (one row per training sample).
X_train = new_df.values

In [35]:
# Target vector: the Won_Championship column of the training frame.
Y_train = train["Won_Championship"].values

### Models used

In [36]:
# Instantiate the candidate first-level models. The classifier classes are
# imported in the import cell at the top of the notebook.
# Alternatives tried during exploration and left out here: CatBoostClassifier,
# Perceptron, KNeighborsClassifier.
ran = RandomForestClassifier(random_state=1)  # base estimator for the grid search over n_estimators
XGB = XGBClassifier()
svc = SVC()

In [37]:
# Candidate forest sizes to compare.
params_rf = {'n_estimators': [50, 100, 200]}

# Exhaustive search over the grid with 5-fold cross-validation.
rf_gs = GridSearchCV(estimator=ran, param_grid=params_rf, cv=5)

# Run the search on the training data (the fitted search object is echoed as cell output).
rf_gs.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
# Report which n_estimators value won the grid search.
print(rf_gs.best_params_)

# Keep the winning random forest for use in the ensemble.
rf_best = rf_gs.best_estimator_

{'n_estimators': 50}


In [39]:
# Logistic regression with library-default hyperparameters.
log_reg = LogisticRegression()

# Train on the selected features (the fitted model is echoed as cell output).
log_reg.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [40]:
# Train the gradient-boosted and SVM models on the selected features.
# The un-tuned random forest `ran` is deliberately not fit here.
XGB = XGB.fit(X_train, Y_train)
svc = svc.fit(X_train, Y_train)



In [41]:
#reg.feature_importances_

In [42]:
# Preview the raw test frame.
test.head()

Unnamed: 0,Previous_SB_Wins,Number_Of_Wins_This_Season,Number_Of_First_Round_Draft_Picks,Team_Value,Playing_Style,Average_Player_Age,Number_Of_Injured_Players,Coach_Experience_Level,ID
0,3,16,2,Above_Four_Billion,Relaxed,26,two,Intermediate,1
1,2,15,3,Less_Than_Four_Billion,Balanced,26,six,Advanced,5
2,2,10,1,Less_Than_Four_Billion,Balanced,27,nine,Beginner,6
3,3,14,3,Less_Than_Four_Billion,Balanced,26,seven,Intermediate,7
4,3,13,2,Less_Than_Four_Billion,Balanced,27,three,Intermediate,9


In [43]:
# Take an explicit copy of the selected columns so the encoding assignments
# that follow write to an independent frame rather than to a slice of `test`
# (a plain `test[features]` triggers pandas' SettingWithCopyWarning on assignment).
test_df = test[features].copy()

In [44]:
# Apply the same Team_Value encoding that was used for the training data.
test_df["Team_Value"] = test_df["Team_Value"].replace(mapper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [45]:
# Apply the same number-word conversion that was used for the training data.
test_df["Number_Of_Injured_Players"] = test_df["Number_Of_Injured_Players"].replace(mapper2)

In [46]:
# Apply the same coach-experience encoding that was used for the training data.
test_df["Coach_Experience_Level"] = test_df["Coach_Experience_Level"].replace(mapper3)

In [47]:
#test_df.drop(["Coach_Experience_Level","Team_Value"],axis=1,inplace=True)

In [48]:
# One-hot encode the test frame with the same settings as the training frame.
# NOTE(review): train and test are encoded independently, so their dummy
# columns are not guaranteed to line up; only `new_features` is selected
# afterwards, and none of those are dummy columns.
test_df = pd.get_dummies(data=test_df, drop_first=True)

In [49]:
# Test feature matrix, with the same columns and order as X_train.
new_test = test_df[new_features].values

In [50]:
# Confirm the test matrix has one column per entry in new_features.
new_test.shape

(3500, 4)

In [51]:
# Eyeball the encoded test matrix.
new_test

array([[16,  2, 26,  2],
       [15,  3, 26,  6],
       [10,  1, 27,  9],
       ...,
       [13,  2, 27,  7],
       [15,  2, 26,  4],
       [15,  2, 27,  3]], dtype=int64)

### Using an ensemble technique to combine multiple first-level models

In [52]:
# Soft-voting ensemble over the three first-level models; the tuned random
# forest gets three times the weight of the other two. VotingClassifier is
# imported in the import cell at the top of the notebook.
new_model = VotingClassifier(
    estimators=[('XGB', XGB), ('ran', rf_best), ('log_reg', log_reg)],
    voting='soft',
    weights=[1, 3, 1],
)

In [53]:
# Fit the voting ensemble on the training data (the fitted ensemble is echoed as cell output).
new_model.fit(X_train,Y_train)



VotingClassifier(estimators=[('XGB', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objectiv...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft',
         weights=[1, 3, 1])

In [54]:
# Predict the class label for every row of the test matrix.
Y_pred = new_model.predict(new_test)

In [55]:
# Quick look at the predicted labels.
Y_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [56]:
# Pair each test ID with its predicted label for the submission file.
data = dict(ID=test.ID, Won_Championship=Y_pred)

In [57]:
# Submission frame with the two columns ID and Won_Championship.
sub = pd.DataFrame(data=data)

In [58]:
# Write the submission to the working directory, without the index column.
sub.to_csv("HackEarSubRF+XGB+LogReg.csv", index=False)