In [1]:
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib, requests, mechanicalsoup
import http.cookiejar as cookielib
from bs4 import BeautifulSoup
import time, traceback, pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import NearestNeighbors
from copy import deepcopy

In [2]:
def sparq_from_row(row):
    """Compute the "sparq" score for one row of combine measurements.

    Parameters
    ----------
    row : mapping
        Anything indexable by the column names used below (a dict or a
        pandas row both work).

    Returns
    -------
    The intercept plus a weighted sum of the measurements.
    """
    intercept = 181.924227

    # (coefficient, column, divisor). The 30.48 and 2.54 divisors look like
    # cm -> ft and cm -> in conversions -- TODO confirm the input units.
    weighted_terms = (
        (-4.592111, "Height", 30.48),
        (0.361056, "Weight", 1),
        (-15.159259, "40 Yard Dash", 1),
        (-45.532307, "10 Yard Split", 1),
        (-5.985102, "3-Cone Drill", 1),
        (-18.731278, "20 Yard Shuttle", 1),
        (0.252199, "Bench Press", 1),
        (9.557272, "Broad Jump", 30.48),
        (1.327290, "Vertical Jump", 2.54),
    )

    score = intercept
    for coefficient, column, divisor in weighted_terms:
        score = score + coefficient * row[column] / divisor
    return score

def height_string_to_cm(string):
    """Convert a feet-and-inches height string to centimeters.

    Handles strings such as ``6' 2⅜"``, including Unicode vulgar fractions
    and the ``*`` marker that may follow a value. The space after the feet
    mark is optional, and a missing inches part (``6'``) counts as zero
    inches.

    Parameters
    ----------
    string : str
        Height as ``<feet>'`` optionally followed by ``<inches>"``.

    Returns
    -------
    float
        Height in centimeters.

    Raises
    ------
    ValueError
        If the feet or inches part cannot be parsed as a number.
    """
    fraction_decimals = {
        "⅛": ".125", "¼": ".25", "⅜": ".375", "½": ".5",
        "⅝": ".625", "¾": ".75", "⅞": ".875",
    }

    # Split on the feet mark rather than on whitespace so that both
    # 6' 2" and 6'2" are accepted.
    feet_text, _, inches_text = string.partition("'")
    ft = int(feet_text)

    inches_text = inches_text.replace("*", "").replace('"', "").strip()
    for glyph, decimal in fraction_decimals.items():
        inches_text = inches_text.replace(glyph, decimal)
    inches = float(inches_text) if inches_text else 0.0

    cm = (12*ft + inches) * 2.54
    return cm

def length_to_in(string):
    """Parse a length string into a float number of inches.

    Drops the ``*`` and ``"`` markers and rewrites Unicode vulgar fractions
    as their decimal equivalents before converting to float.
    """
    cleanup_table = str.maketrans({
        "*": None,
        '"': None,
        "⅛": ".125",
        "¼": ".25",
        "⅜": ".375",
        "½": ".5",
        "⅝": ".625",
        "¾": ".75",
        "⅞": ".875",
    })
    return float(string.translate(cleanup_table))


In [3]:
stime = time.time()

#Measurements to request from mockdraftable, and the subset whose values
#are length strings that go through length_to_in
measurables = ["height", "weight", "wingspan", "arms", "hands", "10yd", "40yd", "bench", "vertical", "broad", "3cone", "20ss"]
needs_conversion = ["wingspan", "arms", "hands", "vertical", "broad"]

#Known name mismatches on the websites, mapped to the spelling used here
corrections = {
    "r.jay soward": "r. jay soward",
    "johnathan holland":"jonathan holland",
    "terrance toliver":"terrence toliver",
    "odell beckham jr." : "odell beckham, jr.",
    "mike campanaro":"michael campanaro",
    "t.j.  jones" : "t.j. jones",
    "chris  godwin":"chris godwin",
    "dante pettis":"dante pettis "
}

#Site measurable code -> readable column name used in the rest of the notebook
key_change = {
        "40yd":"40 Yard Dash",
        "height":"Height",
        "weight":"Weight",
        "vertical":"Vertical Jump",
        "bench":"Bench Press",
        "broad":"Broad Jump",
        "3cone":"3-Cone Drill",
        "20ss":"20 Yard Shuttle",
        "arms":"Arm Length",
        "hands":"Hand Size",
        "10yd":"10 Yard Split",
        "wingspan":"Wingspan"
}

#Collect everything in a plain dict keyed by player name; it is cheaper to
#build a dataframe from it once than to concat row by row.
combines = {}

for measurable in measurables:
    print("Working on", measurable, time.ctime().split()[-2])
    alt_measurable = key_change[measurable]

    #Page numbers are capped at 399; a ValueError raised while handling a
    #page ends the scan for this measurable.
    for i in range(1,400):
        try:
            url = "https://www.mockdraftable.com/search?position=ATH&beginYear=1999&endYear=2019&sort=DESC&page="+str(i)+"&measurable=" + measurable
            html = urlopen(url)
            soup = BeautifulSoup(html, 'lxml')

            #Names are in <h5> tags and values in <figure> tags; pair them up
            for n, h in zip(soup.find_all("h5"), soup.find_all("figure")):
                name = n.text.lower()
                name = corrections.get(name, name)

                #Create the player's entry on first sight
                player_stats = combines.setdefault(name, {})

                #"?" means the measurement was not taken
                if h.text == "?":
                    continue

                if measurable == "height":
                    player_stats[alt_measurable] = height_string_to_cm(h.text)
                    continue

                #Strip unit suffixes and markers before parsing the number
                val_string = h.text.replace("*","").replace("lbs","").replace("reps","").replace("s","")
                if measurable in needs_conversion:
                    player_stats[alt_measurable] = length_to_in(val_string)
                else:
                    player_stats[alt_measurable] = float(val_string)
        except ValueError:
            break

print("Done in %0.2f minutes" %((time.time()-stime)/60))

Working on height 09:24:15


KeyboardInterrupt: 

In [4]:
def predictions_with_threshold(probs,thresh = 0.6):
    """Turn class probabilities into 0/1 predictions using a custom cutoff.

    Parameters
    ----------
    probs : iterable of sequences
        One entry per sample; element ``[1]`` of each entry is compared
        against ``thresh``.
    thresh : float
        A sample is labelled 1 when its ``[1]`` probability is at least this
        value, otherwise 0.

    Returns
    -------
    numpy.ndarray
        Array of 0/1 labels, one per entry in ``probs``.
    """
    return np.asarray([1 if sample[1] >= thresh else 0 for sample in probs])

In [5]:
def plot_confusion_matrix(test, pred):
    """Plot, save and show a labelled 2x2 confusion matrix.

    Parameters
    ----------
    test : array-like of int
        True class indices (0 = 'Below Average', 1 = 'Above Average').
    pred : array-like of int
        Predicted class indices, same encoding and length as ``test``.

    Side effects
    ------------
    Writes ``football_confusion_matrix.png`` to the working directory and
    displays the figure.
    """
    labels = np.asarray(['Below Average', 'Above Average'])

    # `labels` must be passed by keyword: recent scikit-learn releases reject
    # it as a third positional argument.
    cm = confusion_matrix(labels[np.asarray(test)], labels[np.asarray(pred)], labels=labels)

    fig = plt.figure(figsize = [10,10])
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion Matrix of the Classifier', fontsize = 25)
    cbar = fig.colorbar(cax)
    cbar.ax.tick_params(labelsize=15)

    # Pin the tick positions to the matrix cells before labelling them, so
    # the labels do not depend on whatever ticks the default locator picks.
    tick_positions = list(range(len(labels)))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(list(labels), fontsize = 18)
    ax.set_yticklabels(list(labels), fontsize = 18)
    ax.xaxis.set_ticks_position("bottom")

    plt.xlabel('Predicted', fontsize = 25)
    plt.ylabel('True', fontsize = 25)
    plt.tight_layout()
    plt.savefig("football_confusion_matrix.png")
    plt.show()

In [6]:
def _format_param_value(value):
    """Format one grid-search parameter value for the results table."""
    if isinstance(value, (bool, np.bool_)):
        return str(value)
    if isinstance(value, (int, np.integer)):
        return "%d" % value
    if isinstance(value, (float, np.floating)):
        return "%0.2f" % value
    return str(value)


def grid_search(model, params, features, targets, cv = 3, scoring = "precision", verbose = True, plot=False):
    """Run a GridSearchCV and report the results.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid (``param_grid``).
    features, targets : array-like
        Training data passed to ``fit``.
    cv : int
        Number of cross-validation folds.
    scoring : str
        Scoring name; also used as the header of the score column.
    verbose : bool
        If True, print start time, the ranked table of all parameter
        combinations, and the elapsed time.
    plot : bool
        If True, scatter the first two grid parameters coloured by score.
        This works independently of ``verbose``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if verbose:
        stime = time.time()
        print("Started", time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime()))

    #Do the actual grid search
    clf = GridSearchCV(model, param_grid = params, scoring = scoring, cv = cv)
    clf.fit(features, targets)
    results = clf.cv_results_

    #Get the names of the parameters searched over.
    par_names = list(results["params"][0].keys())

    #Print the parameters of the best classifier.
    print("Best params:")
    for key, val in clf.best_params_.items():
        print(key + ": ", val)

    #The best classifier might be tied with (or very close to) the runner-up,
    #so rank every parameter combination by its mean test score.
    mean_scores = results["mean_test_score"]
    ranked = sorted(zip(mean_scores, range(len(mean_scores))), reverse=True)

    #One (parameter values, score in percent) entry per combination, best
    #first. Built regardless of `verbose` so that plotting never depends on it.
    ranked_rows = []
    for score, iter_num in ranked:
        pars = results["params"][iter_num]
        ranked_rows.append(([pars[n] for n in par_names], 100*score))

    if verbose:
        print()
        print(" ".join(par_names + [scoring]))
        for par_vals, pct_score in ranked_rows:
            cells = [_format_param_value(v) for v in par_vals]
            print(" ".join(cells + ["%0.2f" % pct_score]))
        print('Finished in', (time.time()-stime)/60, "minutes")

    #Diagnostic scatter of the first two grid parameters. The axis labels
    #assume a learning_rate / n_estimators grid -- NOTE(review): the grid keys
    #appear to come back in sorted order, which puts learning_rate first;
    #confirm for other grids.
    if plot:
        if len(par_names) < 2:
            print("Skipping plot: need at least two grid parameters, got", len(par_names))
        else:
            learning_rates = [par_vals[0] for par_vals, _ in ranked_rows]
            n_ests = [par_vals[1] for par_vals, _ in ranked_rows]
            scores = [pct_score for _, pct_score in ranked_rows]
            plt.scatter(learning_rates, n_ests, c=scores)
            plt.xlabel("Learning Rate")
            plt.ylabel("N_Estimators")
            plt.colorbar()
            plt.show()

    return clf

In [7]:
def get_ROC(thresholds, probs, labels):
    """Count true and false positives at each candidate threshold.

    Parameters
    ----------
    thresholds : iterable of float
        Cutoffs to evaluate; a sample is predicted positive when its
        probability is strictly greater than the cutoff.
    probs : numpy.ndarray
        Predicted probabilities.
    labels : array-like
        True 0/1 labels aligned with ``probs``.

    Returns
    -------
    numpy.ndarray
        One ``[n_true_positives, n_false_positives]`` row per threshold.
    """
    def _counts_at(cutoff):
        predicted = (probs > cutoff)*1
        difference = predicted - labels
        false_pos_mask = difference == 1
        true_pos_mask = (difference == 0) & (predicted == 1)
        return [sum(true_pos_mask), sum(false_pos_mask)]

    return np.asarray([_counts_at(cutoff) for cutoff in thresholds])

In [10]:
def _fill_missing_with_yearly_means(frame, columns, years, min_count=10):
    """Fill NaNs in ``columns`` with per-draft-year column means.

    For each year in ``years`` the rows with that ``draft_year`` are filled
    with that year's mean of each numeric column. When a column has fewer
    than ``min_count`` non-null values in a year, the mean over all of
    ``frame`` is used instead. Rows whose draft year is not in ``years`` are
    dropped from the result.
    """
    overall_means = frame[columns].mean(numeric_only=True)
    filled_years = []
    for year in years:
        this_year = frame[frame["draft_year"] == year]
        if this_year.empty:
            continue
        fill_values = this_year[columns].mean(numeric_only=True)
        too_sparse = this_year[fill_values.index].count() < min_count
        fill_values[too_sparse] = overall_means.reindex(fill_values.index)[too_sparse]
        filled_years.append(this_year.fillna(fill_values.to_dict()))
    if not filled_years:
        return frame.iloc[0:0]
    return pd.concat(filled_years)


def get_features_targets():
    """Build the wide-receiver feature table and its DVOA targets.

    Reads three pickle files from the working directory
    (``names_dvoas.pkl``, ``players.pkl``, ``mockdraftable_stats.pkl``),
    keeps the WR rows, fills missing combine measurements from the
    mockdraftable data and then from per-draft-year means (2000-2018), and
    looks up each remaining player's pro DVOA.

    Returns
    -------
    cleaned_wrs : pandas.DataFrame
        One row per WR with missing values filled.
    features : pandas.DataFrame
        The same object as ``cleaned_wrs``.
    targets : numpy.ndarray
        DVOA per row of ``cleaned_wrs``; ``-2000`` marks a player who does
        not appear in the pro list.
    played_a_game : numpy.ndarray
        1 where the player appears in the pro list, else 0.
    df : pandas.DataFrame
        The full players table (all positions) with lower-cased names.
        NOTE(review): the original returned an undefined name here; the full
        table is assumed to be what was intended -- confirm.
    """
    #SECURITY: pickle.load can execute arbitrary code. Only load files that
    #were produced locally by this project.
    with open("names_dvoas.pkl", "rb") as f:
        names, pro_dvoas = pickle.load(f)

    with open("players.pkl", "rb") as f:
        players = pickle.load(f)[0]

    with open("mockdraftable_stats.pkl", "rb") as f:
        combines = pickle.load(f)[0]

    #Convert to dataframes
    df = pd.DataFrame.from_dict(players, orient = "index")
    df["name"] = df["name"].str.lower()
    combines_df = pd.DataFrame.from_dict(combines, orient = "index")

    #Get WRs
    wrs = df[df["position"] == "WR"]

    #We'll assume stats with values = 0 means missing data, so change 0.00 to nan.
    wrs = wrs.where(wrs!=0.00, np.nan)
    wrs = wrs.replace("Pac-10", "Pac-12")

    #Make a new feature for conference played in. The line above normalises
    #the conference to "Pac-12", so that spelling has to be in the list; the
    #un-hyphenated form is kept as well in case it occurs in the data.
    power_five = ["ACC", "Big Ten", "Big 12", "SEC", "Pac-12", "Pac12"]
    wrs["Power 5"] = wrs["conference"].isin(power_five).astype(int)
    wrs = wrs.fillna({"rush_yds":0})

    #Drop duplicate names
    wrs = wrs.drop_duplicates(subset = ["name"], keep="first")

    combine_features = ["Height", "Weight", "40 Yard Dash", "Vertical Jump", "Broad Jump", "3-Cone Drill",\
                        "20 Yard Shuttle", "Bench Press", 'Arm Length', 'Hand Size', '10 Yard Split', 'Wingspan']

    #Fill any NaN's in the existing combine data with mockdraftable data.
    #Columns present on both sides come back from the join with the suffix;
    #fillna returns a new Series, so the result has to be assigned back.
    updated_wrs = wrs.join(combines_df, on = "name", rsuffix = "_mockdraftable")
    for feat in combine_features:
        backup_col = feat + "_mockdraftable"
        if backup_col in updated_wrs.columns:
            updated_wrs[feat] = updated_wrs[feat].fillna(updated_wrs[backup_col])
            del updated_wrs[backup_col]

    #Deal with the remaining missing data by replacing those values with the
    #average value of the stat for the player's draft year.
    old_wr_features = ["name", "draft_year", "Height", "Weight", "40 Yard Dash", "Vertical Jump", "Broad Jump", "3-Cone Drill", "20 Yard Shuttle", "Bench Press", "years_played", "games_played", "rec", "rec_yds", "rec_yds_per_rec", "rec_td", 'Arm Length', 'Hand Size', '10 Yard Split', 'Wingspan']
    cleaned_wrs = _fill_missing_with_yearly_means(updated_wrs, old_wr_features, range(2000, 2019))

    #Some values from before are in inches. Convert them to cm.
    cleaned_wrs.loc[cleaned_wrs["Broad Jump"] < 200, "Broad Jump"] *= 2.54
    cleaned_wrs.loc[cleaned_wrs["Vertical Jump"] <50, "Vertical Jump"] *= 2.54

    features = cleaned_wrs

    #Map every pro player's (lower-cased) name to their DVOA. Pros we have
    #no college row for are simply never looked up.
    target_dict = {name.lower(): dvoa for name, dvoa in zip(names, pro_dvoas)}

    #Build targets in the same row order as cleaned_wrs. Players missing
    #from the pro list get the -2000 sentinel and played_a_game = 0.
    targets = []
    played_a_game = []
    for name in cleaned_wrs["name"]:
        key = name.lower()
        if key in target_dict:
            targets.append(target_dict[key])
            played_a_game.append(1)
        else:
            targets.append(-2000)
            played_a_game.append(0)

    played_a_game = np.asarray(played_a_game)
    targets = np.asarray(targets)
    return cleaned_wrs, features, targets, played_a_game, df


In [None]:
#Build the WR table and its targets. Per the function body, `features` is the
#same DataFrame object as `cleaned_wrs`, so the column added below shows up on
#both names.
cleaned_wrs, features, targets, played_a_game, df = get_features_targets()

#The "targets" from above is the player's DVOA; store it alongside the
#features so later cells can index it by row label.
cleaned_wrs["DVOA"] = targets

In [None]:
#These are the features we'll use from the combine
combine_features = ['Height', 'Weight', '40 Yard Dash', 'Vertical Jump', 'Broad Jump', 'Arm Length']

#Combine those with relevant college stats.
df_subset = cleaned_wrs[combine_features+ ["rec", "rec_yds", "rec_td"]]

#Since we tried to take care of any nulls, any remaining nulls can't be
#saved, so drop them
df_mask = ~df_subset.isnull().any(axis=1)
pure_df = cleaned_wrs[df_mask]

#Weirdly, Kenny Clark makes the code barf, so just remove him.
pure_df = pure_df[(pure_df["name"] != "kenny clark").values]

#Scale the data. We have to do this before we add features that are one-hot encoded,
#like whether or not a player played for a power 5 conference
scaler = StandardScaler()
scaler.fit(pure_df[combine_features + ["rec", "rec_yds", "rec_td"]])

#Scale the data and put into a dataframe
scaled_features = pd.DataFrame(scaler.transform(pure_df[combine_features + ["rec", "rec_yds", "rec_td"]]),\
                               columns=combine_features + ["rec", "rec_yds", "rec_td"],\
                              index = pure_df.index)

#Now insert categorical feature
scaled_features.insert(5, "Power 5", pure_df["Power 5"].values)

#We might want to model how far a player's stats are from the mean stats, so
#we'll add those features to give us the option of using them later
for feat in combine_features:
    scaled_features[feat + "_dist"] = (scaled_features[feat]**2)**0.5

#Before we oversample, we need to hold out a dataset
hold_out_features = scaled_features.sample(frac=0.25, random_state=90, replace = False)
hold_out_targets = pure_df.loc[hold_out_features.index]["DVOA"]
work_features = scaled_features.drop(hold_out_features.index)
work_targets = pure_df.drop(hold_out_features.index)
work_targets = work_targets["DVOA"]

print(len(hold_out_targets), len(work_targets))
print(sum(hold_out_targets > 0), len(hold_out_targets))

#Let's oversample successes since the dataset is unbalanced. Note that this
#needs to be done with replacement.
successes = work_targets[work_targets > 0]
failures = work_targets[work_targets < 0]
oversampled_successes = successes.sample(len(failures), replace = True)

#Save this data as a balanced dataset 
balanced_targets = pd.concat([oversampled_successes, failures])
balanced_features = scaled_features.loc[balanced_targets.index]
print(sum(balanced_targets > 0), len(balanced_targets))
print(len(balanced_targets[balanced_targets > 0])/len(balanced_targets))


In [None]:
#Rather than try to regress on DVOA, we'll lump players into those with
#positive and negative DVOAs. THat's how we'll train our classifier.
#Rather than try to regress on DVOA, we'll lump players into those with
#positive and negative DVOAs. That's how we'll train our classifier.
player_types = (balanced_targets > 0).astype(int).values

#Create a list of features depending on whether or not the user wants to use the
#_dist features described above or not.
dist_setting = "no dist"
#dist_setting = "only dist"
#dist_setting = "both"

raw_fit_features = ['Power 5','Height', 'Weight',
                    '40 Yard Dash', 'Vertical Jump', 'Broad Jump',
                    'Arm Length', 'rec', 'rec_yds', "rec_td"]
dist_fit_features = ['Height_dist', 'Weight_dist',
                     '40 Yard Dash_dist', 'Vertical Jump_dist', 'Broad Jump_dist',
                     'Arm Length_dist']
feature_sets = {
    "both": raw_fit_features + dist_fit_features,
    "no dist": raw_fit_features,
    "only dist": ['Power 5', 'rec', 'rec_yds', "rec_td"] + dist_fit_features,
}
if dist_setting not in feature_sets:
    raise NameError("Fit feature list not defined")
fit_features = feature_sets[dist_setting]

#Now fit. We'll use different train/test splits to get a better grasp on the
#actual validation metrics of the model, and keep track of diagnostic info.
#NOTE(review): balanced_features contains duplicated (oversampled) rows, so
#copies of the same player can land in both train and test; the metrics
#below are probably optimistic. Confirm against the hold-out set.
as_list = []
precision_list = []
good_as_bad = []
bad_as_good = []
total_good = []
thresh = 0.95
total_predicted = []
for i in range(50):
    #Split data into test/train sets (seeded per iteration for reproducibility)
    features_train, features_test, player_types_train, player_types_test = train_test_split(
        balanced_features[fit_features], player_types, test_size=0.33, random_state=i)

    #GBC works best for this data
    clf = GradientBoostingClassifier(n_estimators=350, learning_rate=0.47, random_state=i)
    clf.fit(features_train, player_types_train)

    #Get prediction probs and apply the custom threshold once
    pred_probs = clf.predict_proba(features_test)
    thresholded_preds = predictions_with_threshold(pred_probs, thresh = thresh)

    #Get accuracy, precision, and confusion matrix. sklearn metrics take
    #(y_true, y_pred) in that order; swapping them turns precision into recall.
    as_list.append(accuracy_score(player_types_test, thresholded_preds))
    precision_list.append(precision_score(player_types_test, thresholded_preds))
    #labels=[0, 1] keeps the matrix 2x2 even if a class is absent in a split
    cm = confusion_matrix(player_types_test, thresholded_preds, labels=[0, 1])

    #Get interesting stats from the confusion matrix (rows = true, cols = predicted)
    total_good.append(sum(player_types_test))
    total_predicted.append(thresholded_preds.sum())
    good_as_bad.append(cm[1,0])
    bad_as_good.append(cm[0,1])

#Print out useful metrics
final_preds = predictions_with_threshold(pred_probs, thresh = thresh)
print("Model accuracy:  %0.1f%s %0.1f" % (np.average(as_list)*100, "% +/-", np.std(as_list)*100))
print("Model precision: %0.1f%s %0.1f" % (np.average(precision_list)*100, "% +/-", np.std(precision_list)*100))

print("Number of WRs:                       ", len(player_types_test))
print("Number of good WRs predicted:        ", np.average(total_predicted))
print("True number of good WRs:             ", np.average(total_good))
print("Number of good WRs classified as bad:", np.average(good_as_bad))
print("Number of bad WRs classified as good:", np.average(bad_as_good))

#Plot the confusion matrix of the last split
plot_confusion_matrix(player_types_test, final_preds)


In [None]:
#Let's sort and plot the importances of all the features
#Let's sort and plot the importances of all the features
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

#Readable labels, first in fit_features order, then reordered to match the
#importance-sorted bars so that each bar carries its own feature's name.
plot_labels = [x.replace("rec_td", "Touchdowns").replace("rec_yds", "Yards").replace("rec","Receptions") for x in fit_features ]
sorted_plot_labels = [plot_labels[idx] for idx in indices]

n_features = len(indices)
fig = plt.figure(figsize = [15,10])
ax = fig.add_subplot(111)
ax.bar(range(n_features), importances[indices], align="center")
ax.set_xticks(list(range(n_features)))
ax.set_xticklabels(sorted_plot_labels, rotation = 45, fontsize=20)
plt.yticks(fontsize=20)
ax.xaxis.set_ticks_position("bottom")
plt.title("Classifier Feature Importances", fontsize=20)
plt.ylabel("Normalized Importance", fontsize=20)
plt.tight_layout()
plt.savefig("feature_importances.png")
plt.show()

print("Feature ranking:")

#Name and importance must be looked up with the same sorted index
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%0.2f)" % (rank, fit_features[idx], importances[idx]))



In [None]:
#Now let's make predictions about the most recent draft class. This is what I'll use for the demo.
#Now let's make predictions about the most recent draft class. This is what I'll use for the demo.
stime = time.time()


#Same scraping approach as the historical combine data, restricted to the
#2019 class and additionally recording each player's position.
#NOTE(review): "10yd" is not requested here, unlike the historical scrape.
measurables = ["height", "weight", "wingspan", "arms", "hands", "40yd", "bench", "vertical", "broad", "3cone", "20ss"]
needs_conversion = ["wingspan", "arms", "hands", "vertical", "broad"]

corrections = {
    "r.jay soward": "r. jay soward",
    "johnathan holland":"jonathan holland",
    "terrance toliver":"terrence toliver",
    "odell beckham jr." : "odell beckham, jr.",
    "mike campanaro":"michael campanaro",
    "t.j.  jones" : "t.j. jones",
    "chris  godwin":"chris godwin",
    "dante pettis":"dante pettis "
}

key_change = {
        "40yd":"40 Yard Dash",
        "height":"Height",
        "weight":"Weight",
        "vertical":"Vertical Jump",
        "bench":"Bench Press",
        "broad":"Broad Jump",
        "3cone":"3-Cone Drill",
        "20ss":"20 Yard Shuttle",
        "arms":"Arm Length",
        "hands":"Hand Size",
        "10yd":"10 Yard Split",
        "wingspan":"Wingspan"
}

unknown_combines_dict = {}

for measurable in measurables:
    print("Working on", measurable, time.ctime().split()[-2])
    alt_measurable = key_change[measurable]
    for i in range(1,50):
        try:
            url = "https://www.mockdraftable.com/search?position=ATH&beginYear=2019&endYear=2019&sort=DESC&page="+str(i)+"&measurable=" + measurable
            html = urlopen(url)
            soup = BeautifulSoup(html, 'lxml')

            #Parse the <span> list once per page instead of once per player.
            #The j-th player on the page (1-based) has their position in
            #span number 2*j.
            spans = soup.find_all("span")
            for j, (n, h) in enumerate(zip(soup.find_all("h5"), soup.find_all("figure")), start=1):
                name = n.text.lower()
                pos = spans[2*j].text
                name = corrections.get(name, name)
                if name not in unknown_combines_dict:
                    unknown_combines_dict[name] = {"pos":pos}

                #"?" means the measurement was not taken
                if h.text == "?":
                    continue

                if measurable == "height":
                    unknown_combines_dict[name][alt_measurable] = height_string_to_cm(h.text)
                    continue

                val_string = h.text.replace("*","").replace("lbs","").replace("reps","").replace("s","")
                if measurable in needs_conversion:
                    unknown_combines_dict[name][alt_measurable] = length_to_in(val_string)
                else:
                    unknown_combines_dict[name][alt_measurable] = float(val_string)
        except ValueError:
            #A ValueError while handling a page ends the scan for this measurable
            break


print("Done in %0.2f minutes" %((time.time()-stime)/60))

#Keep an untouched copy; the next cell adds college stats to the dict in place
bkp = deepcopy(unknown_combines_dict)

In [None]:
stime = time.time()
unknown_combines = pd.DataFrame.from_dict(unknown_combines_dict, orient="index")
unknown_combines = unknown_combines[unknown_combines["pos"] == "WR"]
for name in unknown_combines.index.values:
    #Build the sports-reference URL slug from the player's name
    useful_name = "-".join(name.replace(".","").replace("'","").split())
    url = "https://www.sports-reference.com/cfb/players/" + useful_name + "-1.html"
    if name == "felton davis":
        url = "https://www.sports-reference.com/cfb/players/felton-davis-iii-1.html"

    player_stats = unknown_combines_dict[name]
    try:
        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        #Parse the table rows once and reuse them below
        table_rows = soup.find_all('tr')
        player_stats["years_played"] = len(table_rows)-3
        #The second-to-last row is read as the player's most recent season
        last_season_cells = table_rows[-2].find_all('td')
        player_stats['school_name'] = last_season_cells[0].text
        player_stats['position'] = last_season_cells[3].text
        player_stats['conference'] = last_season_cells[1].text
    except Exception as e:
        #Best effort: report which page failed and why, then move on
        print(url, "-", repr(e))
        continue

    #Sum games played over the season rows and remember the last season's year.
    #grad_year is reset per player so a value from the previous player can
    #never leak into this one.
    gp = 0
    grad_year = None
    for row in table_rows[2:-1]:
        if row.find_all('th')[0].text.lower() == "career":
            break
        grad_year = int(row.find_all('th')[0].text.replace("*",""))
        for col in row.find_all('td'):
            if col.get('data-stat') == 'g' and col.text != '':
                gp += int(col.text)
    player_stats['games_played'] = gp
    player_stats['grad_year'] = grad_year

    # We only want the *career* stats for the college player, which are on the
    # last row of the table. The first column of that row is skipped because
    # it is a string that cannot be converted to a float.
    for stat in table_rows[-1].find_all('td')[1:]:
        # Missing stats are empty strings in the table; store them as 0.
        if stat.text == '':
            num = 0
        else:
            num = float(stat.text)

        # "data-stat" holds the column name (passing yards, tackles, ...) and
        # the cell text is its value, so store value under that name.
        player_stats[stat['data-stat']] = num


unknown_combines = pd.DataFrame.from_dict(unknown_combines_dict, orient="index")
unknown_combines = pd.concat([unknown_combines[(unknown_combines["pos"] == 0)], unknown_combines[unknown_combines["pos"] == "WR"]])
#Accept the hyphenated "Pac-12" spelling as well; the un-hyphenated form alone
#would miss it. NOTE(review): keep this list in sync with the one used when
#building the training features.
unknown_combines["Power 5"] = unknown_combines["conference"].isin(["ACC", "Big Ten", "Big 12", "SEC", "Pac-12", "Pac12"]).astype(int)
print("Done in %0.2f minutes" %((time.time()-stime)/60))

In [None]:
#We'll do the same type of filtering/scaling we did for the training data.
#We'll do the same type of filtering/scaling we did for the training data.
model_columns = combine_features + ["rec", "rec_yds", "rec_td"]
unknown_combines_subset = unknown_combines[model_columns]
unknown_combines_mask = ~unknown_combines_subset.isnull().any(axis=1)
pure_unknown_combines = unknown_combines[unknown_combines_mask.values].copy()

#The training data had its jump measurements converted from inches to cm
#before the scaler was fitted, while the scraped values here are in inches.
#Apply the same threshold-based conversion so both go through the scaler in
#the same units.
pure_unknown_combines.loc[pure_unknown_combines["Broad Jump"] < 200, "Broad Jump"] *= 2.54
pure_unknown_combines.loc[pure_unknown_combines["Vertical Jump"] < 50, "Vertical Jump"] *= 2.54

scaled_unknown_combines = pd.DataFrame(scaler.transform(pure_unknown_combines[model_columns]),
                                       columns=model_columns,
                                       index=pure_unknown_combines.index)

#Distance-from-mean features must come from the new players' own scaled
#values, not from the training frame.
for feat in combine_features:
    scaled_unknown_combines[feat + "_dist"] = scaled_unknown_combines[feat].abs()

scaled_unknown_combines.insert(5, "Power 5", pure_unknown_combines["Power 5"].values)



In [None]:
#Now we actually do the prediction and look at the results.
#Score the new draft class with the fitted classifier and list the players
#from most to least likely to succeed.
class_probabilities = clf.predict_proba(scaled_unknown_combines[fit_features])
success_probs = list(class_probabilities[:, 1])
names = pure_unknown_combines.index

#Sorting the (probability, name) pairs in reverse puts the best prospects first
ranked_pairs = sorted(zip(success_probs, names), reverse = True)
pred_list = [[probability, player] for probability, player in ranked_pairs]

for pr, n in pred_list:
    print("['%s', %0.1f]" % (n.title(),pr))
    
