# Determine Winning Nation: Deep Learning Neural Network

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from tensorflow.keras.utils import to_categorical

FileNotFoundError: Could not find module 'C:\Users\carly\anaconda3\envs\mlenv\lib\site-packages\scipy\.libs\libbanded5x.65HX7FOLJUAC36SYG7MTJQTRYELCEJQH.gfortran-win_amd64.dll' (or one of its dependencies). Try using the full path with constructor syntax.

In [None]:
# Load the nations dataset from the project's data folder
nations_csv = 'data/nations_final.csv'
df = pd.read_csv(nations_csv)

# Report (rows, columns), then preview the first five records
print(df.shape)
df.head()

In [None]:
# Number of distinct values found in each column of the dataset
df.nunique()

In [None]:
# Tally how many rows each NOC code contributes to the dataset
NOC_count = df['NOC'].value_counts()
print(NOC_count)

# Kernel density estimate of those per-NOC row counts
NOC_count.plot(kind='density')

In [None]:
# Sum every medal column per (NOC, Team) pair, then rank by overall medal haul
medal_columns = ['Total Medals', 'Golds', 'Silvers', 'Bronzes']
medals = (
    df.groupby(['NOC', 'Team'])[medal_columns]
    .sum()
    .sort_values(by="Total Medals", ascending=False)
    .reset_index()
)
medals.head(20)

In [None]:
# Share of all medals (as a percentage, rounded to 3 decimals) won by each row of `medals`
total_medal_count = df['Total Medals'].sum()
medal_share = medals["Total Medals"] / total_medal_count * 100
medals['% Medals Won'] = medal_share.round(3)
top_nations = medals.sort_values(by="% Medals Won", ascending=False)

# Combined share held by the first 20 rows of `medals`
top20_share = sum(medals['% Medals Won'].head(20))
print("Top 20 Medaling Nations: Percentage of All Medals Won")
print(f"{round(top20_share, 3)} %")


top_nations.head(20)

## Top 20 Nations: Count of Medal Type Won

In [None]:
# Gold / silver / bronze counts for the 20 highest-ranked rows of `top_nations`.
# The bars are drawn side by side (grouped) rather than on top of each other:
# when all three series share the same x position, a taller series drawn later
# completely hides a shorter one drawn earlier.
plt.figure(figsize=(10,5))
top20 = top_nations.head(20)
x = top20['NOC'].tolist()
y1 = top20['Golds'].tolist()
y2 = top20['Silvers'].tolist()
y3 = top20['Bronzes'].tolist()

positions = np.arange(len(x))  # one unit-wide slot per nation
bar_width = 0.27               # three bars fit inside each slot with a small gap
plt.bar(positions - bar_width, y1, width=bar_width, color='g', label='Gold')
plt.bar(positions, y2, width=bar_width, color='b', label='Silver')
plt.bar(positions + bar_width, y3, width=bar_width, color='r', label='Bronze')
plt.xlabel("Top 20 Nations")
plt.ylabel('Medal Count')
plt.legend()
plt.title("Total Medals Won by Rank")
# Label each slot with its NOC code
plt.xticks(positions, x, rotation=45)

# NOTE(review): fixed limits clip any bar outside [30, 1300] - confirm this is intended
y_min = 30
y_max = 1300
plt.ylim([y_min, y_max])
plt.show()

In [None]:
# Worst Nations - any nation ranked below the top 20 by % medals won that has
# still won at least one medal historically.
# The cutoff is read from the 20th-ranked row instead of being hard-coded
# (previously the literal 1.326), so it stays correct if the data changes.
top20_cutoff = top_nations["% Medals Won"].head(20).min()
below_top20 = top_nations["% Medals Won"] < top20_cutoff
# Enforce the "has won a medal" condition stated above
has_medal = top_nations["Total Medals"] > 0
worst_nations = top_nations[below_top20 & has_medal]

print("Next 20 Medaling Nations: Percentage of All Medals Won")
print(f"{round(sum(worst_nations['% Medals Won'].head(20)),3)} %")
worst_nations

In [None]:
# Number of distinct NOC codes present in `worst_nations`
print(len(worst_nations['NOC'].unique()))

## Top 40 Nations: Total Medal Counts

In [None]:
# Overlay the total medal counts of the top 20 nations (red) and the next 20
# nations (blue), paired by rank. The x-axis labels name the top-20 nation at
# each rank position.
plt.figure(figsize=(10,5))
x_top = top_nations.head(20)['NOC'].tolist()
y_top = top_nations.head(20)['Total Medals'].tolist()
y_bottom = worst_nations.head(20)['Total Medals'].tolist()

# Plot against x_top, which is defined in this cell; the previous version used
# `x`, a variable that only existed if an earlier plotting cell had been run.
plt.bar(x_top, y_top, color='r')
# Trim the positions to the number of "next" nations available so the call
# does not fail with a shape mismatch when there are fewer than 20 of them.
plt.bar(x_top[:len(y_bottom)], y_bottom, color='b')
plt.xlabel('Top 20 Nations', fontsize=15)
plt.ylabel('Total Medal Count')
plt.title('Top 20 Nations vs Next 20 Nations : Total Medal Count')
plt.xticks(x_top, visible=True, rotation=45)
plt.legend(['Top 20', 'Next 20'])
# NOTE(review): assumes the images/ directory already exists - confirm
plt.savefig("images/top40.png")
plt.show()

In [None]:
# Distinct NOC codes found in the 20 highest-ranked rows, in rank order
keep = list(top_nations.head(20)['NOC'].unique())
keep

In [None]:
# Distinct NOC codes that are NOT in the `keep` list.
# `replace_noc` was not defined in any visible cell, so this line raised a
# NameError on a fresh kernel; it is derived here from `df` and `keep`.
replace_noc = df.loc[~df['NOC'].isin(keep), 'NOC'].unique()
print(len(replace_noc))

In [None]:
# Reduce the number of unique nation classifiers to 21 - the top 20 medaling nations plus "Other"
keep = list(top_nations.head(20)['NOC'].unique())

# Work on a copy so the original dataframe is left untouched
df2 = df.copy()

# Every NOC that is not in the keep list becomes "Other"
df2['NOC'] = df2['NOC'].where(df2['NOC'].isin(keep), "Other")

# Check the binning: number of distinct labels, then the row count per label
noc_counts = df2['NOC'].value_counts()
print(len(noc_counts))
noc_counts

In [None]:
# Double check the dataframe by displaying five randomly sampled rows
# (no random_state is set, so the rows shown differ between runs)
df2.sample(n=5)

In [None]:
# Columns left out of the feature matrix
drop_features = ['NOC','Team','Games', 'Year']

# Features: every remaining column as a NumPy array; target: the NOC label
X = df2.drop(columns=drop_features).values
y = df2['NOC'].values
print(X.shape, y.shape)

In [None]:
# Split the data into training and testing groups, stratified on the target with a fixed seed
split = train_test_split(X, y, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

In [None]:
# Create a StandardScaler instance and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale both splits with the statistics learned from the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step (1): Label-encode the nation labels as integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step (2): Convert encoded labels using to_categorical()
# The one-hot width is taken from the fitted encoder instead of a hard-coded
# 21, so it always matches the number of labels actually present.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(encoded_y_train, num_classes)
y_test_cat = to_categorical(encoded_y_test, num_classes)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 50
hidden_nodes_layer2 = 25
# One output unit per one-hot column of the target, rather than a hard-coded
# 21, so the output layer always matches the labels it is trained against.
number_output_classes = y_train_cat.shape[1]

# Define the model
nn = Sequential()

# First hidden layer
nn.add(Dense(units=hidden_nodes_layer1, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer - softmax = generalization of the logistic function to multiple dimensions.
nn.add(Dense(units=number_output_classes, activation="softmax"))


In [None]:
# Summarize the model: print its layers and parameter counts
nn.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train on the scaled training data for 100 epochs, shuffling the samples
nn.fit(
    X_train_scaled,
    y_train_cat,
    epochs=100,
    shuffle=True,
    verbose=2,
)

# Score the fitted model on the same data it was trained on
train_scores = nn.evaluate(X_train_scaled, y_train_cat, verbose=2)
model_loss, model_accuracy = train_scores
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Score the model on the held-out test split
test_scores = nn.evaluate(X_test_scaled, y_test_cat, verbose=2)
model_loss, model_accuracy = test_scores
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Pick the highest-scoring class for each test row and map it back to its label
class_scores = nn.predict(X_test_scaled)
prediction = np.argmax(class_scores, axis=-1)
predicted_labels = label_encoder.inverse_transform(prediction)

# Show the first ten predictions next to the first ten true labels
print(f"Predicted Labels: {predicted_labels[:10]}")
print(f"Actual Labels: {list(y_test[:10])}")

# Only Predict Top 20 Nations - Filter out "Other"

In [None]:
# Preview the first rows of df2
df2.head()

In [None]:
# Keep only the rows whose NOC label is not the "Other" bucket
is_named_nation = df2['NOC'] != "Other"
df3 = df2[is_named_nation]
print(df3.shape)
df3.head()

In [None]:
# Distinct NOC labels remaining in df3
df3['NOC'].unique()

In [None]:
# Columns left out of the feature matrix
drop_features = ['NOC','Team','Games', 'Year']

# Features: every remaining column as a NumPy array; target: the NOC label
X = df3.drop(columns=drop_features).values
y = df3['NOC'].values
print(X.shape, y.shape)


In [None]:
# Split data into training and testing groups (stratified on the target, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

# Create a StandardScaler instance and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale both splits with the statistics learned from the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step (1): Label-encode the nation labels as integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step (2): Convert encoded labels using to_categorical()
# The class count is read from the fitted encoder instead of a hard-coded 20,
# and the same value sizes the output layer below, so the one-hot targets and
# the network output always agree.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(encoded_y_train, num_classes)
y_test_cat = to_categorical(encoded_y_test, num_classes)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 50
hidden_nodes_layer2 = 25

# Define the model
# (Sequential and Dense are imported in the earlier model-definition cell)
nn = Sequential()

# First hidden layer
nn.add(Dense(units=hidden_nodes_layer1, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer - softmax = generalization of the logistic function to multiple dimensions.
nn.add(Dense(units=num_classes, activation="softmax"))

# Compile the model
nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model to the training data
nn.fit(X_train_scaled, y_train_cat, epochs=100, shuffle=True, verbose=2)


In [None]:
# Score the model on the held-out test split
test_scores = nn.evaluate(X_test_scaled, y_test_cat, verbose=2)
model_loss, model_accuracy = test_scores
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Pick the highest-scoring class for each test row and map it back to its label
class_scores = nn.predict(X_test_scaled)
prediction = np.argmax(class_scores, axis=-1)
predicted_labels = label_encoder.inverse_transform(prediction)

# Show the first ten predictions next to the first ten true labels
print(f"Predicted Labels: {predicted_labels[:10]}")
print(f"Actual Labels: {list(y_test[:10])}")