# Beyond Descriptive Analysis
### Tianjiao Yang

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query ``q`` through pandasql, resolving names from this notebook's globals."""
    return sqldf(q, globals())

In [2]:
# Load the athlete results and the noc_regions table from the work directory.
# NOTE(review): error_bad_lines=False makes the parser skip rows it cannot read.
# The recorded output reports line 109836 skipped ("unexpected end of data"),
# so the athlete CSV may be truncated -- confirm the file is complete.
athlete_events = pd.read_csv(
    '/home/jovyan/work/athlete_events.csv',
    error_bad_lines=False,
    engine="python",
)
region = pd.read_csv('/home/jovyan/work/noc_regions.csv')

Skipping line 109836: unexpected end of data


## Exploring the Relationship between Athletes' Physical Conditions and Performance Achievements
This section analyzes the association between athletes' physical attributes (age, gender, height, and weight) and the outcome of medal acquisition, i.e. whether or not an athlete won a medal.

In [3]:
# Subset selection: keep the athlete id, the physical attributes, and the medal column.
subset_query = "select id, sex, age, height, weight, medal from athlete_events;"
subdf = pysqldf(subset_query)

In [4]:
# Keep only rows where sex, age, height, and weight are all present.
# Medal is deliberately left unfiltered, so it may still be null.
complete_rows_query = "select * from subdf where sex is not null and age is not null and height is not null and weight is not null; "
subdf_c = pysqldf(complete_rows_query)

In [5]:
# Check column dtypes and non-null counts of the filtered frame.
subdf_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82253 entries, 0 to 82252
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      82253 non-null  int64  
 1   Sex     82253 non-null  object 
 2   Age     82253 non-null  float64
 3   Height  82253 non-null  float64
 4   Weight  82253 non-null  float64
 5   Medal   11585 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 3.8+ MB


In [6]:
# Encode Sex as an integer feature: female -> 1, male -> 2.
# The encoder's codes are shifted up by 1 to obtain the 1/2 coding.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
sex_codes = label_encoder.fit_transform(subdf_c['Sex'])
subdf_c['Sex_c'] = sex_codes + 1

In [7]:
# Display the frame to inspect the newly added Sex_c column.
subdf_c

Unnamed: 0,ID,Sex,Age,Height,Weight,Medal,Sex_c
0,1,M,24.0,180.0,80.0,,2
1,2,M,23.0,170.0,60.0,,2
2,5,F,21.0,185.0,82.0,,1
3,5,F,21.0,185.0,82.0,,1
4,5,F,25.0,185.0,82.0,,1
...,...,...,...,...,...,...,...
82248,55526,M,24.0,180.0,62.0,,2
82249,55526,M,28.0,180.0,62.0,,2
82250,55527,M,27.0,175.0,61.0,,2
82251,55533,M,29.0,194.0,80.0,,2


In [8]:
# Binary target: 1 when the athlete has any medal recorded, 0 otherwise.
# The query result is a DataFrame, so its single column is selected explicitly
# instead of relying on pandas to coerce a one-column frame on assignment.
subdf_c['Medal_c'] = pysqldf('''
                                select 
                                case when medal is not null then 1
                                     else 0
                                end as medal_c
                                from subdf_c;
                                ''')['medal_c']
 

In [9]:
# Display the frame to inspect the newly added Medal_c column.
subdf_c

Unnamed: 0,ID,Sex,Age,Height,Weight,Medal,Sex_c,Medal_c
0,1,M,24.0,180.0,80.0,,2,0
1,2,M,23.0,170.0,60.0,,2,0
2,5,F,21.0,185.0,82.0,,1,0
3,5,F,21.0,185.0,82.0,,1,0
4,5,F,25.0,185.0,82.0,,1,0
...,...,...,...,...,...,...,...,...
82248,55526,M,24.0,180.0,62.0,,2,0
82249,55526,M,28.0,180.0,62.0,,2,0
82250,55527,M,27.0,175.0,61.0,,2,0
82251,55533,M,29.0,194.0,80.0,,2,0


In [10]:
# Import packages for logistic regression and accuracy analysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [11]:
# Predictors: every column except the id, the raw text columns, and the target.
non_feature_columns = ['ID', 'Sex', 'Medal', 'Medal_c']
X = subdf_c.drop(columns=non_feature_columns)

# Response: the binary medal indicator.
y = subdf_c['Medal_c']

In [12]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

In [13]:
# Fit a logistic regression with default settings on the training split.
# The fit call stays last so the fitted estimator is displayed as the cell output.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Predict medal / no-medal labels for the held-out rows.
y_pred = logreg.predict(X_test)

# Score the predictions with the accuracy_score imported earlier in the notebook.
# Medal winners are a small minority of rows, so accuracy is reported next to
# the majority-class baseline (always predicting the more common label).
test_accuracy = accuracy_score(y_test, y_pred)
medal_share = y_test.mean()
majority_baseline = max(medal_share, 1 - medal_share)
print(f"Test accuracy: {test_accuracy:.4f} (majority-class baseline: {majority_baseline:.4f})")

In [15]:
# Coefficients (weights) of the independent variables in the fitted model.
coefficients = logreg.coef_

# Pair each training column with its coefficient from the first row and print it.
for feature, weight in zip(X_train.columns, coefficients[0]):
    print("Coefficient for", feature, ":", weight)

Coefficient for Age : 0.0072365847637659905
Coefficient for Height : 0.026043306112177218
Coefficient for Weight : 0.011216255297431754
Coefficient for Sex_c : -0.6242313010099496


In [16]:
# Fit a logit model with statsmodels to obtain p-values for each variable.
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so a constant column is added here.
X_train_1 = sm.add_constant(X_train)

# Build the model on the training split, then estimate it.
logreg1 = sm.Logit(endog=y_train, exog=X_train_1)
result = logreg1.fit()

Optimization terminated successfully.
         Current function value: 0.398012
         Iterations 6


In [17]:
# Print the regression table (coefficients, standard errors, z statistics, p-values).
logit_summary = result.summary()
print(logit_summary)

                           Logit Regression Results                           
Dep. Variable:                Medal_c   No. Observations:                65802
Model:                          Logit   Df Residuals:                    65797
Method:                           MLE   Df Model:                            4
Date:                Fri, 28 Jul 2023   Pseudo R-squ.:                 0.01942
Time:                        03:43:38   Log-Likelihood:                -26190.
converged:                       True   LL-Null:                       -26709.
Covariance Type:            nonrobust   LLR p-value:                3.156e-223
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.3546      0.250    -25.450      0.000      -6.844      -5.865
Age            0.0072      0.002      3.459      0.001       0.003       0.011
Height         0.0260      0.002     15.028      0.0

## Conclusion
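1. The age, height, weight, and gender of athletes are jointly significant predictors of medal-winning performances (LLR p-value ≈ 3.2e-223), although the low pseudo R-squared (about 0.019) shows that they explain only a small part of the variation in outcomes.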
2. Age, height, and weight each have a positive coefficient, i.e. they are positively associated with medal-winning outcomes when the other variables are held fixed. This is understandable given that advancing age often accompanies increased strength, skills, and experience. Similarly, height and weight positively impact attributes such as strength, speed, and other physiological responses.
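3. Interestingly, gender displays an inverse relationship with medal-winning success: with gender coded as female = 1 and male = 2, the negative coefficient on Sex_c means that male athletes have lower estimated odds of winning a medal. This finding corroborates our previous analysis, which suggested that female athletes appear to have a higher likelihood of achieving medal-worthy performances compared to their male counterparts.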