In [None]:
import pandas as pd

# Let wide DataFrames render up to 500 columns instead of being truncated
pd.options.display.max_columns = 500

## 1. Data Analysis (25 points)
- Exploratory Analysis (10 points): Derive conclusions and determine the dataset's characteristics, including missing values, outliers, and statistical summaries.
- Visualizations (5 points): Provide insightful visualizations to understand correlations and patterns.
- Data Quality (10 points): Identify and handle missing values, anomalies, and duplicate entries.

In [None]:
# Load the dataset, using the first CSV column as the row index.
# Source: https://market.oceanprotocol.com/asset/did:op:6fd1ff2e2d59b89c610b2bf72f3f19b3fccb79864b019b63041a19b9010c9a5c
nfl_data = pd.read_csv(
    filepath_or_buffer='../data/nflpy.csv',
    index_col=0,
)

# Print (rows, columns), then preview the first five rows
print(nfl_data.shape)
nfl_data.head(5)

### Exploratory Analysis

In [None]:
# Missing-values analysis: which columns contain nulls, and how much of each column is affected.

# Null count per column, largest first
missing_values = nfl_data.isna().sum().sort_values(ascending=False)

# The same counts expressed as a percentage of all rows, largest first
missing_percentage = (
    nfl_data.isna().sum()
    .div(nfl_data.shape[0])
    .mul(100)
    .sort_values(ascending=False)
)

# Place count and percentage side by side (the two Series are aligned on column name)
missing_info = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percentage': missing_percentage,
    }
)

# Show only the columns that actually have missing data
missing_info.loc[missing_info['Missing Values'].gt(0)]


In [None]:
# Preview the first rows whose draft_number is missing
nfl_data.loc[nfl_data['draft_number'].isna()].head()

In [None]:
# Show every row belonging to player_id 00-0019596
nfl_data.loc[nfl_data['player_id'].eq('00-0019596')]

In [None]:
def _fill_from_previous_season(frame, column):
    """Fill gaps in a per-season counter from the player's last observed value.

    For every row where ``column`` is missing, the estimate is the player's
    most recent observed value plus the number of seasons elapsed since that
    observation. Rows that already hold a value are returned unchanged, and
    rows with no earlier observation for the same player stay missing.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'player_id', 'season' and ``column``, and be sorted by
        player_id then season.
    column : str
        Name of the column to fill (e.g. 'age' or 'years_exp').

    Returns
    -------
    pandas.Series
        ``frame[column]`` with gaps filled where a previous observation exists.

    Notes
    -----
    Assumes 'season' is numeric so that season differences are meaningful
    -- TODO confirm against the data.
    """
    observed = frame[column].notna()
    # Most recent observed value, carried forward within each player
    last_value = frame.groupby('player_id')[column].ffill()
    # Season in which that value was observed, carried forward the same way
    last_season = frame['season'].where(observed).groupby(frame['player_id']).ffill()
    estimate = last_value + (frame['season'] - last_season)
    # fillna only touches the gaps, so genuinely observed values are never altered
    return frame[column].fillna(estimate)


# Re-read the raw file so this cell starts from unmodified data every time it is run
nfl_data = pd.read_csv('../data/nflpy.csv', index_col=0)

# Sort the DataFrame by player_id and season (the forward fills below rely on this order)
nfl_data.sort_values(['player_id', 'season'], inplace=True)

# Fill missing draft_number values with the max draft_number seen for the same player_id
nfl_data['draft_number'] = nfl_data['draft_number'].fillna(
    nfl_data.groupby('player_id')['draft_number'].transform('max')
)

# Fill missing years_exp and age from the player's previous observed season,
# advancing the value by the number of seasons elapsed since that observation
nfl_data['years_exp'] = _fill_from_previous_season(nfl_data, 'years_exp')
nfl_data['age'] = _fill_from_previous_season(nfl_data, 'age')

# Fill missing values for name and bmi for each player_id with the previous value
nfl_data['bmi'] = nfl_data.groupby('player_id')['bmi'].ffill()
nfl_data['name'] = nfl_data.groupby('player_id')['name'].ffill()

# Look at player_id is 00-0019596 again
nfl_data[nfl_data['player_id'] == '00-0019596']

In [None]:
# Rows still missing years_exp, age, or draft_number are dropped because the
# correct value is unknown; rows without a team are dropped as well.
required_columns = ['years_exp', 'age', 'draft_number', 'team']
nfl_data.dropna(subset=required_columns, inplace=True)

In [None]:
# Re-run missing values analysis
missing_values = nfl_data.isnull().sum().sort_values(ascending=False)
missing_percentage = (nfl_data.isnull().sum() / nfl_data.shape[0] * 100).sort_values(ascending=False)
missing_info = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
missing_info[missing_info['Missing Values'] > 0]

In [None]:
# Preview the fantasy_points_ppr column, kept as a one-column DataFrame
nfl_data.loc[:, ["fantasy_points_ppr"]].head()

In [None]:
# Statistical Summaries: Generate summary statistics for numerical variables.


In [None]:
# Outliers Analysis: Look for any obvious outliers in key metrics.


In [None]:
# Data Distribution: Visualize the distribution of key variables.


## 2. Feature Engineering (25 points)
- Feature Selection (10 points): Identify and justify the inclusion or exclusion of features for your model
- Feature Transformation (10 points): Apply transformations and encoding to optimize model performance, and explain your process
- Feature Creation (5 points): Create new metric(s) that can be used as an additional feature to enhance predictive accuracy.

## 3. Model Build (15 points)
- Model Selection (5 points): Justify your choice of algorithms and methodologies, explaining why you chose your model.
- Model Training (10 points): Detail the training process, including hyperparameter tuning and cross-validation. You may choose to simply submit screenshots of your notebook. However, those accompanied by written explanations will score higher.

## 4. Model Performance (25 points)
- Evaluation (10 points): Evaluate the model using metrics like RMSE, Mean absolute error (MAE), R-squared, or equivalent on a validation set.
- Interpretability (10 points): Explain the model's decisions and significant features.
- Reproducibility (5 points): Ensure that the model and preprocessing steps are reproducible.