# Vanessa Williams
# Milestone 2

### Part 1: Replace Headers

In [142]:
import pandas as pd

# Location of the raw 2019 scoring-offense export
file_path = '2019_Scoring_Offense.csv'

# Abbreviated source headers mapped to their descriptive replacements
header_map = {'Rk': 'Rank', 'Tm': 'Teams'}

# Read the CSV and apply the header replacements in a single chain
data = pd.read_csv(file_path).rename(columns=header_map)

# Print the first rows so the renamed headers can be checked by eye
print(data.head())

   Rank                 Teams     G  RshTD  RecTD  PR TD  KR TD  FblTD  IntTD  \
0   1.0  New England Patriots  16.0   17.0   25.0    NaN    NaN    2.0    3.0   
1   2.0         Buffalo Bills  16.0   13.0   21.0    NaN    1.0    NaN    NaN   
2   3.0      Baltimore Ravens  16.0   21.0   37.0    NaN    NaN    4.0    2.0   
3   4.0         Chicago Bears  16.0    8.0   20.0    NaN    1.0    NaN    1.0   
4   5.0     Minnesota Vikings  16.0   19.0   26.0    NaN    NaN    1.0    1.0   

   OthTD  ...  2PM  2PA  D2P   XPM   XPA   FGM   FGA  Sfty    Pts  Pts/G  
0    2.0  ...  2.0  4.0  NaN  39.0  45.0  27.0  34.0   1.0  420.0   26.3  
1    NaN  ...  3.0  3.0  NaN  30.0  32.0  22.0  28.0   1.0  314.0   19.6  
2    NaN  ...  2.0  5.0  NaN  57.0  59.0  28.0  29.0   1.0  531.0   33.2  
3    NaN  ...  1.0  1.0  NaN  27.0  29.0  23.0  28.0   1.0  280.0   17.5  
4    NaN  ...  1.0  3.0  NaN  40.0  44.0  27.0  29.0   1.0  407.0   25.4  

[5 rows x 21 columns]


### Part 2: Format data into a more readable format

In [148]:
# Missing entries become 0 in every column; rebind the name instead of mutating in place
data = data.fillna(0)

# Show the first rows with the gaps filled
data.head()

Unnamed: 0,Rank,Teams,G,RshTD,RecTD,PR TD,KR TD,FblTD,IntTD,OthTD,...,2PM,2PA,D2P,XPM,XPA,FGM,FGA,Sfty,Pts,Pts/G
0,1.0,New England Patriots,16.0,17.0,25.0,0.0,0.0,2.0,3.0,2.0,...,2.0,4.0,0.0,39.0,45.0,27.0,34.0,1.0,420.0,26.3
1,2.0,Buffalo Bills,16.0,13.0,21.0,0.0,1.0,0.0,0.0,0.0,...,3.0,3.0,0.0,30.0,32.0,22.0,28.0,1.0,314.0,19.6
2,3.0,Baltimore Ravens,16.0,21.0,37.0,0.0,0.0,4.0,2.0,0.0,...,2.0,5.0,0.0,57.0,59.0,28.0,29.0,1.0,531.0,33.2
3,4.0,Chicago Bears,16.0,8.0,20.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,27.0,29.0,23.0,28.0,1.0,280.0,17.5
4,5.0,Minnesota Vikings,16.0,19.0,26.0,0.0,0.0,1.0,1.0,0.0,...,1.0,3.0,0.0,40.0,44.0,27.0,29.0,1.0,407.0,25.4


### Part 3: Identify outliers and bad data

In [152]:
def detect_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value falls outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect; it is not modified.
    column : str
        Label of the numeric column to test.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule; larger values flag fewer rows.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) lying strictly below the
        lower fence or strictly above the upper fence. Empty if nothing qualifies.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Comparisons are strict, so values sitting exactly on a fence are kept as inliers.
    return df[(values < lower_bound) | (values > upper_bound)]

# Select only numerical columns for outlier detection
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Run the IQR check on every numeric column; keep only the columns that have outliers
outliers_dict = {}
for column in numerical_columns:
    outliers = detect_outliers_iqr(data, column)
    if not outliers.empty:
        outliers_dict[column] = outliers

# Display the outliers found
for column, outliers in outliers_dict.items():
    print(f"Outliers in {column}:")
    print(outliers)
    print("\n")

# The table ends with summary rows rather than teams. 'Avg Team' is one of them
# (it appears alongside 'League Total' and 'Avg Tm/G' in the outlier report), so it
# must be excluded too or it leaks into the cleaned export.
summary_rows = ['Avg Team', 'League Total', 'Avg Tm/G']
data_cleaned = data[~data['Teams'].isin(summary_rows)]

# Save the cleaned data
data_cleaned.to_csv('updated_2019_Scoring_Offense.csv', index=False)


Outliers in G:
    Rank         Teams    G   RshTD   RecTD  PR TD  KR TD  FblTD  IntTD  \
32   0.0      Avg Team  0.0   14.00   24.90   0.20   0.20   1.10   1.10   
33   0.0  League Total  0.0  447.00  797.00   7.00   7.00  34.00  35.00   
34   0.0      Avg Tm/G  0.0    0.87    1.56   0.01   0.01   0.07   0.07   

    OthTD  ...    2PM     2PA  D2P      XPM      XPA     FGM     FGA   Sfty  \
32   0.20  ...   1.70    3.50  0.1    35.50    37.80   25.10   30.70   0.50   
33   5.00  ...  54.00  113.00  2.0  1136.00  1210.00  802.00  983.00  17.00   
34   0.01  ...   0.11    0.22  0.0     2.22     2.36    1.57    1.92   0.03   

        Pts  Pts/G  
32    365.0   22.8  
33  11680.0   22.8  
34     22.8    0.0  

[3 rows x 21 columns]


Outliers in RshTD:
    Rank         Teams    G  RshTD  RecTD  PR TD  KR TD  FblTD  IntTD  OthTD  \
33   0.0  League Total  0.0  447.0  797.0    7.0    7.0   34.0   35.0    5.0   

    ...   2PM    2PA  D2P     XPM     XPA    FGM    FGA  Sfty      Pts  Pts/G 

### Part 4: Find duplicates

In [154]:
# Flag every row that exactly repeats an earlier row, then pull those rows out
is_repeat = data.duplicated()
duplicates = data.loc[is_repeat]

# Report whatever was found (an empty frame means no duplicates)
print("Duplicate rows:")
print(duplicates)

Duplicate rows:
Empty DataFrame
Columns: [Rank, Teams, G, RshTD, RecTD, PR TD, KR TD, FblTD, IntTD, OthTD, AllTD, 2PM, 2PA, D2P, XPM, XPA, FGM, FGA, Sfty, Pts, Pts/G]
Index: []

[0 rows x 21 columns]


### Part 5: Fix casing or inconsistent values

In [156]:
def _tidy_text(column):
    """Strip surrounding whitespace and title-case one text column.

    Non-text columns are returned unchanged. After title-casing, any capital
    letter that directly follows a digit is lowered again, because str.title()
    treats a digit as a word boundary and would turn a name such as
    "San Francisco 49ers" into "San Francisco 49Ers".
    """
    # Accept both classic object columns and pandas' dedicated string dtype.
    if not (column.dtype == "object" or pd.api.types.is_string_dtype(column)):
        return column
    titled = column.str.strip().str.title()
    return titled.str.replace(
        r"(?<=\d)[A-Z]", lambda match: match.group(0).lower(), regex=True
    )


# Strip whitespace and apply consistent casing to every string column
data = data.apply(_tidy_text)

# Display the updated dataframe to check changes
data.head()

Unnamed: 0,Rank,Teams,G,RshTD,RecTD,PR TD,KR TD,FblTD,IntTD,OthTD,...,2PM,2PA,D2P,XPM,XPA,FGM,FGA,Sfty,Pts,Pts/G
0,1.0,New England Patriots,16.0,17.0,25.0,0.0,0.0,2.0,3.0,2.0,...,2.0,4.0,0.0,39.0,45.0,27.0,34.0,1.0,420.0,26.3
1,2.0,Buffalo Bills,16.0,13.0,21.0,0.0,1.0,0.0,0.0,0.0,...,3.0,3.0,0.0,30.0,32.0,22.0,28.0,1.0,314.0,19.6
2,3.0,Baltimore Ravens,16.0,21.0,37.0,0.0,0.0,4.0,2.0,0.0,...,2.0,5.0,0.0,57.0,59.0,28.0,29.0,1.0,531.0,33.2
3,4.0,Chicago Bears,16.0,8.0,20.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,27.0,29.0,23.0,28.0,1.0,280.0,17.5
4,5.0,Minnesota Vikings,16.0,19.0,26.0,0.0,0.0,1.0,1.0,0.0,...,1.0,3.0,0.0,40.0,44.0,27.0,29.0,1.0,407.0,25.4


In [160]:
# Display the dataframe's current column labels
data.columns

Index(['Rank', 'Teams', 'G', 'RshTD', 'RecTD', 'PR TD', 'KR TD', 'FblTD',
       'IntTD', 'OthTD', 'AllTD', '2PM', '2PA', 'D2P', 'XPM', 'XPA', 'FGM',
       'FGA', 'Sfty', 'Pts', 'Pts/G'],
      dtype='object')

### Part 7: Add data

In [169]:
# Touchdown categories that are added together for each row
td_columns = ['RshTD', 'RecTD', 'PR TD', 'KR TD', 'FblTD', 'IntTD', 'OthTD']

# Row-wise sum of the categories becomes the new TotalTD column
# NOTE(review): the table already has an 'AllTD' column; presumably TotalTD should match it - confirm
data['TotalTD'] = data[td_columns].sum(axis=1)
print("\nPart 7: Add a new column (Total Touchdowns)\n", data[['Teams', 'TotalTD']].head())


Part 7: Add a new column (Total Touchdowns)
                   Teams  TotalTD
0  New England Patriots     49.0
1         Buffalo Bills     35.0
2      Baltimore Ravens     64.0
3         Chicago Bears     30.0
4     Minnesota Vikings     47.0
