In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('new_data/clean-data.csv')

data.head()

Unnamed: 0,playerID,year,stint,tmID,GP,oRebounds,dRebounds,dq,PostMinutes,PostPoints,...,defensive_discipline,mpg,pos,college,playoff,confID,playoff_progression,height,weight,award_count
0,0,2,0,555,26,1.65,5.04,0.08,0.0,0.0,...,11.92,32.54,2,575,0,701,0,74.0,169,0
1,0,3,0,555,27,1.67,3.74,0.0,0.0,0.0,...,12.22,29.81,2,575,0,701,0,74.0,169,0
2,0,4,0,555,30,1.47,3.23,0.0,23.0,7.67,...,11.26,26.4,2,575,1,701,1,74.0,169,0
3,0,5,0,555,22,0.77,2.59,0.0,33.5,10.0,...,7.72,21.0,2,575,1,701,1,74.0,169,0
4,0,6,0,555,31,0.94,2.52,0.0,0.0,0.0,...,10.7,25.06,2,575,0,701,0,74.0,169,0


In [3]:
with open('stats.txt', 'w') as file:
    for col in data.columns:
        if type(data[col][0]) == str: 
            continue
        file.write("-"*50 + "\n")
        file.write(f"Statistics of {col}\n")
        file.write("-"*50 + "\n")
        # data[col].plot.box();  # You can enable this if you want to create a plot

        q3, q1 = np.percentile(data[col], [75 ,25])

        file.write(f"Min: {data[col].min()}\n")
        file.write(f"Max: {data[col].max()}\n")
        file.write(f"Mean: {data[col].mean()}\n")
        file.write(f"Median: {data[col].median()}\n")
        file.write(f"Standard deviation: {data[col].std()}\n")
        file.write(f"Skewness: {data[col].skew()}\n")
        file.write(f"Variance: {data[col].var()}\n")
        file.write(f"Kurtosis: {data[col].kurtosis()}\n")
        file.write(f"Interquartile Range: {q3 - q1} | Q1: {q1} | Q3: {q3}\n")

In [4]:
outliers_std = {}
all_outliers = []  # Array to store all outlier rows

#columns to skip
not_skip_cols = ['height', 'weight']

for col in data.columns:
    if type(data[col][0]) == str or col not in not_skip_cols: 
        continue
    data_mean, data_std = data[col].mean(), data[col].std()
    # identify outliers
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    outliers = pd.concat([data.loc[data[col] < lower], data.loc[data[col] > upper]])
    outliers_std[col] = outliers
    all_outliers.extend(outliers.values.tolist())  # Add outliers to the array

for key in outliers_std.keys():
    if len(outliers_std[key]) > 0:
        print("-"*50)
        print(f"Outliers of {key} - mean: {data[key].mean()} - std: {data[key].std()}")
        print("-"*50)
        print(outliers_std[key][key])

# Printing all outlier rows
print("All outlier rows:")
for outlier in all_outliers:
    print(outlier)

--------------------------------------------------
Outliers of height - mean: 71.7361407249467 - std: 5.523399419866893
--------------------------------------------------
417    9.0
418    9.0
419    9.0
420    9.0
421    9.0
422    9.0
423    9.0
424    9.0
425    9.0
Name: height, dtype: float64
--------------------------------------------------
Outliers of weight - mean: 166.99786780383795 - std: 24.62425894894399
--------------------------------------------------
109       0
362       0
552       0
1070      0
1406      0
1407      0
1500      0
1223    250
1721    253
Name: weight, dtype: int64
All outlier rows:
[132.0, 1.0, 0.0, 557.0, 32.0, 0.91, 4.56, 0.03, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.8, 0.44, 0.14, 1.0, 0.0, 0.0, 0.0, 0.0, 0.9, 1.04, 9.19, 5.47, 1.59, 0.56, 3.0, -214.19, 13.68, 81.19999999999999, 12.24, 24.22, 3.0, 582.0, 0.0, 701.0, 0.0, 9.0, 223.0, 0.0]
[132.0, 2.0, 0.0, 557.0, 32.0, 0.91, 6.69, 0.03, 34.5, 14.0, 0.5, 6.5, 7.0, 1.5, 0.5, 3.5

In [5]:
outliers_ids_std = set([val for key in outliers_std.keys() for val in outliers_std[key]['playerID']])
outlier_ratio = len(outliers_ids_std) / len(data)
outlier_ratio

0.004797441364605544

In [6]:
outliers_z = {}
for col in data.columns:
    if type(data[col][0]) == str or col not in not_skip_cols: 
        continue
    z_val = (data[col] - data[col].mean()) / data[col].std()
    outliers_z[col] = data.loc[(z_val < -4) | (z_val > 4)]

for key in outliers_z.keys():
    if (len(outliers_z[key]) > 0):
        print("-"*50)
        print(f"Outliers of {key} - mean: {data[key].mean()} - std: {data[key].std()}")
        print("-"*50)
        print(outliers_z[key][key])

In [None]:
outliers_ids_z = set([val for key in outliers_z.keys() for val in outliers_z[key]['playerID']])
outlier_ratio = len(outliers_ids_z) / len(data)
outlier_ratio

0.0037313432835820895

In [None]:
outliers_ids = outliers_ids_std.union(outliers_ids_z)
print(f"Total outliers: {len(outliers_ids)}")
print(f"Total outliers ratio: {len(outliers_ids) / len(data)}")
print(outliers_ids)
#create a new df with outliers removed
manual_outlier_df = data[~data['playerID'].isin(outliers_ids)]

manual_outlier_df.duplicated(subset=['playerID', 'year', 'stint']).sum()

Total outliers: 9
Total outliers ratio: 0.004797441364605544
{'parisco01w', 'millebr01w', 'walkema01w', 'berezva01w', 'sanniol01w', 'davista02w', 'givench01w', 'dydekma01w', 'smithty01w'}


0

In [None]:
manual_outlier_df.to_csv('new_data/clean-data_without_outliers.csv', index=False)
# manual_outlier_df.to_csv('output/loan_dev_transformed_without_outliers.csv', index=False)