- Subsetting and Sorting Data
- Grouping Data
- Data Types
- Handling Missing Data
- Duplicates
- Outliers

In [None]:
# loading the package
import pandas as pd

In [None]:
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# where this exact file exists; consider a path relative to the notebook.
file = r"C:\Users\Gideon\Downloads\afcon-2023-final-squads-list.csv"

In [None]:
# load the data
df = pd.read_csv(file)

In [None]:
df.head(30)

- Subsetting and Sorting Data

In [None]:
# Subsetting picks out part of the DataFrame, by rows and/or by columns.
# Tools for it: .iloc (by position), .loc (by label), dot access, square brackets.

# Pattern: df.iloc[row_positions, column_positions]
# Rows at positions 0..26 (the end of a positional slice is exclusive), every column.
df.iloc[:27, :]

In [None]:
df.iloc[0:27, [1, 4, 5]]

In [None]:
# .loc slices by index label and INCLUDES the end label, so with the default
# integer index this returns rows 0..27 (28 rows) — one more than df.iloc[0:27].
df.loc[0:27, ['team', 'club_country']]

In [None]:
print(df.shape)
df.info()

In [None]:
df.describe(include='all')

In [None]:
from datetime import datetime

# Parse the date strings (e.g. 1994-07-25); values that cannot be parsed
# become NaT instead of raising.
df['birthdate'] = pd.to_datetime(df['birthdate'], errors='coerce')

# Age in completed calendar years as of today. Dividing the day count by 365
# ignores leap days and can report a player as one year older shortly before
# their birthday, so compare the calendar fields instead.
today = datetime.now()
birth = df['birthdate'].dt
birthday_pending = (birth.month > today.month) | (
    (birth.month == today.month) & (birth.day > today.day)
)
# Rows with a missing birthdate (NaT) end up with a missing age (NaN).
df['age'] = today.year - birth.year - birthday_pending.astype(int)

In [None]:
df.head()

In [None]:
# Select the defenders: the boolean mask keeps the rows whose position is 'DF'.
# This displays the matching rows; wrap the expression in len(...) to get the count.
df[df.position == 'DF']

- Grouping Data

In [None]:
# Aggregate Data, Apply Functions, Data Exploration
# groupby

In [None]:
df.groupby('team')['age'].max().sort_values(ascending = False)

In [None]:
df.groupby('team')['age'].min().sort_values(ascending = True)

In [None]:
# Oldest player(s). Compare against the computed maximum rather than a
# hard-coded age: `age` is derived from today's date, so a literal goes stale.
df[df.age == df.age.max()]

In [None]:
df.age.min()

In [None]:
df[df.age == df.age.min()]

- Data Types

In [None]:
df.info()

In [None]:
# handling missing values

# isna() isnull() notnull()

df.isna().sum()

In [None]:
# Players without a club are labelled as free agents.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# df.club operates on a Series pulled out of the frame (chained assignment),
# which pandas warns about and which does not update df under Copy-on-Write.
df['club'] = df['club'].fillna('Free Agents')

After executing this line of code, any missing values in the `club` column of the DataFrame `df` are replaced with 'Free Agents', and the change is applied directly to `df`.

In [None]:
df.isna().sum()

In [None]:
# Same treatment for the club's country. Assign the filled column back instead
# of using inplace=True on df.club_country: the in-place form is a chained
# assignment that pandas warns about and that does not update df under
# Copy-on-Write.
df['club_country'] = df['club_country'].fillna('No Clubs')

In [None]:
df.isnull().sum()

In [1]:
# duplicate values

# duplicated() flags every repeat of a jersey_name after its first occurrence
# (default keep='first'), so the first player with each name is not listed.
df[df.duplicated(subset='jersey_name')]

NameError: name 'df' is not defined

In [None]:
df[df.duplicated(subset='birthdate')]

In [None]:
df[df.birthdate == '1998-03-13']

In [None]:
# Remove leading and trailing whitespace from the club names.
df['club'] = df['club'].str.strip()

In [None]:
df[df.birthdate == '1998-03-13']

In [None]:
df.age.describe()

- Variance: the average of the squared differences from the mean. It gives an idea of how much the data values differ from the mean. It is calculated as follows:

  - Calculate the mean (average) of the dataset.
  - Subtract the mean from each data point and square the result.
  - Find the average of those squared differences.

In [None]:
age_list = df.age.tolist()

In [None]:
df.age.mean()

In [None]:
age_list

In [None]:
import numpy as np

# Nine identical ages: with no spread around the mean the variance is zero.
age_no_var = [29] * 9
data = np.array([age_no_var])
no_var = data.var()
print(no_var)

In [None]:
# Population variance (divides by N, i.e. ddof=0) of all player ages.
# nanvar skips missing ages (players whose birthdate could not be parsed);
# plain np.var returns nan as soon as a single age is missing.
data = np.array(age_list, dtype=float)
var = np.nanvar(data)
print(var)

- What is standard deviation?

    A measure of how dispersed the data is in relation to the mean
    - Step 1: Find the mean.
    - Step 2: For each data point, find the square of its distance to the mean.
    - Step 3: Sum the values from Step 2.
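    - Step 4: Divide by the number of data points (this gives the population variance; pandas' `std()` divides by n − 1 instead, the sample version, so its result is slightly larger).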
    - Step 5: Take the square root
    

In [None]:
df.age.std()

In [None]:
df.age.hist(bins=10)

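For normally distributed data (the empirical rule), the share of values that fall within:

-  1 standard deviation of the mean: about 68 percent.
-  2 standard deviations of the mean: about 95 percent.
-  3 standard deviations of the mean: about 99.7 percent.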

In [None]:
# outliers

# Mean and standard deviation of age, each rounded to 2 decimals.
# pandas' std() is the sample standard deviation (ddof=1).
# The rounded values are reused for the bounds computed in the cells below.
mean = round(df.age.mean(), 2)
std = round(df.age.std(), 2)

In [None]:
print(mean)
print(std)

In [None]:
# (upper, lower) bounds of the band one standard deviation either side of the mean
one_std = (mean+std, mean-std)

In [None]:
one_std

In [None]:
# (upper, lower) bounds of the band two standard deviations either side of the mean
two_std = (mean+(std*2), mean-(std*2))

In [None]:
two_std

In [None]:
# (upper, lower) bounds of the band three standard deviations either side of the mean
three_std = (mean+(std*3), mean-(std*3))

In [None]:
three_std

In [None]:
def remove_outliers(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value lies within
    ``threshold`` standard deviations of the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to screen.
    threshold : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The rows inside ``[mean - threshold * std, mean + threshold * std]``
        (both bounds inclusive), keeping their original index labels. Rows
        where ``column`` is missing are dropped as well, because NaN fails
        the range check.
    """
    mean = df[column].mean()
    std_dev = df[column].std()  # sample standard deviation (ddof=1)

    # With fewer than two non-missing values the standard deviation is NaN and
    # no bounds exist. Comparing against NaN would silently discard every row;
    # nothing can be called an outlier here, so keep all non-missing rows.
    if pd.isna(std_dev):
        return df[df[column].notna()]

    lower_bound = mean - threshold * std_dev
    upper_bound = mean + threshold * std_dev

    # keep only the rows inside the band (both ends inclusive)
    return df[df[column].between(lower_bound, upper_bound)]


In [None]:
# Keep the players whose age is within 2 standard deviations of the mean
# (overrides the function's default threshold of 3).
df_cleaned = remove_outliers(df, 'age', 2)

In [None]:
df_cleaned.shape

In [None]:
# The rows that remove_outliers dropped: every row of df whose index label
# does not appear in df_cleaned.
was_kept = df.index.isin(df_cleaned.index)
removed_rows = df.loc[~was_kept]

In [None]:
removed_rows

In [None]:
# Treating outliers using the IQR (interquartile range)
import pandas as pd  # already imported in the first cell; harmless repeat
# NOTE(review): absolute, machine-specific path — adjust to where the CSV lives.
path = r"C:\Users\Gideon\Desktop\winequality-red.csv"
wine_data = pd.read_csv(path)

# preview the first five rows
wine_data.head()

In [None]:
wine_data.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# checking for outliers
plt.figure(figsize= (18, 20)) # set figure size
for i in range(len(wine_data.columns)):
    plt.subplot(4, 3, i+1) # creates a subplot
    sns.boxplot(x = wine_data.iloc[:, i])
    plt.xlabel(wine_data.columns[i], size=15)

In [None]:
# lower and upper quartile (25th and 75th percentile) of sulphates
for q in (0.25, 0.75):
    print(wine_data['sulphates'].quantile(q))

In [None]:
wine_data['sulphates'].plot(kind='density')

In [None]:
def Outliers(df, col, factor=1.5):
    """Return the index labels of the rows whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and 75th percentiles
    of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    col : label
        Column to test.
    factor : float, default 1.5
        Multiple of the IQR that sets the width of the fences.

    Returns
    -------
    pandas.Index
        Labels of the outlying rows; empty when there are none or when the
        column is not numeric (quantiles are undefined for such columns).
    """
    # Lets callers loop over every column of a mixed-type frame without a
    # TypeError from quantile().
    if not pd.api.types.is_numeric_dtype(df[col]):
        return df.index[:0]

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    return df.index[is_outlier]

In [None]:
wine_data.index

In [None]:
wine_data.shape

In [None]:
# Gather, across all columns, the labels of the rows that hold at least one
# IQR outlier. A set keeps each row label once.
index = set()
for column_name in wine_data.columns:
    index.update(Outliers(wine_data, column_name))
# the figure printed below is the number of distinct flagged rows
print(f'Total number of outliers are {len(index)}')

# dropping all the outliers
wine_data.drop(index, inplace=True, axis=0)
wine_data.shape

In [None]:
wine_data.columns

In [None]:
!pip install scipy

In [None]:
# 3. using the z-score

from scipy import stats
# z-score of each chlorides value: (value - mean) / standard deviation.
# Computed on wine_data as it stands after the IQR-flagged rows were dropped.
z_scores = stats.zscore(wine_data['chlorides'])

In [None]:
print(z_scores)

In [None]:
# NOTE(review): 0.5 is a very tight cut-off (3 is the conventional choice for
# z-scores); the value is left unchanged here.
threshold = 0.5

# Compare the absolute z-score so that unusually LOW values are filtered out as
# well as unusually high ones; `z_scores < threshold` alone kept every value
# below the mean, however extreme.
df_filtered = wine_data['chlorides'][abs(z_scores) < threshold]

In [None]:
df_filtered

In [None]:
# Density plot of every column, three per row. The number of rows is derived
# from the column count so the grid never runs out of slots (a fixed 4x3 grid
# fails as soon as the frame has more than 12 columns).
plots_per_row = 3
n_rows = -(-len(wine_data.columns) // plots_per_row)  # ceiling division
plt.figure(figsize=(18, 20))  # set figure size
for i, column in enumerate(wine_data.columns):
    plt.subplot(n_rows, plots_per_row, i + 1)  # creates a subplot
    sns.kdeplot(x=wine_data.iloc[:, i])
    plt.xlabel(column, size=15)