---
---
The scikit-learn bible:
https://scikit-learn.org/stable/user_guide.html

---
---

In [None]:
~

### Dataframe
- Two-dimensional array-like object
- axis 0 --> rows
- axis 1 --> cols

In [121]:
import pandas as pd
import numpy as np

# Missing data are marked by a sentinel value expressed by the na_values parameter (default: NA, NULL or empty space); they are replaced with NaN values in the loaded DataFrame
df = pd.read_csv("datasets/nba.csv")

### Info

In [122]:
# visualize count, mean, quartiles, max, ...
df.describe()

Unnamed: 0,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,...,AST,STL,BLK,PTS,birth_year,height_cm,weight,weight_kg,draft_round,draft_pick
count,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,...,53949.0,53949.0,53949.0,53949.0,53631.0,53875.0,49385.0,49385.0,10136.0,10136.0
mean,30.313574,752.431404,113.200541,245.094942,28.468535,80.738383,56.297299,76.012716,47.260487,70.10104,...,62.78691,26.715398,10.492057,311.178372,1986.361675,197.445123,210.309527,95.422193,1.38753,14.053177
std,17.849616,534.216679,100.164033,212.155076,30.673395,80.672208,59.24065,76.172698,37.398461,45.62977,...,73.184287,22.077459,18.199867,271.81159,6.637023,8.728587,26.128059,11.851299,0.508224,8.643064
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1961.0,160.0,130.0,59.0,1.0,1.0
25%,17.0,380.9,48.0,109.0,6.0,20.0,20.0,28.0,21.0,37.0,...,20.0,11.0,1.0,134.0,1982.0,191.0,190.0,86.0,1.0,6.0
50%,29.0,663.0,89.0,196.0,20.0,61.0,40.0,56.0,39.0,64.0,...,41.0,21.0,4.0,247.0,1987.0,198.0,209.0,95.0,1.0,13.0
75%,37.0,954.0,145.0,310.0,41.0,117.0,73.0,99.0,63.0,91.0,...,78.0,36.0,12.0,399.0,1991.0,203.0,229.0,104.0,2.0,22.0
max,85.0,3485.0,978.0,2173.0,402.0,1028.0,756.0,972.0,464.0,371.0,...,925.0,225.0,307.0,2832.0,2004.0,229.0,375.0,170.0,7.0,30.0


In [123]:
# visualize column names and types
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53949 entries, 0 to 53948
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   League       53949 non-null  object 
 1   Season       53949 non-null  object 
 2   Stage        53949 non-null  object 
 3   Player       53949 non-null  object 
 4   Team         53938 non-null  object 
 5   GP           53949 non-null  int64  
 6   MIN          53949 non-null  float64
 7   FGM          53949 non-null  int64  
 8   FGA          53949 non-null  int64  
 9   3PM          53949 non-null  int64  
 10  3PA          53949 non-null  int64  
 11  FTM          53949 non-null  int64  
 12  FTA          53949 non-null  int64  
 13  TOV          53949 non-null  int64  
 14  PF           53949 non-null  int64  
 15  ORB          53949 non-null  int64  
 16  DRB          53949 non-null  int64  
 17  REB          53949 non-null  int64  
 18  AST          53949 non-null  int64  
 19  STL 

### Selection

In [124]:
# Column Selection

# --> to SERIES
df.Player  # or df["Player"]

# --> to DATAFRAME
df[["Player", "League"]]

Unnamed: 0,Player,League
0,Shaquille O'Neal,NBA
1,Vince Carter,NBA
2,Karl Malone,NBA
3,Allen Iverson,NBA
4,Gary Payton,NBA
...,...,...
53944,Kyrylo Meshheryakov,Ukrainian-Superleague
53945,Yaroslav Kadygrob,Ukrainian-Superleague
53946,Ernesto Tkachuk,Ukrainian-Superleague
53947,Andrij Shapovalov,Ukrainian-Superleague


In [125]:
# INDEX_BASED - Rows and Columns Selection
""" 
df.iloc[where_i,where_j]
where_i is the row number (: otherwise), where_j is the column number (can be omitted).
"""
df.iloc[:1]  # first row, all columns (same as df.iloc[:1, :])
df.iloc[:, :1]  # all rows, first column
df.iloc[:, -1]  # all rows, last column
df.iloc[:, :-1]  # all rows, all columns except last one
df.iloc[:, [0, 1]]  # all rows, columns 0 and 1
_ = df.iloc[:, 0:3]  # all rows, columns 0,1,2

In [126]:
# LABEL_BASED - Rows and Columns Selection
""" 
df.loc[lab_i,lab_j]
lab_i is the row index (: otherwise), lab_j is the row label (can be omitted)
"""

df.loc[:, :"League"]  # all rows, column "League"
df.loc[[1, 5000], :"League"]  # row 1 and 5000, column "League"

Unnamed: 0,League
1,NBA
5000,Euroleague


In [127]:
# CONDITION BASED - Rows Selection
"""
df.loc[condition array]
"""

df.loc[df.League == "Euroleague"]

_ = df.query(
    'Team in ["LAL", "TOR"]'
)  # all columns, but only rows with value "LAL" or "TOR" in column "Team"
# or df[df["Team"].isin(["LAL", "TOR"])]

---
### Data Cleaning

##### NaN values

In [128]:
# Drop rows containing a missing value
df.dropna()

# Fill in missing data with some value or use an interpolation method
_ = df.ffill()

##### Missing data

In [129]:
df.infer_objects(copy=False)
_ = df.interpolate(
    method="linear"
)  # fill-in missing data with some value (only numerical data)

  _ = df.interpolate(


##### Duplicate values

In [130]:
# Returns a boolean Series indicating whether each row is a duplicate
df.duplicated()

# Returns a DataFrame where the duplicated array is false
_ = df.drop_duplicates()

##### Replacing data

In [131]:
_ = df.replace([1, 2], [10, 20])  # replace 1 with 10, and 2 with 20

---
### Data Aggregation

In [132]:
# Split dataframe into multiple sets by values of single Column, then select a column, then apply the operation to single set
_ = df.groupby("League")["STL"].mean()

---
### Data Concatenation

In [133]:
# Create new columns in a DataFrame
# the index of the new column is not considered: you paste the new column directly in the dataframe as the last column
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["X", "Y", "Z"])
display(df)

column = np.array([11, 22, 33])
df_new = df.assign(TAG=column)

display(df_new)

Unnamed: 0,A,B
X,1,4
Y,2,5
Z,3,6


Unnamed: 0,A,B,TAG
X,1,4,11
Y,2,5,22
Z,3,6,33


---
# Exploring data

In [17]:
import pandas as pd


def print_structured_dataframe(df):
    print(f"# Rows: {df.shape[0]}")
    print(f"# Rows dropna(): {df.dropna().shape[0]}")
    print(f"# Rows duplicated(): {df[df.duplicated()].shape[0]}")
    print(f"# Cols: {df.shape[1]}")

    structured_data = pd.DataFrame(
        columns=["Column", "D-Type", "Sample", "# Distinct values", "# Null values"]
    )

    for i, col in enumerate(df.columns):
        data_type = df[col].dtype
        sample = df[col][0]
        unique_values = df[col].nunique()
        null_values = df[col].isnull().sum()

        structured_data.loc[i] = [col, data_type, sample, unique_values, null_values]

    display(structured_data)


# Example
df = pd.read_csv("datasets/nba.csv")
print_structured_dataframe(df)

# Rows: 53949
# Rows dropna(): 7836
# Rows duplicated(): 0
# Cols: 34


Unnamed: 0,Column,D-Type,Sample,# Distinct values,# Null values
0,League,object,NBA,49,0
1,Season,object,1999 - 2000,21,0
2,Stage,object,Regular_Season,3,0
3,Player,object,Shaquille O'Neal,14582,0
4,Team,object,LAL,793,11
5,GP,int64,79,85,0
6,MIN,float64,3163.0,16044,0
7,FGM,int64,956,755,0
8,FGA,int64,1665,1445,0
9,3PM,int64,0,257,0


In [31]:
import pandas as pd


def print_value_distribution(df, column_name):
    value_counts = df[column_name].value_counts().reset_index()
    value_counts.columns = ["Value", "#"]

    display(value_counts.style.set_caption(f'Distribution of "{column_name}"'))


# Example
df = pd.read_csv("datasets/iris.data.txt")
print_value_distribution(df, "Class")

Unnamed: 0,Value,#
0,Iris-setosa,50
1,Iris-versicolor,50
2,Iris-virginica,50
