In [None]:

import re
from datetime import date, datetime, time, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# import pandas_profiling

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity='all'

# Render inline figures as SVG and set the figure resolution.
%config InlineBackend.figure_format='svg'
plt.rcParams['figure.dpi']=120

# Show floats with thousands separators and two decimals; do not truncate wide cell text.
pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)


In [None]:
# Load the house-price dataset with default options and list its column names.
df=pd.read_csv('./pandasData/house_price1.csv')
df.columns

<h3 style='color:blue'>1. Filter columns</h3>

- Only need a couple of columns from the dataset? Use `usecols`

In [None]:
# usecols: only the listed columns are parsed and kept.
pd.read_csv("./pandasData/TESLA.csv", usecols=["Date", "Open", "High", "Low"])

<h3 style='color:blue'>2. Parse dates on read</h3>

- No need to do `pd.to_datetime` anymore, parse it on read!

In [None]:
# parse_dates: convert the "Date" column to datetimes while reading.
ts=pd.read_csv("./pandasData/TESLA.csv", parse_dates=["Date"])
# info() lists the dtype of each column, so the parsed "Date" dtype can be checked here.
ts.info()

<h3 style='color:blue'>3. Specify Data Types</h3>

- **Setting category data types at read can save a ton of memory for data frames!**

In [None]:
# dtype: read "HouseStyle" directly as a categorical column.
hp=pd.read_csv("./pandasData/house_price1.csv", dtype={"HouseStyle": "category"})


# Relative frequency of each house style (fractions rather than raw counts).
hp.HouseStyle.value_counts(normalize=True)

<h3 style='color:blue'>4. Set index</h3>

- Setting an index at read time is especially useful for time series data.

In [None]:
# index_col makes "Date" the row index; parse_dates=True parses that index into
# datetimes, so the result has a DatetimeIndex (needed for date slicing and
# resampling) instead of an index of plain strings.
pd.read_csv("./pandasData/TESLA.csv", index_col="Date", parse_dates=True)

<h3 style='color:blue'>5. No. of rows to read</h3>

- Don’t want to read in a dataset with millions of rows before having a peek at it? Use `nrows`!

In [None]:
# nrows: stop after the first 100 data rows.
pd.read_csv("./pandasData/TESLA.csv", nrows=100)

<h3 style='color:blue'>6. Skip rows</h3>

- Does your data set have rows with faulty data? Skip them!

In [None]:
# skiprows uses 0-indexed file line numbers; line 0 is the header row.
pd.read_csv("./pandasData/house_price1.csv", skiprows=[1, 5])  # skips file lines 1 and 5 (the 1st and 5th data rows)
pd.read_csv("./pandasData/house_price1.csv", skiprows=100)  # skips the first 100 lines, header included -- the next line is then read as the header
pd.read_csv("./pandasData/house_price1.csv", skiprows=lambda x: x > 0 and np.random.rand() > 0.1) # keeps the header (x == 0) and skips ~90% of the other lines at random; unseeded, so the sample differs on every run

<h3 style='color:blue'>7. Specify NA values</h3>

- If your data uses placeholder values that are supposed to be NA (for example `?`), declare them at read time so you won’t have to convert them later.

In [None]:
# na_values: treat "?" as missing (NaN), in addition to pandas' default NA markers.
pd.read_csv("./pandasData/house_price1.csv", na_values=["?"])

<h3 style='color:blue'>8. Setting boolean values</h3>

- Have a boolean column that’s in the form of Yes and No? Tell pandas about it!

In [None]:
# true_values / false_values: strings to interpret as True / False while parsing.
# NOTE(review): matching is presumably case-sensitive -- confirm the file really uses lowercase "yes"/"no".
pd.read_csv("./pandasData/house_price1.csv", true_values=["yes"], false_values=["no"])

<h3 style='color:blue'>10. Copy and Paste into Data Frames</h3>

- Looking at some data on Excel but don’t want to download it? Copy it! Pandas can read from your clipboard.

In [None]:
# Builds a frame from whatever tabular text is currently on the system clipboard,
# so the result depends on what was copied last and is not reproducible on re-run.
df = pd.read_clipboard() 

<h3 style='color:blue'>11. Read tables from PDF files</h3>

In [None]:
# %pip install tabula-py

from tabula import read_pdf
# Read every page of the PDF into a list of DataFrames.
# Note that `df` is therefore a list here, not a single frame.
df = read_pdf('test.pdf', pages='all')

<h2 style='color:blue'>Exploratory Data Analysis (EDA)</h2>

<h3 style='color:blue'>12. EDA cheat</h3>

- Want to visualize your dataset but don’t want to write code for plots? With pandas-profiling, you can do it with just one line of code.


In [None]:
# %pip install pandas-profiling

# NOTE(review): the import is otherwise unused; presumably it is what makes
# DataFrame.profile_report available -- confirm against the installed version.
import pandas_profiling

df = pd.read_csv("./pandasData/house_price1.csv")
# Build the report object, then write it out as a standalone HTML file.
profile = df.profile_report(title="Pandas Profiling Report")
profile.to_file(output_file="output.html")

<h3 style='color:blue'>Data Types (dtypes)</h3>

In [None]:
# selecting: keep only the columns whose dtype matches
df.select_dtypes(include="number")
df.select_dtypes(include=["category", "datetime"])

# excluding: keep every column except those with the given dtype
df.select_dtypes(exclude="object")



<h3 style='color:blue'>15. Downcastings</h3>

- Pandas’ `to_numeric` has a nifty feature to downcast the type, allowing you to reduce the data frame’s size.

In [None]:
# `numeric_col` is presumably a placeholder -- substitute a real numeric column name.
# Both calls return a new Series; nothing is assigned back to `df` here.
pd.to_numeric(df.numeric_col, downcast="integer") # smallest signed int dtype
pd.to_numeric(df.numeric_col, downcast="float")  # smallest float dtype

<h3 style='color:blue'>16. Manual conversions</h3>

- If a column contains values that cannot be parsed as numbers, `errors="coerce"` turns them into NaN instead of raising an error. You can then fill those NaN values with a reasonable default using `.fillna`.

In [None]:
# apply to whole data frame
df = df.apply(pd.to_numeric, errors="coerce")

# apply to specific columns
pd.to_numeric(df.numeric_column, errors="coerce")

# filling NA values with zero
pd.to_numeric(df.numeric_column, errors="coerce").fillna(0)

<h3 style='color:blue'>17. Convert all at once</h3>

In [None]:
# Cast several columns in one call: keys are column names, values are target dtypes.
# The column names are presumably placeholders -- adapt them to the frame at hand.
df = df.astype(
    {
        "date": "datetime64[ns]",
        "price": "int",
        "is_weekend": "bool",
        "status": "category",
    }
)

<h2 style='color:blue'>Column operations</h2>

<h3 style='color:blue'>18. Renaming columns</h3>

In [None]:
# Map old column labels to new ones; labels not listed are left unchanged.
df = df.rename(columns={"PRICE": "price", "Date (mm/dd/yyyy)": "date"})

<h3 style='color:blue'>19. Add suffix and prefix</h3>

In [None]:
# Both calls return a new frame with relabelled columns; `df` itself is unchanged.
df.add_prefix("pre_")
df.add_suffix("_suf")

<h3 style='color:blue'>21. Insert columns at specific positions
</h3>

In [None]:
# One random integer in [0, 10) per row; unseeded, so the values differ on every run.
random_col = np.random.randint(10, size=len(df))
df.insert(3, 'random_col', random_col) # position is 0-based, so this becomes the fourth column (modifies df in place)

<h3 style='color:blue'>22. if-then-else</h3>

In [None]:
# Vectorised if/else: "high" where price > 5, otherwise "low".
df["logic"] = np.where(df["price"] > 5, "high", "low")

<h3 style='color:blue'>23. Dropping columns</h3>

In [None]:
# Alternative ways to remove columns. They are shown side by side as a reference:
# run only the one you need, because a column that is already gone cannot be
# dropped a second time (KeyError). Column names are placeholders.
df.drop('col1', axis=1, inplace=True)   # by label, in place
df = df.drop(['col1','col2'], axis=1)   # several labels, returns a new frame
s = df.pop('col')                       # removes the column and returns it as a Series
del df['col']                           # plain Python deletion, in place
# By position: the target must be declared as a column. Without `columns=`
# (or axis=1) pandas searches the ROW index for that label and raises KeyError.
df.drop(columns=df.columns[0], inplace=True)

<h2 style='color:blue'>String operations</h2>

<h3 style='color:blue'>24. Column names</h3>

In [None]:
# on column names
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')

<h3 style='color:blue'>24. Contains</h3>

In [None]:
# Each call returns a boolean Series (NaN where the value is missing).
df['name'].str.contains("John")

# Phone numbers shaped like 123-456-7890. Digits are matched explicitly with \d;
# a bare "." would match ANY character, so "abc-def-ghij" would also pass.
df['phone_num'].str.contains(r'\d{3}-\d{3}-\d{4}', regex=True)  # regex

df['email'].str.contains('gmail')

<h3 style='color:blue'>25. findall</h3>

In [None]:
# Three capture groups: local part, domain, top-level domain.
# Raw string so the backslash in "\." reaches the regex engine unchanged.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

# Needs `import re` (in the notebook's import cell) for the IGNORECASE flag.
# Returns, per row, a list of (local, domain, tld) tuples for every match found.
df['email'].str.findall(pattern, flags=re.IGNORECASE)

<h2 style='color:blue'>Missing values</h2>

<h3 style='color:blue'>26. Checking</h3>

In [None]:
def missing_vals(df):
    """Report the percentage of missing values per column.

    Prints a header followed by one line per affected column, ordered from the
    highest percentage to the lowest.

    Returns:
        The string "no missing values" when no column has any NA;
        otherwise None (the report is printed, not returned).
    """
    pct_missing = df.isna().mean() * 100

    affected = []
    for position, pct in enumerate(pct_missing):
        if pct > 0:
            affected.append((df.columns[position], pct))

    if not affected:
        return "no missing values"

    # largest share of missing values first
    affected = sorted(affected, key=lambda pair: pair[1], reverse=True)

    print(f"There are a total of {len(affected)} variables with missing values\n")

    for col, pct in affected:
        print(f"{col:<20} => {round(pct, 3)}%")


# Report missing values for the frame currently bound to `df`.
missing_vals(df)

<h3 style='color:blue'>27. Dealing with missing values</h3>

In [None]:
# Every call below returns a new object; `df` / `ts` themselves are not modified.

# drop
df.dropna(axis=0)  # drop rows that contain any NA
df.dropna(axis=1)  # drop columns that contain any NA

# impute
df.fillna(0)
# Use the dedicated methods: fillna(method="ffill"/"bfill") is deprecated in
# recent pandas releases.
df.ffill()  # carry the last valid value forward
df.bfill()  # carry the next valid value backward

# replace
df.replace( -999, np.nan)
df.replace("?", np.nan)

# interpolate
ts.interpolate() # time series
df.interpolate() # fill all consecutive values forward
df.interpolate(limit=1) # fill one consecutive value forward
df.interpolate(limit=1, limit_direction="backward")
df.interpolate(limit_direction="both")

<h3 style='color:blue'>Date operations</h3>

In [None]:
# Data source: https://www.kaggle.com/datasets/rpaguirre/tesla-stock-price
# Same location as every other TESLA.csv read in this notebook, so Run All needs
# the file in one place only. Only four columns are kept and "Date" is parsed
# to datetimes.
df = pd.read_csv(
    "./pandasData/TESLA.csv",
    usecols=["Date", "Open", "Close", "Volume"],
    parse_dates=['Date'],
)

df.head()

In [None]:
# `datetime` here is the class (from datetime import datetime), which has no
# `timedelta` attribute -- use the `timedelta` name imported from the module.

# from now / today
# A `date` has no time-of-day, so adding hours to it silently drops the sub-day
# part; use a full datetime when the offset is expressed in hours.
datetime.now() + timedelta(hours=30)
date.today() + timedelta(days=30)
date.today() + timedelta(weeks=30)

# ago
date.today() - timedelta(days=365)

<h3 style='color:blue'>29. Filter between two dates</h3>

In [None]:
# Rows strictly after the start date and strictly before the end date.
after_start = df["Date"] > "2015-01-01"
before_end = df["Date"] < "2017-01-01"
df[after_start & before_end]

<h3 style='color:blue'>30. Filter by day/month/year</h3>

In [None]:
# Use the datetime accessors instead of formatting every timestamp to a string:
# same rows selected, without building a throw-away string column.

# filter by single day (normalize() resets the time-of-day to midnight, so any
# timestamp on that calendar date matches)
df[df["Date"].dt.normalize() == "2017-03-01"]

# filter by single month (December of any year)
df[df["Date"].dt.month == 12]

# filter by single year
df[df["Date"].dt.year == 2017]

<h2 style='color:blue'>Styling data frames</h2>

<a href="https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html"> Source</a>

<h3 style='color:blue'>31. Number format</h3>

In [None]:
# Per-column display formats (format-spec strings); they change only how the
# values are rendered, not the underlying data.
format_dict = {
    "Date": "{:%d/%m/%y}",  # day/month/two-digit year
    "Open": "${:.2f}",      # dollar sign, two decimals
    "Close": "${:.2f}",
    "Volume": "{:,}",       # thousands separators
}

df.style.format(format_dict)

<h3 style='color:blue'>32. Let there be colors</h3>

In [None]:
# Relies on `format_dict` from the previous cell.
(
    df.style.format(format_dict)
    # do not render the row index
    .hide(axis='index')
    # lowest "Open" in red, highest in green
    .highlight_min(["Open"], color="red")
    .highlight_max(["Open"], color="green")
    # shade "Close" by value using the Blues colormap
    .background_gradient(subset="Close", cmap="Blues")
    # in-cell bar chart for "Volume"
    .bar('Volume', color='lightblue', align='zero')
    .set_caption('Tesla Stock Prices in 2017')
)

<h2 style='color:blue'>Misc</h2>

<h3 style='color:blue'>33. Get the id of max and min in a column</h3>

In [None]:
# idxmin / idxmax return the index LABEL of the extreme value, not the value itself.
df['Open'].idxmin()
df['Close'].idxmax()

<h3 style='color:blue'>34. Apply function to data frames</h3>

In [None]:
# Natural log of every numeric column, computed in one vectorised call.
# Non-numeric columns (e.g. the parsed "Date" column) are left out: np.log is
# undefined for timestamps and the element-wise version raised TypeError on them.
# `applymap` is also deprecated in recent pandas releases.
np.log(df.select_dtypes(include="number"))

<h3 style='color:blue'>35. Randomly shuffle data</h3>

In [None]:
# frac=1 samples every row, i.e. a full shuffle; random_state makes the order
# repeatable; reset_index(drop=True) discards the old row labels.
df.sample(frac=1, random_state=7).reset_index(drop=True)

<h3 style='color:blue'>36. Percent change</h3>

- Useful for time series data

ex: price of BTC over 3 days [30000, 33000, 31000] -> [NaN, 0.1, -0.06]



In [None]:
# Fractional change from the previous row; the first row has no predecessor, so it is NaN.
# 'col_name' is presumably a placeholder -- substitute a real numeric column.
df['col_name'].pct_change()

<h3 style='color:blue'>37. Assign rank</h3>

In [None]:
# Store each row's rank within 'column_to_rank' in a new 'rank' column.
# 'column_to_rank' is presumably a placeholder name.
df['rank'] = df['column_to_rank'].rank()

<h3 style='color:blue'>38. Check memory usage of data frame</h3>

In [None]:
# deep=True also measures the Python objects held by object-dtype columns
# (e.g. strings); without it only the pointer array is counted and the total
# is an underestimate.
df.memory_usage(deep=True).sum() / (1024**2) #converting to MB

<h3 style='color:blue'>39. Explode list values to multiple rows</h3>

In [None]:
# One output row per list element in "col_name"; ignore_index=True relabels the
# rows 0..n-1 instead of repeating the original labels.
df.explode("col_name", ignore_index=True)

<h3 style='color:blue'>40. Convert smaller categories to “Others”</h3>

In [None]:
# First 100 rows only.
# NOTE(review): other cells read "./pandasData/house_price1.csv" -- confirm this
# file name and location are intended.
df = pd.read_csv("house_price.csv", nrows=100)

df.columns

In [None]:
# Work on the MSSubClass column and count how often each value occurs.
subclass = df.MSSubClass
subclass.value_counts()

In [None]:
# The five most frequent classes, by count.
top_five = subclass.value_counts().nlargest(5).index
top_five
# Everything outside the top five is relabelled "Other".
is_rare = ~subclass.isin(top_five)
mssubclass_new = subclass.mask(is_rare, other="Other")
mssubclass_new.value_counts()
