In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Superstore analysis

In [3]:
# Load the three CSV datasets used in this notebook: superstore, credit risk
# and mushrooms. Each path is read with pandas' default CSV settings.
superstore, credit_risk, mushrooms = map(
    pd.read_csv,
    (
        "./datasets/SampleSuperstore.csv",
        "./datasets/credit_risk_dataset.csv",
        "./datasets/mushrooms.csv",
    ),
)

In [12]:
# Show the (rows, columns) shape of each dataset, one per line.
print(*(frame.shape for frame in (superstore, credit_risk, mushrooms)), sep="\n")

(9994, 13)
(32581, 12)
(8124, 23)


In [14]:
# Relative frequency of each loan_status value (normalize=True yields proportions).
credit_risk["loan_status"].value_counts(normalize=True)

loan_status
0    0.781836
1    0.218164
Name: proportion, dtype: float64

In [16]:
# Relative frequency of each value in the "class" column.
mushrooms["class"].value_counts(normalize=True)

class
e    0.517971
p    0.482029
Name: proportion, dtype: float64

In [13]:
# List the column labels of the credit-risk frame.
credit_risk.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [8]:
def table_metadata_to_latex(frame, caption):
    """Print a LaTeX table describing the columns of ``frame``.

    The table has one row per column of ``frame`` and three columns:
    the column's dtype, its number of distinct values and its number of
    missing values. Underscores in the row labels are escaped with a
    backslash before the table is rendered.

    Parameters
    ----------
    frame : pandas.DataFrame
        The data set to summarise.
    caption : str
        Caption forwarded to ``DataFrame.to_latex``.

    Returns
    -------
    None
        The LaTeX source is written to stdout with ``print``.
    """
    # one row per column of ``frame``, starting with its dtype
    metadata = pd.DataFrame(frame.dtypes, columns=["Data Type"])

    # number of distinct values per column (``nunique`` ignores NaN by default)
    metadata["Unique Values"] = frame.nunique()

    # number of missing values per column
    metadata["Missing Values"] = frame.isnull().sum()

    # Escape underscores in the row labels.
    # - "\\_" spells the backslash explicitly; the previous "\_" was an invalid
    #   escape sequence that newer Python versions warn about.
    # - regex=False makes the replacement literal on every pandas version.
    # - astype(str) keeps the .str accessor usable for non-string column labels.
    metadata.index = metadata.index.astype(str).str.replace("_", "\\_", regex=False)

    print(metadata.to_latex(caption=caption))

In [9]:
# Print one LaTeX metadata table (dtype, unique values, missing values) per dataset.
table_metadata_to_latex(superstore, caption="Superstore Metadata")
table_metadata_to_latex(credit_risk, caption="Credit Risk Metadata")
table_metadata_to_latex(mushrooms, caption="Mushrooms Metadata")

\begin{table}
\caption{Superstore Metadata}
\begin{tabular}{llrr}
\toprule
 & Data Type & Unique Values & Missing Values \\
\midrule
Ship Mode & object & 4 & 0 \\
Segment & object & 3 & 0 \\
Country & object & 1 & 0 \\
City & object & 531 & 0 \\
State & object & 49 & 0 \\
Postal Code & int64 & 631 & 0 \\
Region & object & 4 & 0 \\
Category & object & 3 & 0 \\
Sub-Category & object & 17 & 0 \\
Sales & float64 & 5825 & 0 \\
Quantity & int64 & 14 & 0 \\
Discount & float64 & 12 & 0 \\
Profit & float64 & 7287 & 0 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}
\caption{Credit Risk Metadata}
\begin{tabular}{llrr}
\toprule
 & Data Type & Unique Values & Missing Values \\
\midrule
person\_age & int64 & 58 & 0 \\
person\_income & int64 & 4295 & 0 \\
person\_home\_ownership & object & 4 & 0 \\
person\_emp\_length & float64 & 36 & 895 \\
loan\_intent & object & 6 & 0 \\
loan\_grade & object & 7 & 0 \\
loan\_amnt & int64 & 753 & 0 \\
loan\_int\_rate & float64 & 348 & 3116 \\
loan\_status &

In [14]:
# count the unique values in each column of a frame
def get_unique_features(frame):
    """Return ``[column, unique_count]`` pairs for every column of ``frame``.

    The count is the length of ``Series.unique()``, so a missing value, if
    present, is counted as one of the unique values. Pairs are returned in
    column order as two-element lists.
    """
    return [[name, len(frame[name].unique())] for name in frame.columns]

In [12]:
# Number of missing values in each superstore column.
superstore.isna().sum()

Ship Mode       0
Segment         0
Country         0
City            0
State           0
Postal Code     0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64

In [15]:
# Unique-value count for every superstore column, as [column, count] pairs.
get_unique_features(superstore)

[['Ship Mode', 4],
 ['Segment', 3],
 ['Country', 1],
 ['City', 531],
 ['State', 49],
 ['Postal Code', 631],
 ['Region', 4],
 ['Category', 3],
 ['Sub-Category', 17],
 ['Sales', 5825],
 ['Quantity', 14],
 ['Discount', 12],
 ['Profit', 7287]]

In [34]:
# NOTE(review): credit_risk is already read from this same path in the loading
# cell near the top of the notebook; this second read looks redundant unless a
# fresh, unmodified copy is needed here — confirm.
credit_risk = pd.read_csv("./datasets/credit_risk_dataset.csv")

In [36]:
# Dimensions of the credit-risk frame as (rows, columns).
credit_risk.shape

(32581, 12)

In [35]:
# Number of missing values in each credit-risk column.
credit_risk.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64