# Import software libraries and load the dataset

In [None]:
import sys
import numpy as np
import pandas as pd

# Report the interpreter and library versions in use (one item per line).
print('Libraries used in this project:',
      '- Python {}'.format(sys.version),
      '- NumPy {}'.format(np.__version__),
      '- pandas {}'.format(pd.__version__),
      sep='\n')

# Read the CSV, using its first column as the row index.
stores_df = pd.read_csv(
    '/home/student/DSTIP/pandas/data/stores_data_more.csv',
    index_col=0,
)
print('\nLoaded dataset.')

# Index `DataFrame` columns

In [None]:
# Print the (rows, columns) shape, then preview the first rows.
print('DataFrame shape: {}'.format(stores_df.shape))
stores_df.head()

In [None]:
# Bracket indexing with a single column label selects that one column.
stores_df['UnitPrice'].head()

In [None]:
# Bracket indexing with a list of labels selects several columns at once.
ind_prices = [
    'UnitPrice',
    'Quantity',
    'Tax',
    'TotalPrice',
]
stores_df[ind_prices].head()

# Index `DataFrame` rows and columns

In [None]:
# `.loc` selects by label; a single row label picks the matching row(s).
stores_df.loc['CAR-HML-032']

In [None]:
# A list of row labels selects several rows at once.
stores_df.loc[['CAR-HML-032', 'CAR-HML-033']]

In [None]:
# Row labels come first, column labels second: this narrows both axes.
stores_df.loc[['CAR-HML-032', 'CAR-HML-033'], ['UnitPrice', 'Quantity']]

In [None]:
# One row label plus one column label addresses a single value
# (assuming the row label is unique in the index).
stores_df.loc['CAR-HML-032', 'Revenue']

# Reindex the `DataFrame`

In [None]:
# Desired left-to-right column order. `reindex` keeps only the labels listed
# here; any label not present in the frame would become an all-NaN column.
new_index = [
    'Date', 'City', 'CustomerType', 'Gender', 'ProductLine',
    'UnitPrice', 'Quantity', 'Tax', 'TotalPrice',
    'Revenue', 'COGS',
]
stores_df = stores_df.reindex(columns=new_index)
stores_df.head()

# Summarize statistics about the store data

In [None]:
# Summary statistics from `describe()`, rounded to two decimal places.
stores_df.describe().round(2)

In [None]:
# `Series.mode()` returns a Series because several values can tie for most
# frequent. Take the first entry explicitly: passing the whole Series to
# `int()` is deprecated for one-element Series in recent pandas and raises
# TypeError when there is more than one mode. The result is sorted, so with
# ties this reports the smallest of the tied values.
mode = int(stores_df['Quantity'].mode().iloc[0])
print('Most frequent item quantity: {}.'.format(mode))

In [None]:
# Describe the text-valued columns on their own.
categorical_cols = [
    'City',
    'CustomerType',
    'Gender',
    'ProductLine',
]
stores_df[categorical_cols].describe()

# Retrieve information about the highest revenue purchase

In [None]:
# `idxmax()` gives the index label of the largest 'Revenue' value (reported
# as the invoice in the message below); `.loc` then fetches that whole row.
high_inv = stores_df['Revenue'].idxmax()
high_row = stores_df.loc[high_inv]

print('Invoice {} on {} led to the highest revenue: ${:.2f}.'.format(
    high_inv, high_row['Date'], high_row['Revenue']))
print('{} {} items were purchased.'.format(
    int(high_row['Quantity']), high_row['ProductLine']))

# Identify correlations between the numeric variables

In [None]:
# Correlate the numeric columns only. The frame also holds text columns
# (the ones described as categorical above), and from pandas 2.0 `corr()`
# no longer drops non-numeric columns silently: it raises ValueError.
# Selecting by dtype first works on both older and newer pandas versions.
stores_df.select_dtypes(include='number').corr().round(2)

# Identify missing values

In [None]:
# Boolean mask of missing values (True where a cell is NA); preview the first rows.
stores_df.isna().head()

In [None]:
# Per column: does it contain at least one missing value?
stores_df.isna().any()

In [None]:
# Per column: how many values are missing?
stores_df.isna().sum()