# Descriptive analysis

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets

In [111]:
class DataDescriber:
    """Describe a CSV data set with printed summaries and interactive heatmaps.

    The CSV is loaded with pandas and its first column becomes the index.
    Cross-tabulations count the non-null entries of the ``Record_Number``
    column, so that column must be present for the heatmap methods to work.

    Plotting and widget methods rely on the file-level imports ``sns``
    (seaborn), ``plt`` (matplotlib.pyplot), ``widgets`` and ``interact``
    (ipywidgets).
    """

    # Column whose non-null entries are counted in every cross-tabulation.
    _ID_COLUMN = 'Record_Number'
    # Columns that unique_values() does not report.
    _UNIQUE_VALUES_SKIP = ('Record_Number', 'Region')
    # String used by the defaults and the summary dropdown to mean "nothing selected".
    _UNSET = 'None'
    # Column label that is always placed last on the heatmap's x axis.
    # NOTE(review): presumably a "not applicable" code in the data - confirm.
    _TRAILING_LABEL = 'X'

    def __init__(self, filepath):
        """Read ``filepath`` as CSV and use its first column as the index.

        A failed read is reported on stdout (as before) instead of raising.
        ``self.df`` is then left as an empty DataFrame so that later method
        calls degrade gracefully rather than failing with AttributeError.
        """
        self.df = pd.DataFrame()
        try:
            loaded = pd.read_csv(filepath)
            loaded = loaded.set_index(loaded.columns[0], drop=True)
        except Exception as e:
            print('File could not be read')
            print(f'Error: {e}')
        else:
            self.df = loaded

    def no_records(self):
        """Print the number of rows in the data set."""
        n = len(self.df)
        print(f'The data set has {n} rows')

    def col_types(self):
        """Print, for every column, the Python type of its first value.

        The first value is taken by position, so the index does not need to
        contain the label 0. An empty column reports its pandas dtype.
        """
        for col in self.df.columns:
            column = self.df[col]
            value_type = type(column.iloc[0]) if len(column) else column.dtype
            print(f'Column {col} is of data type {value_type}')

    def unique_values(self):
        """Print the sorted unique values of every column except the skipped ones."""
        for col in self.df.columns:
            if col in self._UNIQUE_VALUES_SKIP:
                continue
            vals = self.df[col].unique()
            try:
                vals.sort()
            except TypeError:
                # Values that cannot be compared with each other (e.g. text
                # mixed with missing values) are reported unsorted.
                pass
            print(f'Column {col} takes values {vals}')

    @classmethod
    def _is_unset(cls, selection):
        """Return True when ``selection`` means "no filter" (``None`` or the string 'None')."""
        return selection is None or (isinstance(selection, str) and selection == cls._UNSET)

    @classmethod
    def _ordered_labels(cls, labels):
        """Return column labels in display order, with the 'X' label last.

        Text labels that all parse as integers are ordered numerically, so
        '10' follows '2'. Anything else falls back to plain sorting.
        """
        labels = list(labels)
        tail = [c for c in labels if isinstance(c, str) and c == cls._TRAILING_LABEL]
        rest = [c for c in labels if not (isinstance(c, str) and c == cls._TRAILING_LABEL)]
        if rest and all(isinstance(c, str) for c in rest):
            try:
                # Sort the original labels by their integer value instead of
                # rebuilding them, so a label such as '01' is not renamed.
                return sorted(rest, key=int) + tail
            except ValueError:
                pass
        return sorted(rest) + tail

    def _sorted_unique(self, col):
        """Return the sorted unique values of column ``col`` as a list."""
        return sorted(self.df[col].unique())

    def _summary_options(self, col1, col2):
        """Return the options for the summary dropdown without duplicates."""
        return list(dict.fromkeys([self._UNSET, col1, col2]))

    def _count_table(self, col1, col2, col1_vals='None', col2_vals='None',
                     summary_stats='None', proportional='Count'):
        """Build the table that ``_grouped_no_records`` draws.

        Returns a ``(table, fmt)`` pair, where ``fmt`` is the annotation
        format for the heatmap: 'd' for counts, '.2f' for proportions.
        ``table`` is empty when the value filters leave no records.
        """
        counts = self.df.groupby([col1, col2])[self._ID_COLUMN].count()
        table = pd.DataFrame(counts)

        # Index levels are addressed by position (0 = col1, 1 = col2), so the
        # logic does not depend on how the levels are named.
        if not self._is_unset(col1_vals):
            table = table[table.index.get_level_values(0).isin(col1_vals)]
        if not self._is_unset(col2_vals):
            table = table[table.index.get_level_values(1).isin(col2_vals)]

        as_proportion = proportional == 'Proportion'
        fmt = '.2f' if as_proportion else 'd'
        if table.empty:
            return table, fmt

        # Summary mode: all selected values of the chosen column are grouped
        # together, leaving one row per value of the other column.
        if summary_stats == col1:
            kept_level = 1
        elif summary_stats == col2:
            kept_level = 0
        else:
            kept_level = None

        if kept_level is not None:
            table = table.groupby(level=kept_level).sum()
            if as_proportion:
                table = table.div(table.sum().sum())
                label = 'Proportion of Records'
            else:
                label = 'Number of Records'
            return table.rename(columns={self._ID_COLUMN: label}), fmt

        # Cross-tab mode: rows are col1 values, columns are col2 values.
        table = table.unstack().fillna(0).astype(int)
        table.columns = table.columns.droplevel(0)
        table = table.reindex(self._ordered_labels(table.columns), axis=1)
        if as_proportion:
            table = table.div(table.sum().sum())
        return table, fmt

    def _grouped_no_records(self, col1, col2, col1_vals = 'None', col2_vals = 'None', summary_stats = 'None', proportional = 'Count'):
        """Draw a heatmap of record counts for ``col1`` against ``col2``.

        Parameters:
            col1, col2: columns to cross-tabulate (heatmap rows / columns).
            col1_vals, col2_vals: values to keep for each column. The string
                'None' (the default) or ``None`` keeps every value.
            summary_stats: 'None' for the full cross-tab. When set to ``col1``
                or ``col2``, all selected values of that column are grouped
                together and the counts are shown per value of the other column.
            proportional: 'Proportion' shows shares of the displayed total.
                Any other value shows raw counts.

        Prints a message instead of plotting when the filters leave no
        records. This happens briefly while a value selector is being
        repopulated after a column change.
        """
        table, fmt = self._count_table(col1, col2, col1_vals, col2_vals, summary_stats, proportional)
        if table.empty:
            print('No records match the current selection')
            return

        plt.figure(figsize=(14, 10))
        sns.heatmap(table, annot=True, cmap="YlGnBu", fmt=fmt, cbar=True, square=True,
                    cbar_kws={'shrink': 0.5}, linecolor='gray', linewidths=0.2)
        plt.show()

    def _refresh_value_selector(self, selector, col):
        """Repopulate a value selector with every value of ``col``, all selected."""
        values = self._sorted_unique(col)
        selector.options = values
        selector.value = values
        selector.description = f'{col}: '

    def _update_col1_values(self, col1):
        """React to a new first column: refresh its value selector and the summary options."""
        self._refresh_value_selector(self.col1_vals, col1)
        self.dropdown4.options = self._summary_options(col1, self.dropdown2.value)

    def _update_col2_values(self, col2):
        """React to a new second column: refresh its value selector and the summary options."""
        self._refresh_value_selector(self.col2_vals, col2)
        # The other entry must come from the first dropdown. Reading
        # dropdown2 here would list the new column twice and lose column 1.
        self.dropdown4.options = self._summary_options(self.dropdown1.value, col2)

    def group_data(self):
        """Display the interactive heatmap explorer.

        Builds two column dropdowns, a count/proportion dropdown, a
        multi-select of values per chosen column and a summary dropdown,
        then wires them to ``_grouped_no_records`` through ``interact``.
        The columns default to 'age' and 'health' when the data has them.
        """
        factors = [c for c in self.df.columns if c != self._ID_COLUMN]
        if not factors:
            print('No columns available to group by')
            return
        first = 'age' if 'age' in factors else factors[0]
        second = 'health' if 'health' in factors else factors[min(1, len(factors) - 1)]

        # Dropdown menu widgets
        self.dropdown1 = widgets.Dropdown(options=factors, value=first, description='Factor: ')
        self.dropdown2 = widgets.Dropdown(options=factors, value=second, description='Factor: ')
        self.dropdown3 = widgets.Dropdown(options=['Count', 'Proportion'], value='Count', description='Factor: ')

        # Multi-select widgets for choosing which values of each column to show
        style = {'description_width': 'initial'}
        col1_values = self._sorted_unique(first)
        self.col1_vals = widgets.SelectMultiple(options=col1_values, value=col1_values,
                                                description=f'{first}:', style=style)
        col2_values = self._sorted_unique(second)
        self.col2_vals = widgets.SelectMultiple(options=col2_values, value=col2_values,
                                                description=f'{second}:', style=style)

        self.dropdown4 = widgets.Dropdown(options=self._summary_options(first, second), value='None',
                                          description='Summary stats: ', style=style)

        # Refresh the dependent widgets whenever a column dropdown changes
        self.dropdown1.observe(lambda change: self._update_col1_values(change.new), names='value')
        self.dropdown2.observe(lambda change: self._update_col2_values(change.new), names='value')

        # Redraw the heatmap whenever any widget value changes
        interact(self._grouped_no_records, col1=self.dropdown1, col2=self.dropdown2,
                 col1_vals=self.col1_vals, col2_vals=self.col2_vals,
                 summary_stats=self.dropdown4, proportional=self.dropdown3)


In [112]:
# Load the CSV and launch the interactive heatmap explorer.
# NOTE(review): the path is absolute and specific to one machine; adjust it before re-running elsewhere.
dd = DataDescriber('D:\\University\\Python for Data Analysis\\Repos\\PFDAAV\\data\\Scotland_teaching_file_1PCT_refined.csv')
# The commented-out calls below are earlier experiments kept for reference. Most are stale:
# DataDescriber defines `_grouped_no_records` (leading underscore), has no `col2_group`
# parameter, and compares `proportional` against the string 'Proportion', not a boolean.
#dd.grouped_no_records('Hours_Worked_Per_Week', 'industry', proportional=True)
# dd.grouped_no_records('Occupation', 'Approximate_Social_Grade')
# dd._grouped_no_records('age', 'Economic_Activity', col2_vals = ['1', '2', '3', '4'], col2_group = True)
# dd.grouped_no_records('health', 'Economic_Activity', col2_vals = ['5', '6', '7', '8', '9'], col2_group = True)
#dd._grouped_no_records('Hours_Worked_Per_Week', 'Economic_Activity', col2_vals = ['4', '6'])
# Build the dropdown / multi-select widgets and display the heatmap.
dd.group_data()

interactive(children=(Dropdown(description='Factor: ', index=4, options=('Region', 'RESIDENCE_TYPE', 'Family_C…

<Figure size 1400x1000 with 0 Axes>

In [55]:
# Reload the CSV and print the text summaries of the data set.
# NOTE(review): the path is absolute and specific to one machine; adjust it before re-running elsewhere.
dd = DataDescriber('D:\\University\\Python for Data Analysis\\Repos\\PFDAAV\\data\\Scotland_teaching_file_1PCT_refined.csv')
dd.no_records()  # number of rows
dd.col_types()  # Python type of the first value in each column
dd.unique_values()  # sorted unique values per column (Record_Number and Region are skipped)

The data set has 63388 rows
Column Record_Number is of data type <class 'numpy.int64'>
Column Region is of data type <class 'str'>
Column RESIDENCE_TYPE is of data type <class 'str'>
Column Family_Composition is of data type <class 'str'>
Column sex is of data type <class 'numpy.int64'>
Column age is of data type <class 'numpy.int64'>
Column Marital_Status is of data type <class 'numpy.int64'>
Column student is of data type <class 'numpy.int64'>
Column Country_Of_Birth is of data type <class 'numpy.int64'>
Column health is of data type <class 'numpy.int64'>
Column Ethnic_Group is of data type <class 'numpy.int64'>
Column religion is of data type <class 'numpy.int64'>
Column Economic_Activity is of data type <class 'str'>
Column Occupation is of data type <class 'str'>
Column industry is of data type <class 'str'>
Column Hours_Worked_Per_Week is of data type <class 'str'>
Column Approximate_Social_Grade is of data type <class 'str'>
Column RESIDENCE_TYPE takes values ['C' 'P']
Column Fa